Index: head/contrib/unbound/services/authzone.c =================================================================== --- head/contrib/unbound/services/authzone.c (revision 368750) +++ head/contrib/unbound/services/authzone.c (revision 368751) @@ -1,6969 +1,6969 @@ /* * services/authzone.c - authoritative zone that is locally hosted. * * Copyright (c) 2017, NLnet Labs. All rights reserved. * * This software is open source. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of the NLNET LABS nor the names of its contributors may * be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /** * \file * * This file contains the functions for an authority zone. 
This zone * is queried by the iterator, just like a stub or forward zone, but then * the data is locally held. */ #include "config.h" #include "services/authzone.h" #include "util/data/dname.h" #include "util/data/msgparse.h" #include "util/data/msgreply.h" #include "util/data/msgencode.h" #include "util/data/packed_rrset.h" #include "util/regional.h" #include "util/net_help.h" #include "util/netevent.h" #include "util/config_file.h" #include "util/log.h" #include "util/module.h" #include "util/random.h" #include "services/cache/dns.h" #include "services/outside_network.h" #include "services/listen_dnsport.h" #include "services/mesh.h" #include "sldns/rrdef.h" #include "sldns/pkthdr.h" #include "sldns/sbuffer.h" #include "sldns/str2wire.h" #include "sldns/wire2str.h" #include "sldns/parseutil.h" #include "sldns/keyraw.h" #include "validator/val_nsec3.h" #include "validator/val_secalgo.h" #include /** bytes to use for NSEC3 hash buffer. 20 for sha1 */ #define N3HASHBUFLEN 32 /** max number of CNAMEs we are willing to follow (in one answer) */ #define MAX_CNAME_CHAIN 8 /** timeout for probe packets for SOA */ #define AUTH_PROBE_TIMEOUT 100 /* msec */ /** when to stop with SOA probes (when exponential timeouts exceed this) */ #define AUTH_PROBE_TIMEOUT_STOP 1000 /* msec */ /* auth transfer timeout for TCP connections, in msec */ #define AUTH_TRANSFER_TIMEOUT 10000 /* msec */ /* auth transfer max backoff for failed tranfers and probes */ #define AUTH_TRANSFER_MAX_BACKOFF 86400 /* sec */ /* auth http port number */ #define AUTH_HTTP_PORT 80 /* auth https port number */ #define AUTH_HTTPS_PORT 443 /* max depth for nested $INCLUDEs */ #define MAX_INCLUDE_DEPTH 10 /** number of timeouts before we fallback from IXFR to AXFR, * because some versions of servers (eg. dnsmasq) drop IXFR packets. 
*/ #define NUM_TIMEOUTS_FALLBACK_IXFR 3 /** pick up nextprobe task to start waiting to perform transfer actions */ static void xfr_set_timeout(struct auth_xfer* xfr, struct module_env* env, int failure, int lookup_only); /** move to sending the probe packets, next if fails. task_probe */ static void xfr_probe_send_or_end(struct auth_xfer* xfr, struct module_env* env); /** pick up probe task with specified(or NULL) destination first, * or transfer task if nothing to probe, or false if already in progress */ static int xfr_start_probe(struct auth_xfer* xfr, struct module_env* env, struct auth_master* spec); /** delete xfer structure (not its tree entry) */ static void auth_xfer_delete(struct auth_xfer* xfr); /** create new dns_msg */ static struct dns_msg* msg_create(struct regional* region, struct query_info* qinfo) { struct dns_msg* msg = (struct dns_msg*)regional_alloc(region, sizeof(struct dns_msg)); if(!msg) return NULL; msg->qinfo.qname = regional_alloc_init(region, qinfo->qname, qinfo->qname_len); if(!msg->qinfo.qname) return NULL; msg->qinfo.qname_len = qinfo->qname_len; msg->qinfo.qtype = qinfo->qtype; msg->qinfo.qclass = qinfo->qclass; msg->qinfo.local_alias = NULL; /* non-packed reply_info, because it needs to grow the array */ msg->rep = (struct reply_info*)regional_alloc_zero(region, sizeof(struct reply_info)-sizeof(struct rrset_ref)); if(!msg->rep) return NULL; msg->rep->flags = (uint16_t)(BIT_QR | BIT_AA); msg->rep->authoritative = 1; msg->rep->qdcount = 1; /* rrsets is NULL, no rrsets yet */ return msg; } /** grow rrset array by one in msg */ static int msg_grow_array(struct regional* region, struct dns_msg* msg) { if(msg->rep->rrsets == NULL) { msg->rep->rrsets = regional_alloc_zero(region, sizeof(struct ub_packed_rrset_key*)*(msg->rep->rrset_count+1)); if(!msg->rep->rrsets) return 0; } else { struct ub_packed_rrset_key** rrsets_old = msg->rep->rrsets; msg->rep->rrsets = regional_alloc_zero(region, sizeof(struct 
ub_packed_rrset_key*)*(msg->rep->rrset_count+1)); if(!msg->rep->rrsets) return 0; memmove(msg->rep->rrsets, rrsets_old, sizeof(struct ub_packed_rrset_key*)*msg->rep->rrset_count); } return 1; } /** get ttl of rrset */ static time_t get_rrset_ttl(struct ub_packed_rrset_key* k) { struct packed_rrset_data* d = (struct packed_rrset_data*) k->entry.data; return d->ttl; } /** Copy rrset into region from domain-datanode and packet rrset */ static struct ub_packed_rrset_key* auth_packed_rrset_copy_region(struct auth_zone* z, struct auth_data* node, struct auth_rrset* rrset, struct regional* region, time_t adjust) { struct ub_packed_rrset_key key; memset(&key, 0, sizeof(key)); key.entry.key = &key; key.entry.data = rrset->data; key.rk.dname = node->name; key.rk.dname_len = node->namelen; key.rk.type = htons(rrset->type); key.rk.rrset_class = htons(z->dclass); key.entry.hash = rrset_key_hash(&key.rk); return packed_rrset_copy_region(&key, region, adjust); } /** fix up msg->rep TTL and prefetch ttl */ static void msg_ttl(struct dns_msg* msg) { if(msg->rep->rrset_count == 0) return; if(msg->rep->rrset_count == 1) { msg->rep->ttl = get_rrset_ttl(msg->rep->rrsets[0]); msg->rep->prefetch_ttl = PREFETCH_TTL_CALC(msg->rep->ttl); msg->rep->serve_expired_ttl = msg->rep->ttl + SERVE_EXPIRED_TTL; } else if(get_rrset_ttl(msg->rep->rrsets[msg->rep->rrset_count-1]) < msg->rep->ttl) { msg->rep->ttl = get_rrset_ttl(msg->rep->rrsets[ msg->rep->rrset_count-1]); msg->rep->prefetch_ttl = PREFETCH_TTL_CALC(msg->rep->ttl); msg->rep->serve_expired_ttl = msg->rep->ttl + SERVE_EXPIRED_TTL; } } /** see if rrset is a duplicate in the answer message */ static int msg_rrset_duplicate(struct dns_msg* msg, uint8_t* nm, size_t nmlen, uint16_t type, uint16_t dclass) { size_t i; for(i=0; irep->rrset_count; i++) { struct ub_packed_rrset_key* k = msg->rep->rrsets[i]; if(ntohs(k->rk.type) == type && k->rk.dname_len == nmlen && ntohs(k->rk.rrset_class) == dclass && query_dname_compare(k->rk.dname, nm) == 0) 
return 1; } return 0; } /** add rrset to answer section (no auth, add rrsets yet) */ static int msg_add_rrset_an(struct auth_zone* z, struct regional* region, struct dns_msg* msg, struct auth_data* node, struct auth_rrset* rrset) { log_assert(msg->rep->ns_numrrsets == 0); log_assert(msg->rep->ar_numrrsets == 0); if(!rrset || !node) return 1; if(msg_rrset_duplicate(msg, node->name, node->namelen, rrset->type, z->dclass)) return 1; /* grow array */ if(!msg_grow_array(region, msg)) return 0; /* copy it */ if(!(msg->rep->rrsets[msg->rep->rrset_count] = auth_packed_rrset_copy_region(z, node, rrset, region, 0))) return 0; msg->rep->rrset_count++; msg->rep->an_numrrsets++; msg_ttl(msg); return 1; } /** add rrset to authority section (no additonal section rrsets yet) */ static int msg_add_rrset_ns(struct auth_zone* z, struct regional* region, struct dns_msg* msg, struct auth_data* node, struct auth_rrset* rrset) { log_assert(msg->rep->ar_numrrsets == 0); if(!rrset || !node) return 1; if(msg_rrset_duplicate(msg, node->name, node->namelen, rrset->type, z->dclass)) return 1; /* grow array */ if(!msg_grow_array(region, msg)) return 0; /* copy it */ if(!(msg->rep->rrsets[msg->rep->rrset_count] = auth_packed_rrset_copy_region(z, node, rrset, region, 0))) return 0; msg->rep->rrset_count++; msg->rep->ns_numrrsets++; msg_ttl(msg); return 1; } /** add rrset to additional section */ static int msg_add_rrset_ar(struct auth_zone* z, struct regional* region, struct dns_msg* msg, struct auth_data* node, struct auth_rrset* rrset) { if(!rrset || !node) return 1; if(msg_rrset_duplicate(msg, node->name, node->namelen, rrset->type, z->dclass)) return 1; /* grow array */ if(!msg_grow_array(region, msg)) return 0; /* copy it */ if(!(msg->rep->rrsets[msg->rep->rrset_count] = auth_packed_rrset_copy_region(z, node, rrset, region, 0))) return 0; msg->rep->rrset_count++; msg->rep->ar_numrrsets++; msg_ttl(msg); return 1; } struct auth_zones* auth_zones_create(void) { struct auth_zones* az = (struct 
auth_zones*)calloc(1, sizeof(*az)); if(!az) { log_err("out of memory"); return NULL; } rbtree_init(&az->ztree, &auth_zone_cmp); rbtree_init(&az->xtree, &auth_xfer_cmp); lock_rw_init(&az->lock); lock_protect(&az->lock, &az->ztree, sizeof(az->ztree)); lock_protect(&az->lock, &az->xtree, sizeof(az->xtree)); /* also lock protects the rbnode's in struct auth_zone, auth_xfer */ lock_rw_init(&az->rpz_lock); lock_protect(&az->rpz_lock, &az->rpz_first, sizeof(az->rpz_first)); return az; } int auth_zone_cmp(const void* z1, const void* z2) { /* first sort on class, so that hierarchy can be maintained within * a class */ struct auth_zone* a = (struct auth_zone*)z1; struct auth_zone* b = (struct auth_zone*)z2; int m; if(a->dclass != b->dclass) { if(a->dclass < b->dclass) return -1; return 1; } /* sorted such that higher zones sort before lower zones (their * contents) */ return dname_lab_cmp(a->name, a->namelabs, b->name, b->namelabs, &m); } int auth_data_cmp(const void* z1, const void* z2) { struct auth_data* a = (struct auth_data*)z1; struct auth_data* b = (struct auth_data*)z2; int m; /* canonical sort, because DNSSEC needs that */ return dname_canon_lab_cmp(a->name, a->namelabs, b->name, b->namelabs, &m); } int auth_xfer_cmp(const void* z1, const void* z2) { /* first sort on class, so that hierarchy can be maintained within * a class */ struct auth_xfer* a = (struct auth_xfer*)z1; struct auth_xfer* b = (struct auth_xfer*)z2; int m; if(a->dclass != b->dclass) { if(a->dclass < b->dclass) return -1; return 1; } /* sorted such that higher zones sort before lower zones (their * contents) */ return dname_lab_cmp(a->name, a->namelabs, b->name, b->namelabs, &m); } /** delete auth rrset node */ static void auth_rrset_delete(struct auth_rrset* rrset) { if(!rrset) return; free(rrset->data); free(rrset); } /** delete auth data domain node */ static void auth_data_delete(struct auth_data* n) { struct auth_rrset* p, *np; if(!n) return; p = n->rrsets; while(p) { np = p->next; 
auth_rrset_delete(p); p = np; } free(n->name); free(n); } /** helper traverse to delete zones */ static void auth_data_del(rbnode_type* n, void* ATTR_UNUSED(arg)) { struct auth_data* z = (struct auth_data*)n->key; auth_data_delete(z); } /** delete an auth zone structure (tree remove must be done elsewhere) */ static void auth_zone_delete(struct auth_zone* z, struct auth_zones* az) { if(!z) return; lock_rw_destroy(&z->lock); traverse_postorder(&z->data, auth_data_del, NULL); if(az && z->rpz) { /* keep RPZ linked list intact */ lock_rw_wrlock(&az->rpz_lock); if(z->rpz_az_prev) z->rpz_az_prev->rpz_az_next = z->rpz_az_next; else az->rpz_first = z->rpz_az_next; if(z->rpz_az_next) z->rpz_az_next->rpz_az_prev = z->rpz_az_prev; lock_rw_unlock(&az->rpz_lock); } if(z->rpz) rpz_delete(z->rpz); free(z->name); free(z->zonefile); free(z); } struct auth_zone* auth_zone_create(struct auth_zones* az, uint8_t* nm, size_t nmlen, uint16_t dclass) { struct auth_zone* z = (struct auth_zone*)calloc(1, sizeof(*z)); if(!z) { return NULL; } z->node.key = z; z->dclass = dclass; z->namelen = nmlen; z->namelabs = dname_count_labels(nm); z->name = memdup(nm, nmlen); if(!z->name) { free(z); return NULL; } rbtree_init(&z->data, &auth_data_cmp); lock_rw_init(&z->lock); lock_protect(&z->lock, &z->name, sizeof(*z)-sizeof(rbnode_type)- sizeof(&z->rpz_az_next)-sizeof(&z->rpz_az_prev)); lock_rw_wrlock(&z->lock); /* z lock protects all, except rbtree itself and the rpz linked list * pointers, which are protected using az->lock */ if(!rbtree_insert(&az->ztree, &z->node)) { lock_rw_unlock(&z->lock); auth_zone_delete(z, NULL); log_warn("duplicate auth zone"); return NULL; } return z; } struct auth_zone* auth_zone_find(struct auth_zones* az, uint8_t* nm, size_t nmlen, uint16_t dclass) { struct auth_zone key; key.node.key = &key; key.dclass = dclass; key.name = nm; key.namelen = nmlen; key.namelabs = dname_count_labels(nm); return (struct auth_zone*)rbtree_search(&az->ztree, &key); } struct auth_xfer* 
auth_xfer_find(struct auth_zones* az, uint8_t* nm, size_t nmlen, uint16_t dclass) { struct auth_xfer key; key.node.key = &key; key.dclass = dclass; key.name = nm; key.namelen = nmlen; key.namelabs = dname_count_labels(nm); return (struct auth_xfer*)rbtree_search(&az->xtree, &key); } /** find an auth zone or sorted less-or-equal, return true if exact */ static int auth_zone_find_less_equal(struct auth_zones* az, uint8_t* nm, size_t nmlen, uint16_t dclass, struct auth_zone** z) { struct auth_zone key; key.node.key = &key; key.dclass = dclass; key.name = nm; key.namelen = nmlen; key.namelabs = dname_count_labels(nm); return rbtree_find_less_equal(&az->ztree, &key, (rbnode_type**)z); } /** find the auth zone that is above the given name */ struct auth_zone* auth_zones_find_zone(struct auth_zones* az, uint8_t* name, size_t name_len, uint16_t dclass) { uint8_t* nm = name; size_t nmlen = name_len; struct auth_zone* z; if(auth_zone_find_less_equal(az, nm, nmlen, dclass, &z)) { /* exact match */ return z; } else { /* less-or-nothing */ if(!z) return NULL; /* nothing smaller, nothing above it */ /* we found smaller name; smaller may be above the name, * but not below it. */ nm = dname_get_shared_topdomain(z->name, name); dname_count_size_labels(nm, &nmlen); z = NULL; } /* search up */ while(!z) { z = auth_zone_find(az, nm, nmlen, dclass); if(z) return z; if(dname_is_root(nm)) break; dname_remove_label(&nm, &nmlen); } return NULL; } /** find or create zone with name str. caller must have lock on az. 
* returns a wrlocked zone */ static struct auth_zone* auth_zones_find_or_add_zone(struct auth_zones* az, char* name) { uint8_t nm[LDNS_MAX_DOMAINLEN+1]; size_t nmlen = sizeof(nm); struct auth_zone* z; if(sldns_str2wire_dname_buf(name, nm, &nmlen) != 0) { log_err("cannot parse auth zone name: %s", name); return 0; } z = auth_zone_find(az, nm, nmlen, LDNS_RR_CLASS_IN); if(!z) { /* not found, create the zone */ z = auth_zone_create(az, nm, nmlen, LDNS_RR_CLASS_IN); } else { lock_rw_wrlock(&z->lock); } return z; } /** find or create xfer zone with name str. caller must have lock on az. * returns a locked xfer */ static struct auth_xfer* auth_zones_find_or_add_xfer(struct auth_zones* az, struct auth_zone* z) { struct auth_xfer* x; x = auth_xfer_find(az, z->name, z->namelen, z->dclass); if(!x) { /* not found, create the zone */ x = auth_xfer_create(az, z); } else { lock_basic_lock(&x->lock); } return x; } int auth_zone_set_zonefile(struct auth_zone* z, char* zonefile) { if(z->zonefile) free(z->zonefile); if(zonefile == NULL) { z->zonefile = NULL; } else { z->zonefile = strdup(zonefile); if(!z->zonefile) { log_err("malloc failure"); return 0; } } return 1; } /** set auth zone fallback. 
caller must have lock on zone */ int auth_zone_set_fallback(struct auth_zone* z, char* fallbackstr) { if(strcmp(fallbackstr, "yes") != 0 && strcmp(fallbackstr, "no") != 0){ log_err("auth zone fallback, expected yes or no, got %s", fallbackstr); return 0; } z->fallback_enabled = (strcmp(fallbackstr, "yes")==0); return 1; } /** create domain with the given name */ static struct auth_data* az_domain_create(struct auth_zone* z, uint8_t* nm, size_t nmlen) { struct auth_data* n = (struct auth_data*)malloc(sizeof(*n)); if(!n) return NULL; memset(n, 0, sizeof(*n)); n->node.key = n; n->name = memdup(nm, nmlen); if(!n->name) { free(n); return NULL; } n->namelen = nmlen; n->namelabs = dname_count_labels(nm); if(!rbtree_insert(&z->data, &n->node)) { log_warn("duplicate auth domain name"); free(n->name); free(n); return NULL; } return n; } /** find domain with exactly the given name */ static struct auth_data* az_find_name(struct auth_zone* z, uint8_t* nm, size_t nmlen) { struct auth_zone key; key.node.key = &key; key.name = nm; key.namelen = nmlen; key.namelabs = dname_count_labels(nm); return (struct auth_data*)rbtree_search(&z->data, &key); } /** Find domain name (or closest match) */ static void az_find_domain(struct auth_zone* z, struct query_info* qinfo, int* node_exact, struct auth_data** node) { struct auth_zone key; key.node.key = &key; key.name = qinfo->qname; key.namelen = qinfo->qname_len; key.namelabs = dname_count_labels(key.name); *node_exact = rbtree_find_less_equal(&z->data, &key, (rbnode_type**)node); } /** find or create domain with name in zone */ static struct auth_data* az_domain_find_or_create(struct auth_zone* z, uint8_t* dname, size_t dname_len) { struct auth_data* n = az_find_name(z, dname, dname_len); if(!n) { n = az_domain_create(z, dname, dname_len); } return n; } /** find rrset of given type in the domain */ static struct auth_rrset* az_domain_rrset(struct auth_data* n, uint16_t t) { struct auth_rrset* rrset; if(!n) return NULL; rrset = n->rrsets; 
while(rrset) { if(rrset->type == t) return rrset; rrset = rrset->next; } return NULL; } /** remove rrset of this type from domain */ static void domain_remove_rrset(struct auth_data* node, uint16_t rr_type) { struct auth_rrset* rrset, *prev; if(!node) return; prev = NULL; rrset = node->rrsets; while(rrset) { if(rrset->type == rr_type) { /* found it, now delete it */ if(prev) prev->next = rrset->next; else node->rrsets = rrset->next; auth_rrset_delete(rrset); return; } prev = rrset; rrset = rrset->next; } } /** find an rrsig index in the rrset. returns true if found */ static int az_rrset_find_rrsig(struct packed_rrset_data* d, uint8_t* rdata, size_t len, size_t* index) { size_t i; for(i=d->count; icount + d->rrsig_count; i++) { if(d->rr_len[i] != len) continue; if(memcmp(d->rr_data[i], rdata, len) == 0) { *index = i; return 1; } } return 0; } /** see if rdata is duplicate */ static int rdata_duplicate(struct packed_rrset_data* d, uint8_t* rdata, size_t len) { size_t i; for(i=0; icount + d->rrsig_count; i++) { if(d->rr_len[i] != len) continue; if(memcmp(d->rr_data[i], rdata, len) == 0) return 1; } return 0; } /** get rrsig type covered from rdata. * @param rdata: rdata in wireformat, starting with 16bit rdlength. * @param rdatalen: length of rdata buffer. * @return type covered (or 0). */ static uint16_t rrsig_rdata_get_type_covered(uint8_t* rdata, size_t rdatalen) { if(rdatalen < 4) return 0; return sldns_read_uint16(rdata+2); } /** remove RR from existing RRset. Also sig, if it is a signature. 
* reallocates the packed rrset for a new one, false on alloc failure */
static int
rrset_remove_rr(struct auth_rrset* rrset, size_t index)
{
	struct packed_rrset_data* d, *old = rrset->data;
	size_t i;
	if(index >= old->count + old->rrsig_count)
		return 0; /* index out of bounds */
	/* the new block is smaller by one length, one pointer, one ttl and
	 * the rdata of the removed element */
	d = (struct packed_rrset_data*)calloc(1, packed_rrset_sizeof(old) - (
		sizeof(size_t) + sizeof(uint8_t*) + sizeof(time_t) +
		old->rr_len[index]));
	if(!d) {
		log_err("malloc failure");
		return 0;
	}
	d->ttl = old->ttl;
	d->count = old->count;
	d->rrsig_count = old->rrsig_count;
	/* an index below count is a data RR, otherwise it is a signature */
	if(index < d->count) d->count--;
	else d->rrsig_count--;
	d->trust = old->trust;
	d->security = old->security;

	/* set rr_len, needed for ptr_fixup */
	d->rr_len = (size_t*)((uint8_t*)d +
		sizeof(struct packed_rrset_data));
	if(index > 0)
		memmove(d->rr_len, old->rr_len, (index)*sizeof(size_t));
	if(index+1 < old->count+old->rrsig_count)
		memmove(&d->rr_len[index], &old->rr_len[index+1],
		(old->count+old->rrsig_count - (index+1))*sizeof(size_t));
	packed_rrset_ptr_fixup(d);

	/* move over ttls */
	if(index > 0)
		memmove(d->rr_ttl, old->rr_ttl, (index)*sizeof(time_t));
	if(index+1 < old->count+old->rrsig_count)
		memmove(&d->rr_ttl[index], &old->rr_ttl[index+1],
		(old->count+old->rrsig_count - (index+1))*sizeof(time_t));

	/* move over rr_data */
	for(i=0; i<d->count+d->rrsig_count; i++) {
		size_t oldi;
		/* elements at and after index come from one position up */
		if(i < index) oldi = i;
		else oldi = i+1;
		memmove(d->rr_data[i], old->rr_data[oldi], d->rr_len[i]);
	}

	/* recalc ttl (lowest of remaining RR ttls) */
	if(d->count + d->rrsig_count > 0)
		d->ttl = d->rr_ttl[0];
	for(i=0; i<d->count+d->rrsig_count; i++) {
		if(d->rr_ttl[i] < d->ttl)
			d->ttl = d->rr_ttl[i];
	}

	free(rrset->data);
	rrset->data = d;
	return 1;
}

/** add RR to existing RRset. If insert_sig is true, add to rrsigs.
* This reallocates the packed rrset for a new one */
static int
rrset_add_rr(struct auth_rrset* rrset, uint32_t rr_ttl, uint8_t* rdata,
	size_t rdatalen, int insert_sig)
{
	struct packed_rrset_data* prev = rrset->data;
	struct packed_rrset_data* d;
	size_t prev_total, new_total, slot;

	/* room for the old set plus one length, one pointer, one ttl and
	 * the rdata of the added element */
	d = (struct packed_rrset_data*)calloc(1, packed_rrset_sizeof(prev)
		+ sizeof(size_t) + sizeof(uint8_t*) + sizeof(time_t)
		+ rdatalen);
	if(!d) {
		log_err("out of memory");
		return 0;
	}
	/* start from the old base values, then bump the right counter */
	memcpy(d, prev, sizeof(struct packed_rrset_data));
	if(insert_sig)
		d->rrsig_count++;
	else
		d->count++;
	prev_total = prev->count + prev->rrsig_count;
	new_total = d->count + d->rrsig_count;
	/* position of the added element: the last data RR, or the last
	 * signature when inserting a signature */
	slot = insert_sig ? new_total-1 : d->count-1;

	/* the rr_len array has to be in place for the ptr_fixup */
	d->rr_len = (size_t*)((uint8_t*)d +
		sizeof(struct packed_rrset_data));
	if(prev->count != 0)
		memmove(d->rr_len, prev->rr_len,
			prev->count*sizeof(size_t));
	if(prev->rrsig_count != 0)
		memmove(d->rr_len+d->count, prev->rr_len+prev->count,
			prev->rrsig_count*sizeof(size_t));
	d->rr_len[slot] = rdatalen;
	packed_rrset_ptr_fixup(d);
	if((time_t)rr_ttl < d->ttl)
		d->ttl = rr_ttl;

	/* carry over the existing data RRs */
	if(prev->count != 0) {
		memmove(d->rr_ttl, prev->rr_ttl,
			prev->count*sizeof(time_t));
		/* the old rr pieces were allocated sequentially, so one
		 * copy from the first to the end of the last suffices */
		memmove(d->rr_data[0], prev->rr_data[0],
			(prev->rr_data[prev->count-1] - prev->rr_data[0]) +
			prev->rr_len[prev->count-1]);
	}
	/* carry over the existing signatures */
	if(prev->rrsig_count != 0) {
		memmove(d->rr_ttl+d->count, prev->rr_ttl+prev->count,
			prev->rrsig_count*sizeof(time_t));
		memmove(d->rr_data[d->count], prev->rr_data[prev->count],
			(prev->rr_data[prev_total-1] -
			prev->rr_data[prev->count]) +
			prev->rr_len[prev_total-1]);
	}

	/* store the added element */
	d->rr_ttl[slot] = rr_ttl;
	memmove(d->rr_data[slot], rdata, rdatalen);

	rrset->data = d;
	free(prev);
	return 1;
}

/** Create new rrset for node with packed rrset with one RR element */
static struct auth_rrset* rrset_create(struct auth_data* node, uint16_t rr_type, uint32_t rr_ttl, uint8_t* rdata, size_t rdatalen) { struct auth_rrset* rrset = (struct auth_rrset*)calloc(1, sizeof(*rrset)); struct auth_rrset* p, *prev; struct packed_rrset_data* d; if(!rrset) { log_err("out of memory"); return NULL; } rrset->type = rr_type; /* the rrset data structure, with one RR */ d = (struct packed_rrset_data*)calloc(1, sizeof(struct packed_rrset_data) + sizeof(size_t) + sizeof(uint8_t*) + sizeof(time_t) + rdatalen); if(!d) { free(rrset); log_err("out of memory"); return NULL; } rrset->data = d; d->ttl = rr_ttl; d->trust = rrset_trust_prim_noglue; d->rr_len = (size_t*)((uint8_t*)d + sizeof(struct packed_rrset_data)); d->rr_data = (uint8_t**)&(d->rr_len[1]); d->rr_ttl = (time_t*)&(d->rr_data[1]); d->rr_data[0] = (uint8_t*)&(d->rr_ttl[1]); /* insert the RR */ d->rr_len[0] = rdatalen; d->rr_ttl[0] = rr_ttl; memmove(d->rr_data[0], rdata, rdatalen); d->count++; /* insert rrset into linked list for domain */ /* find sorted place to link the rrset into the list */ prev = NULL; p = node->rrsets; while(p && p->type<=rr_type) { prev = p; p = p->next; } /* so, prev is smaller, and p is larger than rr_type */ rrset->next = p; if(prev) prev->next = rrset; else node->rrsets = rrset; return rrset; } /** count number (and size) of rrsigs that cover a type */ static size_t rrsig_num_that_cover(struct auth_rrset* rrsig, uint16_t rr_type, size_t* sigsz) { struct packed_rrset_data* d = rrsig->data; size_t i, num = 0; *sigsz = 0; log_assert(d && rrsig->type == LDNS_RR_TYPE_RRSIG); for(i=0; icount+d->rrsig_count; i++) { if(rrsig_rdata_get_type_covered(d->rr_data[i], d->rr_len[i]) == rr_type) { num++; (*sigsz) += d->rr_len[i]; } } return num; } /** See if rrsig set has covered sigs for rrset and move them over */ static int rrset_moveover_rrsigs(struct auth_data* node, uint16_t rr_type, struct auth_rrset* rrset, struct auth_rrset* rrsig) { size_t sigs, sigsz, i, j, total; struct 
packed_rrset_data* sigold = rrsig->data; struct packed_rrset_data* old = rrset->data; struct packed_rrset_data* d, *sigd; log_assert(rrset->type == rr_type); log_assert(rrsig->type == LDNS_RR_TYPE_RRSIG); sigs = rrsig_num_that_cover(rrsig, rr_type, &sigsz); if(sigs == 0) { /* 0 rrsigs to move over, done */ return 1; } /* allocate rrset sigsz larger for extra sigs elements, and * allocate rrsig sigsz smaller for less sigs elements. */ d = (struct packed_rrset_data*)calloc(1, packed_rrset_sizeof(old) + sigs*(sizeof(size_t) + sizeof(uint8_t*) + sizeof(time_t)) + sigsz); if(!d) { log_err("out of memory"); return 0; } /* copy base values */ total = old->count + old->rrsig_count; memcpy(d, old, sizeof(struct packed_rrset_data)); d->rrsig_count += sigs; /* setup rr_len */ d->rr_len = (size_t*)((uint8_t*)d + sizeof(struct packed_rrset_data)); if(total != 0) memmove(d->rr_len, old->rr_len, total*sizeof(size_t)); j = d->count+d->rrsig_count-sigs; for(i=0; icount+sigold->rrsig_count; i++) { if(rrsig_rdata_get_type_covered(sigold->rr_data[i], sigold->rr_len[i]) == rr_type) { d->rr_len[j] = sigold->rr_len[i]; j++; } } packed_rrset_ptr_fixup(d); /* copy old values into new array */ if(total != 0) { memmove(d->rr_ttl, old->rr_ttl, total*sizeof(time_t)); /* all the old rr pieces are allocated sequential, so we * can copy them in one go */ memmove(d->rr_data[0], old->rr_data[0], (old->rr_data[total-1] - old->rr_data[0]) + old->rr_len[total-1]); } /* move over the rrsigs to the larger rrset*/ j = d->count+d->rrsig_count-sigs; for(i=0; icount+sigold->rrsig_count; i++) { if(rrsig_rdata_get_type_covered(sigold->rr_data[i], sigold->rr_len[i]) == rr_type) { /* move this one over to location j */ d->rr_ttl[j] = sigold->rr_ttl[i]; memmove(d->rr_data[j], sigold->rr_data[i], sigold->rr_len[i]); if(d->rr_ttl[j] < d->ttl) d->ttl = d->rr_ttl[j]; j++; } } /* put it in and deallocate the old rrset */ rrset->data = d; free(old); /* now make rrsig set smaller */ if(sigold->count+sigold->rrsig_count 
== sigs) { /* remove all sigs from rrsig, remove it entirely */ domain_remove_rrset(node, LDNS_RR_TYPE_RRSIG); return 1; } log_assert(packed_rrset_sizeof(sigold) > sigs*(sizeof(size_t) + sizeof(uint8_t*) + sizeof(time_t)) + sigsz); sigd = (struct packed_rrset_data*)calloc(1, packed_rrset_sizeof(sigold) - sigs*(sizeof(size_t) + sizeof(uint8_t*) + sizeof(time_t)) - sigsz); if(!sigd) { /* no need to free up d, it has already been placed in the * node->rrset structure */ log_err("out of memory"); return 0; } /* copy base values */ memcpy(sigd, sigold, sizeof(struct packed_rrset_data)); /* in sigd the RRSIGs are stored in the base of the RR, in count */ sigd->count -= sigs; /* setup rr_len */ sigd->rr_len = (size_t*)((uint8_t*)sigd + sizeof(struct packed_rrset_data)); j = 0; for(i=0; icount+sigold->rrsig_count; i++) { if(rrsig_rdata_get_type_covered(sigold->rr_data[i], sigold->rr_len[i]) != rr_type) { sigd->rr_len[j] = sigold->rr_len[i]; j++; } } packed_rrset_ptr_fixup(sigd); /* copy old values into new rrsig array */ j = 0; for(i=0; icount+sigold->rrsig_count; i++) { if(rrsig_rdata_get_type_covered(sigold->rr_data[i], sigold->rr_len[i]) != rr_type) { /* move this one over to location j */ sigd->rr_ttl[j] = sigold->rr_ttl[i]; memmove(sigd->rr_data[j], sigold->rr_data[i], sigold->rr_len[i]); if(j==0) sigd->ttl = sigd->rr_ttl[j]; else { if(sigd->rr_ttl[j] < sigd->ttl) sigd->ttl = sigd->rr_ttl[j]; } j++; } } /* put it in and deallocate the old rrset */ rrsig->data = sigd; free(sigold); return 1; } /** copy the rrsigs from the rrset to the rrsig rrset, because the rrset * is going to be deleted. reallocates the RRSIG rrset data. 
*/ static int rrsigs_copy_from_rrset_to_rrsigset(struct auth_rrset* rrset, struct auth_rrset* rrsigset) { size_t i; if(rrset->data->rrsig_count == 0) return 1; /* move them over one by one, because there might be duplicates, * duplicates are ignored */ for(i=rrset->data->count; idata->count+rrset->data->rrsig_count; i++) { uint8_t* rdata = rrset->data->rr_data[i]; size_t rdatalen = rrset->data->rr_len[i]; time_t rr_ttl = rrset->data->rr_ttl[i]; if(rdata_duplicate(rrsigset->data, rdata, rdatalen)) { continue; } if(!rrset_add_rr(rrsigset, rr_ttl, rdata, rdatalen, 0)) return 0; } return 1; } /** Add rr to node, ignores duplicate RRs, * rdata points to buffer with rdatalen octets, starts with 2bytelength. */ static int az_domain_add_rr(struct auth_data* node, uint16_t rr_type, uint32_t rr_ttl, uint8_t* rdata, size_t rdatalen, int* duplicate) { struct auth_rrset* rrset; /* packed rrsets have their rrsigs along with them, sort them out */ if(rr_type == LDNS_RR_TYPE_RRSIG) { uint16_t ctype = rrsig_rdata_get_type_covered(rdata, rdatalen); if((rrset=az_domain_rrset(node, ctype))!= NULL) { /* a node of the correct type exists, add the RRSIG * to the rrset of the covered data type */ if(rdata_duplicate(rrset->data, rdata, rdatalen)) { if(duplicate) *duplicate = 1; return 1; } if(!rrset_add_rr(rrset, rr_ttl, rdata, rdatalen, 1)) return 0; } else if((rrset=az_domain_rrset(node, rr_type))!= NULL) { /* add RRSIG to rrset of type RRSIG */ if(rdata_duplicate(rrset->data, rdata, rdatalen)) { if(duplicate) *duplicate = 1; return 1; } if(!rrset_add_rr(rrset, rr_ttl, rdata, rdatalen, 0)) return 0; } else { /* create rrset of type RRSIG */ if(!rrset_create(node, rr_type, rr_ttl, rdata, rdatalen)) return 0; } } else { /* normal RR type */ if((rrset=az_domain_rrset(node, rr_type))!= NULL) { /* add data to existing node with data type */ if(rdata_duplicate(rrset->data, rdata, rdatalen)) { if(duplicate) *duplicate = 1; return 1; } if(!rrset_add_rr(rrset, rr_ttl, rdata, rdatalen, 0)) return 
0; } else { struct auth_rrset* rrsig; /* create new node with data type */ if(!(rrset=rrset_create(node, rr_type, rr_ttl, rdata, rdatalen))) return 0; /* see if node of type RRSIG has signatures that * cover the data type, and move them over */ /* and then make the RRSIG type smaller */ if((rrsig=az_domain_rrset(node, LDNS_RR_TYPE_RRSIG)) != NULL) { if(!rrset_moveover_rrsigs(node, rr_type, rrset, rrsig)) return 0; } } } return 1; } /** insert RR into zone, ignore duplicates */ static int az_insert_rr(struct auth_zone* z, uint8_t* rr, size_t rr_len, size_t dname_len, int* duplicate) { struct auth_data* node; uint8_t* dname = rr; uint16_t rr_type = sldns_wirerr_get_type(rr, rr_len, dname_len); uint16_t rr_class = sldns_wirerr_get_class(rr, rr_len, dname_len); uint32_t rr_ttl = sldns_wirerr_get_ttl(rr, rr_len, dname_len); size_t rdatalen = ((size_t)sldns_wirerr_get_rdatalen(rr, rr_len, dname_len))+2; /* rdata points to rdata prefixed with uint16 rdatalength */ uint8_t* rdata = sldns_wirerr_get_rdatawl(rr, rr_len, dname_len); if(rr_class != z->dclass) { log_err("wrong class for RR"); return 0; } if(!(node=az_domain_find_or_create(z, dname, dname_len))) { log_err("cannot create domain"); return 0; } if(!az_domain_add_rr(node, rr_type, rr_ttl, rdata, rdatalen, duplicate)) { log_err("cannot add RR to domain"); return 0; } if(z->rpz) { if(!(rpz_insert_rr(z->rpz, z->name, z->namelen, dname, dname_len, rr_type, rr_class, rr_ttl, rdata, rdatalen, rr, rr_len))) return 0; } return 1; } /** Remove rr from node, ignores nonexisting RRs, * rdata points to buffer with rdatalen octets, starts with 2bytelength. 
*/ static int az_domain_remove_rr(struct auth_data* node, uint16_t rr_type, uint8_t* rdata, size_t rdatalen, int* nonexist) { struct auth_rrset* rrset; size_t index = 0; /* find the plain RR of the given type */ if((rrset=az_domain_rrset(node, rr_type))!= NULL) { if(packed_rrset_find_rr(rrset->data, rdata, rdatalen, &index)) { if(rrset->data->count == 1 && rrset->data->rrsig_count == 0) { /* last RR, delete the rrset */ domain_remove_rrset(node, rr_type); } else if(rrset->data->count == 1 && rrset->data->rrsig_count != 0) { /* move RRSIGs to the RRSIG rrset, or * this one becomes that RRset */ struct auth_rrset* rrsigset = az_domain_rrset( node, LDNS_RR_TYPE_RRSIG); if(rrsigset) { /* move left over rrsigs to the * existing rrset of type RRSIG */ rrsigs_copy_from_rrset_to_rrsigset( rrset, rrsigset); /* and then delete the rrset */ domain_remove_rrset(node, rr_type); } else { /* no rrset of type RRSIG, this * set is now of that type, * just remove the rr */ if(!rrset_remove_rr(rrset, index)) return 0; rrset->type = LDNS_RR_TYPE_RRSIG; rrset->data->count = rrset->data->rrsig_count; rrset->data->rrsig_count = 0; } } else { /* remove the RR from the rrset */ if(!rrset_remove_rr(rrset, index)) return 0; } return 1; } /* rr not found in rrset */ } /* is it a type RRSIG, look under the covered type */ if(rr_type == LDNS_RR_TYPE_RRSIG) { uint16_t ctype = rrsig_rdata_get_type_covered(rdata, rdatalen); if((rrset=az_domain_rrset(node, ctype))!= NULL) { if(az_rrset_find_rrsig(rrset->data, rdata, rdatalen, &index)) { /* rrsig should have d->count > 0, be * over some rr of that type */ /* remove the rrsig from the rrsigs list of the * rrset */ if(!rrset_remove_rr(rrset, index)) return 0; return 1; } } /* also RRSIG not found */ } /* nothing found to delete */ if(nonexist) *nonexist = 1; return 1; } /** remove RR from zone, ignore if it does not exist, false on alloc failure*/ static int az_remove_rr(struct auth_zone* z, uint8_t* rr, size_t rr_len, size_t dname_len, int* nonexist) 
{ struct auth_data* node; uint8_t* dname = rr; uint16_t rr_type = sldns_wirerr_get_type(rr, rr_len, dname_len); uint16_t rr_class = sldns_wirerr_get_class(rr, rr_len, dname_len); size_t rdatalen = ((size_t)sldns_wirerr_get_rdatalen(rr, rr_len, dname_len))+2; /* rdata points to rdata prefixed with uint16 rdatalength */ uint8_t* rdata = sldns_wirerr_get_rdatawl(rr, rr_len, dname_len); if(rr_class != z->dclass) { log_err("wrong class for RR"); /* really also a nonexisting entry, because no records * of that class in the zone, but return an error because * getting records of the wrong class is a failure of the * zone transfer */ return 0; } node = az_find_name(z, dname, dname_len); if(!node) { /* node with that name does not exist */ /* nonexisting entry, because no such name */ *nonexist = 1; return 1; } if(!az_domain_remove_rr(node, rr_type, rdata, rdatalen, nonexist)) { /* alloc failure or so */ return 0; } /* remove the node, if necessary */ /* an rrsets==NULL entry is not kept around for empty nonterminals, * and also parent nodes are not kept around, so we just delete it */ if(node->rrsets == NULL) { (void)rbtree_delete(&z->data, node); auth_data_delete(node); } if(z->rpz) { rpz_remove_rr(z->rpz, z->namelen, dname, dname_len, rr_type, rr_class, rdata, rdatalen); } return 1; } /** decompress an RR into the buffer where it'll be an uncompressed RR * with uncompressed dname and uncompressed rdata (dnames) */ static int decompress_rr_into_buffer(struct sldns_buffer* buf, uint8_t* pkt, size_t pktlen, uint8_t* dname, uint16_t rr_type, uint16_t rr_class, uint32_t rr_ttl, uint8_t* rr_data, uint16_t rr_rdlen) { sldns_buffer pktbuf; size_t dname_len = 0; size_t rdlenpos; size_t rdlen; uint8_t* rd; const sldns_rr_descriptor* desc; sldns_buffer_init_frm_data(&pktbuf, pkt, pktlen); sldns_buffer_clear(buf); /* decompress dname */ sldns_buffer_set_position(&pktbuf, (size_t)(dname - sldns_buffer_current(&pktbuf))); dname_len = pkt_dname_len(&pktbuf); if(dname_len == 0) return 0; 
/* parse fail on dname */ if(!sldns_buffer_available(buf, dname_len)) return 0; dname_pkt_copy(&pktbuf, sldns_buffer_current(buf), dname); sldns_buffer_skip(buf, (ssize_t)dname_len); /* type, class, ttl and rdatalength fields */ if(!sldns_buffer_available(buf, 10)) return 0; sldns_buffer_write_u16(buf, rr_type); sldns_buffer_write_u16(buf, rr_class); sldns_buffer_write_u32(buf, rr_ttl); rdlenpos = sldns_buffer_position(buf); sldns_buffer_write_u16(buf, 0); /* rd length position */ /* decompress rdata */ desc = sldns_rr_descript(rr_type); rd = rr_data; rdlen = rr_rdlen; if(rdlen > 0 && desc && desc->_dname_count > 0) { int count = (int)desc->_dname_count; int rdf = 0; size_t len; /* how much rdata to plain copy */ size_t uncompressed_len, compressed_len; size_t oldpos; /* decompress dnames. */ while(rdlen > 0 && count) { switch(desc->_wireformat[rdf]) { case LDNS_RDF_TYPE_DNAME: sldns_buffer_set_position(&pktbuf, (size_t)(rd - sldns_buffer_begin(&pktbuf))); oldpos = sldns_buffer_position(&pktbuf); /* moves pktbuf to right after the * compressed dname, and returns uncompressed * dname length */ uncompressed_len = pkt_dname_len(&pktbuf); if(!uncompressed_len) return 0; /* parse error in dname */ if(!sldns_buffer_available(buf, uncompressed_len)) /* dname too long for buffer */ return 0; dname_pkt_copy(&pktbuf, sldns_buffer_current(buf), rd); sldns_buffer_skip(buf, (ssize_t)uncompressed_len); compressed_len = sldns_buffer_position( &pktbuf) - oldpos; rd += compressed_len; rdlen -= compressed_len; count--; len = 0; break; case LDNS_RDF_TYPE_STR: len = rd[0] + 1; break; default: len = get_rdf_size(desc->_wireformat[rdf]); break; } if(len) { if(!sldns_buffer_available(buf, len)) return 0; /* too long for buffer */ sldns_buffer_write(buf, rd, len); rd += len; rdlen -= len; } rdf++; } } /* copy remaining data */ if(rdlen > 0) { if(!sldns_buffer_available(buf, rdlen)) return 0; sldns_buffer_write(buf, rd, rdlen); } /* fixup rdlength */ sldns_buffer_write_u16_at(buf, 
rdlenpos, sldns_buffer_position(buf)-rdlenpos-2); sldns_buffer_flip(buf); return 1; } /** insert RR into zone, from packet, decompress RR, * if duplicate is nonNULL set the flag but otherwise ignore duplicates */ static int az_insert_rr_decompress(struct auth_zone* z, uint8_t* pkt, size_t pktlen, struct sldns_buffer* scratch_buffer, uint8_t* dname, uint16_t rr_type, uint16_t rr_class, uint32_t rr_ttl, uint8_t* rr_data, uint16_t rr_rdlen, int* duplicate) { uint8_t* rr; size_t rr_len; size_t dname_len; if(!decompress_rr_into_buffer(scratch_buffer, pkt, pktlen, dname, rr_type, rr_class, rr_ttl, rr_data, rr_rdlen)) { log_err("could not decompress RR"); return 0; } rr = sldns_buffer_begin(scratch_buffer); rr_len = sldns_buffer_limit(scratch_buffer); dname_len = dname_valid(rr, rr_len); return az_insert_rr(z, rr, rr_len, dname_len, duplicate); } /** remove RR from zone, from packet, decompress RR, * if nonexist is nonNULL set the flag but otherwise ignore nonexisting entries*/ static int az_remove_rr_decompress(struct auth_zone* z, uint8_t* pkt, size_t pktlen, struct sldns_buffer* scratch_buffer, uint8_t* dname, uint16_t rr_type, uint16_t rr_class, uint32_t rr_ttl, uint8_t* rr_data, uint16_t rr_rdlen, int* nonexist) { uint8_t* rr; size_t rr_len; size_t dname_len; if(!decompress_rr_into_buffer(scratch_buffer, pkt, pktlen, dname, rr_type, rr_class, rr_ttl, rr_data, rr_rdlen)) { log_err("could not decompress RR"); return 0; } rr = sldns_buffer_begin(scratch_buffer); rr_len = sldns_buffer_limit(scratch_buffer); dname_len = dname_valid(rr, rr_len); return az_remove_rr(z, rr, rr_len, dname_len, nonexist); } /** * Parse zonefile * @param z: zone to read in. * @param in: file to read from (just opened). * @param rr: buffer to use for RRs, 64k. * passed so that recursive includes can use the same buffer and do * not grow the stack too much. * @param rrbuflen: sizeof rr buffer. 
* @param state: parse state with $ORIGIN, $TTL and 'prev-dname' and so on, * that is kept between includes. * The lineno is set at 1 and then increased by the function. * @param fname: file name. * @param depth: recursion depth for includes * @param cfg: config for chroot. * returns false on failure, has printed an error message */ static int az_parse_file(struct auth_zone* z, FILE* in, uint8_t* rr, size_t rrbuflen, struct sldns_file_parse_state* state, char* fname, int depth, struct config_file* cfg) { size_t rr_len, dname_len; int status; state->lineno = 1; while(!feof(in)) { rr_len = rrbuflen; dname_len = 0; status = sldns_fp2wire_rr_buf(in, rr, &rr_len, &dname_len, state); if(status == LDNS_WIREPARSE_ERR_INCLUDE && rr_len == 0) { /* we have $INCLUDE or $something */ if(strncmp((char*)rr, "$INCLUDE ", 9) == 0 || strncmp((char*)rr, "$INCLUDE\t", 9) == 0) { FILE* inc; int lineno_orig = state->lineno; char* incfile = (char*)rr + 8; if(depth > MAX_INCLUDE_DEPTH) { log_err("%s:%d max include depth" "exceeded", fname, state->lineno); return 0; } /* skip spaces */ while(*incfile == ' ' || *incfile == '\t') incfile++; /* adjust for chroot on include file */ if(cfg->chrootdir && cfg->chrootdir[0] && strncmp(incfile, cfg->chrootdir, strlen(cfg->chrootdir)) == 0) incfile += strlen(cfg->chrootdir); incfile = strdup(incfile); if(!incfile) { log_err("malloc failure"); return 0; } verbose(VERB_ALGO, "opening $INCLUDE %s", incfile); inc = fopen(incfile, "r"); if(!inc) { log_err("%s:%d cannot open include " "file %s: %s", fname, lineno_orig, incfile, strerror(errno)); free(incfile); return 0; } /* recurse read that file now */ if(!az_parse_file(z, inc, rr, rrbuflen, state, incfile, depth+1, cfg)) { log_err("%s:%d cannot parse include " "file %s", fname, lineno_orig, incfile); fclose(inc); free(incfile); return 0; } fclose(inc); verbose(VERB_ALGO, "done with $INCLUDE %s", incfile); free(incfile); state->lineno = lineno_orig; } continue; } if(status != 0) { log_err("parse error %s 
%d:%d: %s", fname, state->lineno, LDNS_WIREPARSE_OFFSET(status), sldns_get_errorstr_parse(status)); return 0; } if(rr_len == 0) { /* EMPTY line, TTL or ORIGIN */ continue; } /* insert wirerr in rrbuf */ if(!az_insert_rr(z, rr, rr_len, dname_len, NULL)) { char buf[17]; sldns_wire2str_type_buf(sldns_wirerr_get_type(rr, rr_len, dname_len), buf, sizeof(buf)); log_err("%s:%d cannot insert RR of type %s", fname, state->lineno, buf); return 0; } } return 1; } int auth_zone_read_zonefile(struct auth_zone* z, struct config_file* cfg) { uint8_t rr[LDNS_RR_BUF_SIZE]; struct sldns_file_parse_state state; char* zfilename; FILE* in; if(!z || !z->zonefile || z->zonefile[0]==0) return 1; /* no file, or "", nothing to read */ zfilename = z->zonefile; if(cfg->chrootdir && cfg->chrootdir[0] && strncmp(zfilename, cfg->chrootdir, strlen(cfg->chrootdir)) == 0) zfilename += strlen(cfg->chrootdir); if(verbosity >= VERB_ALGO) { char nm[255+1]; dname_str(z->name, nm); verbose(VERB_ALGO, "read zonefile %s for %s", zfilename, nm); } in = fopen(zfilename, "r"); if(!in) { char* n = sldns_wire2str_dname(z->name, z->namelen); if(z->zone_is_slave && errno == ENOENT) { /* we fetch the zone contents later, no file yet */ verbose(VERB_ALGO, "no zonefile %s for %s", zfilename, n?n:"error"); free(n); return 1; } log_err("cannot open zonefile %s for %s: %s", zfilename, n?n:"error", strerror(errno)); free(n); return 0; } /* clear the data tree */ traverse_postorder(&z->data, auth_data_del, NULL); rbtree_init(&z->data, &auth_data_cmp); /* clear the RPZ policies */ if(z->rpz) rpz_clear(z->rpz); memset(&state, 0, sizeof(state)); /* default TTL to 3600 */ state.default_ttl = 3600; /* set $ORIGIN to the zone name */ if(z->namelen <= sizeof(state.origin)) { memcpy(state.origin, z->name, z->namelen); state.origin_len = z->namelen; } /* parse the (toplevel) file */ if(!az_parse_file(z, in, rr, sizeof(rr), &state, zfilename, 0, cfg)) { char* n = sldns_wire2str_dname(z->name, z->namelen); log_err("error parsing 
zonefile %s for %s", zfilename, n?n:"error"); free(n); fclose(in); return 0; } fclose(in); if(z->rpz) rpz_finish_config(z->rpz); return 1; } /** write buffer to file and check return codes */ static int write_out(FILE* out, const char* str, size_t len) { size_t r; if(len == 0) return 1; r = fwrite(str, 1, len, out); if(r == 0) { log_err("write failed: %s", strerror(errno)); return 0; } else if(r < len) { log_err("write failed: too short (disk full?)"); return 0; } return 1; } /** convert auth rr to string */ static int auth_rr_to_string(uint8_t* nm, size_t nmlen, uint16_t tp, uint16_t cl, struct packed_rrset_data* data, size_t i, char* s, size_t buflen) { int w = 0; size_t slen = buflen, datlen; uint8_t* dat; if(i >= data->count) tp = LDNS_RR_TYPE_RRSIG; dat = nm; datlen = nmlen; w += sldns_wire2str_dname_scan(&dat, &datlen, &s, &slen, NULL, 0, NULL); w += sldns_str_print(&s, &slen, "\t"); w += sldns_str_print(&s, &slen, "%lu\t", (unsigned long)data->rr_ttl[i]); w += sldns_wire2str_class_print(&s, &slen, cl); w += sldns_str_print(&s, &slen, "\t"); w += sldns_wire2str_type_print(&s, &slen, tp); w += sldns_str_print(&s, &slen, "\t"); datlen = data->rr_len[i]-2; dat = data->rr_data[i]+2; w += sldns_wire2str_rdata_scan(&dat, &datlen, &s, &slen, tp, NULL, 0, NULL); if(tp == LDNS_RR_TYPE_DNSKEY) { w += sldns_str_print(&s, &slen, " ;{id = %u}", sldns_calc_keytag_raw(data->rr_data[i]+2, data->rr_len[i]-2)); } w += sldns_str_print(&s, &slen, "\n"); if(w >= (int)buflen) { log_nametypeclass(NO_VERBOSE, "RR too long to print", nm, tp, cl); return 0; } return 1; } /** write rrset to file */ static int auth_zone_write_rrset(struct auth_zone* z, struct auth_data* node, struct auth_rrset* r, FILE* out) { size_t i, count = r->data->count + r->data->rrsig_count; char buf[LDNS_RR_BUF_SIZE]; for(i=0; iname, node->namelen, r->type, z->dclass, r->data, i, buf, sizeof(buf))) { verbose(VERB_ALGO, "failed to rr2str rr %d", (int)i); continue; } if(!write_out(out, buf, strlen(buf))) return 
0; } return 1; } /** write domain to file */ static int auth_zone_write_domain(struct auth_zone* z, struct auth_data* n, FILE* out) { struct auth_rrset* r; /* if this is zone apex, write SOA first */ if(z->namelen == n->namelen) { struct auth_rrset* soa = az_domain_rrset(n, LDNS_RR_TYPE_SOA); if(soa) { if(!auth_zone_write_rrset(z, n, soa, out)) return 0; } } /* write all the RRsets for this domain */ for(r = n->rrsets; r; r = r->next) { if(z->namelen == n->namelen && r->type == LDNS_RR_TYPE_SOA) continue; /* skip SOA here */ if(!auth_zone_write_rrset(z, n, r, out)) return 0; } return 1; } int auth_zone_write_file(struct auth_zone* z, const char* fname) { FILE* out; struct auth_data* n; out = fopen(fname, "w"); if(!out) { log_err("could not open %s: %s", fname, strerror(errno)); return 0; } RBTREE_FOR(n, struct auth_data*, &z->data) { if(!auth_zone_write_domain(z, n, out)) { log_err("could not write domain to %s", fname); fclose(out); return 0; } } fclose(out); return 1; } /** read all auth zones from file (if they have) */ static int auth_zones_read_zones(struct auth_zones* az, struct config_file* cfg) { struct auth_zone* z; lock_rw_wrlock(&az->lock); RBTREE_FOR(z, struct auth_zone*, &az->ztree) { lock_rw_wrlock(&z->lock); if(!auth_zone_read_zonefile(z, cfg)) { lock_rw_unlock(&z->lock); lock_rw_unlock(&az->lock); return 0; } lock_rw_unlock(&z->lock); } lock_rw_unlock(&az->lock); return 1; } /** find serial number of zone or false if none */ int auth_zone_get_serial(struct auth_zone* z, uint32_t* serial) { struct auth_data* apex; struct auth_rrset* soa; struct packed_rrset_data* d; apex = az_find_name(z, z->name, z->namelen); if(!apex) return 0; soa = az_domain_rrset(apex, LDNS_RR_TYPE_SOA); if(!soa || soa->data->count==0) return 0; /* no RRset or no RRs in rrset */ if(soa->data->rr_len[0] < 2+4*5) return 0; /* SOA too short */ d = soa->data; *serial = sldns_read_uint32(d->rr_data[0]+(d->rr_len[0]-20)); return 1; } /** Find auth_zone SOA and populate the values in 
xfr(soa values). */ static int xfr_find_soa(struct auth_zone* z, struct auth_xfer* xfr) { struct auth_data* apex; struct auth_rrset* soa; struct packed_rrset_data* d; apex = az_find_name(z, z->name, z->namelen); if(!apex) return 0; soa = az_domain_rrset(apex, LDNS_RR_TYPE_SOA); if(!soa || soa->data->count==0) return 0; /* no RRset or no RRs in rrset */ if(soa->data->rr_len[0] < 2+4*5) return 0; /* SOA too short */ /* SOA record ends with serial, refresh, retry, expiry, minimum, * as 4 byte fields */ d = soa->data; xfr->have_zone = 1; xfr->serial = sldns_read_uint32(d->rr_data[0]+(d->rr_len[0]-20)); xfr->refresh = sldns_read_uint32(d->rr_data[0]+(d->rr_len[0]-16)); xfr->retry = sldns_read_uint32(d->rr_data[0]+(d->rr_len[0]-12)); xfr->expiry = sldns_read_uint32(d->rr_data[0]+(d->rr_len[0]-8)); /* soa minimum at d->rr_len[0]-4 */ return 1; } /** * Setup auth_xfer zone * This populates the have_zone, soa values, and so on times. * Doesn't do network traffic yet, can set option flags. * @param z: locked by caller, and modified for setup * @param x: locked by caller, and modified. * @return false on failure. */ static int auth_xfer_setup(struct auth_zone* z, struct auth_xfer* x) { /* for a zone without zone transfers, x==NULL, so skip them, * i.e. the zone config is fixed with no masters or urls */ if(!z || !x) return 1; if(!xfr_find_soa(z, x)) { return 1; } /* nothing for probe, nextprobe and transfer tasks */ return 1; } /** * Setup all zones * @param az: auth zones structure * @return false on failure. 
*/ static int auth_zones_setup_zones(struct auth_zones* az) { struct auth_zone* z; struct auth_xfer* x; lock_rw_wrlock(&az->lock); RBTREE_FOR(z, struct auth_zone*, &az->ztree) { lock_rw_wrlock(&z->lock); x = auth_xfer_find(az, z->name, z->namelen, z->dclass); if(x) { lock_basic_lock(&x->lock); } if(!auth_xfer_setup(z, x)) { if(x) { lock_basic_unlock(&x->lock); } lock_rw_unlock(&z->lock); lock_rw_unlock(&az->lock); return 0; } if(x) { lock_basic_unlock(&x->lock); } lock_rw_unlock(&z->lock); } lock_rw_unlock(&az->lock); return 1; } /** set config items and create zones */ static int auth_zones_cfg(struct auth_zones* az, struct config_auth* c) { struct auth_zone* z; struct auth_xfer* x = NULL; /* create zone */ if(c->isrpz) { /* if the rpz lock is needed, grab it before the other * locks to avoid a lock dependency cycle */ lock_rw_wrlock(&az->rpz_lock); } lock_rw_wrlock(&az->lock); if(!(z=auth_zones_find_or_add_zone(az, c->name))) { lock_rw_unlock(&az->lock); if(c->isrpz) { lock_rw_unlock(&az->rpz_lock); } return 0; } if(c->masters || c->urls) { if(!(x=auth_zones_find_or_add_xfer(az, z))) { lock_rw_unlock(&az->lock); lock_rw_unlock(&z->lock); if(c->isrpz) { lock_rw_unlock(&az->rpz_lock); } return 0; } } if(c->for_downstream) az->have_downstream = 1; lock_rw_unlock(&az->lock); /* set options */ z->zone_deleted = 0; if(!auth_zone_set_zonefile(z, c->zonefile)) { if(x) { lock_basic_unlock(&x->lock); } lock_rw_unlock(&z->lock); if(c->isrpz) { lock_rw_unlock(&az->rpz_lock); } return 0; } z->for_downstream = c->for_downstream; z->for_upstream = c->for_upstream; z->fallback_enabled = c->fallback_enabled; if(c->isrpz && !z->rpz){ if(!(z->rpz = rpz_create(c))){ fatal_exit("Could not setup RPZ zones"); return 0; } lock_protect(&z->lock, &z->rpz->local_zones, sizeof(*z->rpz)); /* the az->rpz_lock is locked above */ z->rpz_az_next = az->rpz_first; if(az->rpz_first) az->rpz_first->rpz_az_prev = z; az->rpz_first = z; } if(c->isrpz) { lock_rw_unlock(&az->rpz_lock); } /* xfer zone */ 
if(x) { z->zone_is_slave = 1; /* set options on xfer zone */ if(!xfer_set_masters(&x->task_probe->masters, c, 0)) { lock_basic_unlock(&x->lock); lock_rw_unlock(&z->lock); return 0; } if(!xfer_set_masters(&x->task_transfer->masters, c, 1)) { lock_basic_unlock(&x->lock); lock_rw_unlock(&z->lock); return 0; } lock_basic_unlock(&x->lock); } lock_rw_unlock(&z->lock); return 1; } /** set all auth zones deleted, then in auth_zones_cfg, it marks them * as nondeleted (if they are still in the config), and then later * we can find deleted zones */ static void az_setall_deleted(struct auth_zones* az) { struct auth_zone* z; lock_rw_wrlock(&az->lock); RBTREE_FOR(z, struct auth_zone*, &az->ztree) { lock_rw_wrlock(&z->lock); z->zone_deleted = 1; lock_rw_unlock(&z->lock); } lock_rw_unlock(&az->lock); } /** find zones that are marked deleted and delete them. * This is called from apply_cfg, and there are no threads and no * workers, so the xfr can just be deleted. */ static void az_delete_deleted_zones(struct auth_zones* az) { struct auth_zone* z; struct auth_zone* delete_list = NULL, *next; struct auth_xfer* xfr; lock_rw_wrlock(&az->lock); RBTREE_FOR(z, struct auth_zone*, &az->ztree) { lock_rw_wrlock(&z->lock); if(z->zone_deleted) { /* we cannot alter the rbtree right now, but * we can put it on a linked list and then * delete it */ z->delete_next = delete_list; delete_list = z; } lock_rw_unlock(&z->lock); } /* now we are out of the tree loop and we can loop and delete * the zones */ z = delete_list; while(z) { next = z->delete_next; xfr = auth_xfer_find(az, z->name, z->namelen, z->dclass); if(xfr) { (void)rbtree_delete(&az->xtree, &xfr->node); auth_xfer_delete(xfr); } (void)rbtree_delete(&az->ztree, &z->node); auth_zone_delete(z, az); z = next; } lock_rw_unlock(&az->lock); } int auth_zones_apply_cfg(struct auth_zones* az, struct config_file* cfg, int setup, int* is_rpz) { struct config_auth* p; az_setall_deleted(az); for(p = cfg->auths; p; p = p->next) { if(!p->name || p->name[0] 
== 0) { log_warn("auth-zone without a name, skipped"); continue; } *is_rpz = (*is_rpz || p->isrpz); if(!auth_zones_cfg(az, p)) { log_err("cannot config auth zone %s", p->name); return 0; } } az_delete_deleted_zones(az); if(!auth_zones_read_zones(az, cfg)) return 0; if(setup) { if(!auth_zones_setup_zones(az)) return 0; } return 1; } /** delete chunks * @param at: transfer structure with chunks list. The chunks and their * data are freed. */ static void auth_chunks_delete(struct auth_transfer* at) { if(at->chunks_first) { struct auth_chunk* c, *cn; c = at->chunks_first; while(c) { cn = c->next; free(c->data); free(c); c = cn; } } at->chunks_first = NULL; at->chunks_last = NULL; } /** free master addr list */ static void auth_free_master_addrs(struct auth_addr* list) { struct auth_addr *n; while(list) { n = list->next; free(list); list = n; } } /** free the masters list */ static void auth_free_masters(struct auth_master* list) { struct auth_master* n; while(list) { n = list->next; auth_free_master_addrs(list->list); free(list->host); free(list->file); free(list); list = n; } } /** delete auth xfer structure * @param xfr: delete this xfer and its tasks. 
*/ static void auth_xfer_delete(struct auth_xfer* xfr) { if(!xfr) return; lock_basic_destroy(&xfr->lock); free(xfr->name); if(xfr->task_nextprobe) { comm_timer_delete(xfr->task_nextprobe->timer); free(xfr->task_nextprobe); } if(xfr->task_probe) { auth_free_masters(xfr->task_probe->masters); comm_point_delete(xfr->task_probe->cp); comm_timer_delete(xfr->task_probe->timer); free(xfr->task_probe); } if(xfr->task_transfer) { auth_free_masters(xfr->task_transfer->masters); comm_point_delete(xfr->task_transfer->cp); comm_timer_delete(xfr->task_transfer->timer); if(xfr->task_transfer->chunks_first) { auth_chunks_delete(xfr->task_transfer); } free(xfr->task_transfer); } auth_free_masters(xfr->allow_notify_list); free(xfr); } /** helper traverse to delete zones */ static void auth_zone_del(rbnode_type* n, void* ATTR_UNUSED(arg)) { struct auth_zone* z = (struct auth_zone*)n->key; auth_zone_delete(z, NULL); } /** helper traverse to delete xfer zones */ static void auth_xfer_del(rbnode_type* n, void* ATTR_UNUSED(arg)) { struct auth_xfer* z = (struct auth_xfer*)n->key; auth_xfer_delete(z); } void auth_zones_delete(struct auth_zones* az) { if(!az) return; lock_rw_destroy(&az->lock); lock_rw_destroy(&az->rpz_lock); traverse_postorder(&az->ztree, auth_zone_del, NULL); traverse_postorder(&az->xtree, auth_xfer_del, NULL); free(az); } /** true if domain has only nsec3 */ static int domain_has_only_nsec3(struct auth_data* n) { struct auth_rrset* rrset = n->rrsets; int nsec3_seen = 0; while(rrset) { if(rrset->type == LDNS_RR_TYPE_NSEC3) { nsec3_seen = 1; } else if(rrset->type != LDNS_RR_TYPE_RRSIG) { return 0; } rrset = rrset->next; } return nsec3_seen; } /** see if the domain has a wildcard child '*.domain' */ static struct auth_data* az_find_wildcard_domain(struct auth_zone* z, uint8_t* nm, size_t nmlen) { uint8_t wc[LDNS_MAX_DOMAINLEN]; if(nmlen+2 > sizeof(wc)) return NULL; /* result would be too long */ wc[0] = 1; /* length of wildcard label */ wc[1] = (uint8_t)'*'; /* wildcard 
label */ memmove(wc+2, nm, nmlen); return az_find_name(z, wc, nmlen+2); } /** find wildcard between qname and cename */ static struct auth_data* az_find_wildcard(struct auth_zone* z, struct query_info* qinfo, struct auth_data* ce) { uint8_t* nm = qinfo->qname; size_t nmlen = qinfo->qname_len; struct auth_data* node; if(!dname_subdomain_c(nm, z->name)) return NULL; /* out of zone */ while((node=az_find_wildcard_domain(z, nm, nmlen))==NULL) { /* see if we can go up to find the wildcard */ if(nmlen == z->namelen) return NULL; /* top of zone reached */ if(ce && nmlen == ce->namelen) return NULL; /* ce reached */ if(dname_is_root(nm)) return NULL; /* cannot go up */ dname_remove_label(&nm, &nmlen); } return node; } /** domain is not exact, find first candidate ce (name that matches * a part of qname) in tree */ static struct auth_data* az_find_candidate_ce(struct auth_zone* z, struct query_info* qinfo, struct auth_data* n) { uint8_t* nm; size_t nmlen; if(n) { nm = dname_get_shared_topdomain(qinfo->qname, n->name); } else { nm = qinfo->qname; } dname_count_size_labels(nm, &nmlen); n = az_find_name(z, nm, nmlen); /* delete labels and go up on name */ while(!n) { if(dname_is_root(nm)) return NULL; /* cannot go up */ dname_remove_label(&nm, &nmlen); n = az_find_name(z, nm, nmlen); } return n; } /** go up the auth tree to next existing name. */ static struct auth_data* az_domain_go_up(struct auth_zone* z, struct auth_data* n) { uint8_t* nm = n->name; size_t nmlen = n->namelen; while(!dname_is_root(nm)) { dname_remove_label(&nm, &nmlen); if((n=az_find_name(z, nm, nmlen)) != NULL) return n; } return NULL; } /** Find the closest encloser, an name that exists and is above the * qname. * return true if the node (param node) is existing, nonobscured and * can be used to generate answers from. It is then also node_exact. * returns false if the node is not good enough (or it wasn't node_exact) * in this case the ce can be filled. 
* if ce is NULL, no ce exists, and likely the zone is completely empty,
 *	not even with a zone apex.
 * if ce is nonNULL it is the closest enclosing upper name (that exists
 *	itself for answer purposes).  That name may have DNAME, NS or wildcard
 * rrset is the closest DNAME or NS rrset that was found.
 * @param z: the zone.
 * @param qinfo: the query, its qname, qname_len and qtype are used.
 * @param node: the node that was looked up for the qname, can be NULL
 *	if it is not an exact match.
 * @param node_exact: true if node is an exact match for the qname.
 * @param ce: output, set to NULL at the start.
 * @param rrset: output, set to NULL at the start.
 * @return node_exact (cleared for an nsec3-only node) if the walk up the
 *	tree finds no delegation or DNAME, false otherwise.
 */
static int
az_find_ce(struct auth_zone* z, struct query_info* qinfo,
	struct auth_data* node, int node_exact, struct auth_data** ce,
	struct auth_rrset** rrset)
{
	struct auth_data* n = node;
	*ce = NULL;
	*rrset = NULL;
	if(!node_exact) {
		/* if not exact, lookup closest exact match */
		n = az_find_candidate_ce(z, qinfo, n);
	} else {
		/* if exact, the node itself is the first candidate ce */
		*ce = n;
	}

	/* no direct answer from nsec3-only domains */
	if(n && domain_has_only_nsec3(n)) {
		node_exact = 0;
		*ce = NULL;
	}

	/* with exact matches, walk up the labels until we find the
	 * delegation, or DNAME or zone end */
	/* NOTE: *rrset is assigned inside the conditions below, so it is
	 * overwritten by every lookup that gets evaluated, also when the
	 * branch is then not taken. */
	while(n) {
		/* see if the current candidate has issues */
		/* not zone apex and has type NS */
		if(n->namelen != z->namelen &&
			(*rrset=az_domain_rrset(n, LDNS_RR_TYPE_NS)) &&
			/* delegate here, but DS at exact the dp has notype */
			(qinfo->qtype != LDNS_RR_TYPE_DS ||
			n->namelen != qinfo->qname_len)) {
			/* referral */
			/* this is ce and the lowernode is nonexisting */
			*ce = n;
			return 0;
		}
		/* not equal to qname and has type DNAME */
		if(n->namelen != qinfo->qname_len &&
			(*rrset=az_domain_rrset(n, LDNS_RR_TYPE_DNAME))) {
			/* this is ce and the lowernode is nonexisting */
			*ce = n;
			return 0;
		}

		if(*ce == NULL && !domain_has_only_nsec3(n)) {
			/* if not found yet, this exact name must be
			 * our lowest match (but not nsec3onlydomain) */
			*ce = n;
		}

		/* walk up the tree by removing labels from name and lookup */
		n = az_domain_go_up(z, n);
	}
	/* found no problems, if it was an exact node, it is fine to use */
	return node_exact;
}

/** add additional A/AAAA from domain names in rrset rdata (+offset)
 * offset is number of bytes in rdata where the dname is located.
*/ static int az_add_additionals_from(struct auth_zone* z, struct regional* region, struct dns_msg* msg, struct auth_rrset* rrset, size_t offset) { struct packed_rrset_data* d = rrset->data; size_t i; if(!d) return 0; for(i=0; icount; i++) { size_t dlen; struct auth_data* domain; struct auth_rrset* ref; if(d->rr_len[i] < 2+offset) continue; /* too short */ if(!(dlen = dname_valid(d->rr_data[i]+2+offset, d->rr_len[i]-2-offset))) continue; /* malformed */ domain = az_find_name(z, d->rr_data[i]+2+offset, dlen); if(!domain) continue; if((ref=az_domain_rrset(domain, LDNS_RR_TYPE_A)) != NULL) { if(!msg_add_rrset_ar(z, region, msg, domain, ref)) return 0; } if((ref=az_domain_rrset(domain, LDNS_RR_TYPE_AAAA)) != NULL) { if(!msg_add_rrset_ar(z, region, msg, domain, ref)) return 0; } } return 1; } /** add negative SOA record (with negative TTL) */ static int az_add_negative_soa(struct auth_zone* z, struct regional* region, struct dns_msg* msg) { uint32_t minimum; struct packed_rrset_data* d; struct auth_rrset* soa; struct auth_data* apex = az_find_name(z, z->name, z->namelen); if(!apex) return 0; soa = az_domain_rrset(apex, LDNS_RR_TYPE_SOA); if(!soa) return 0; /* must be first to put in message; we want to fix the TTL with * one RRset here, otherwise we'd need to loop over the RRs to get * the resulting lower TTL */ log_assert(msg->rep->rrset_count == 0); if(!msg_add_rrset_ns(z, region, msg, apex, soa)) return 0; /* fixup TTL */ d = (struct packed_rrset_data*)msg->rep->rrsets[msg->rep->rrset_count-1]->entry.data; /* last 4 bytes are minimum ttl in network format */ if(d->count == 0) return 0; if(d->rr_len[0] < 2+4) return 0; minimum = sldns_read_uint32(d->rr_data[0]+(d->rr_len[0]-4)); d->ttl = (time_t)minimum; d->rr_ttl[0] = (time_t)minimum; msg->rep->ttl = get_rrset_ttl(msg->rep->rrsets[0]); msg->rep->prefetch_ttl = PREFETCH_TTL_CALC(msg->rep->ttl); msg->rep->serve_expired_ttl = msg->rep->ttl + SERVE_EXPIRED_TTL; return 1; } /** See if the query goes to empty nonterminal 
(that has no auth_data, * but there are nodes underneath. We already checked that there are * not NS, or DNAME above, so that we only need to check if some node * exists below (with nonempty rr list), return true if emptynonterminal */ static int az_empty_nonterminal(struct auth_zone* z, struct query_info* qinfo, struct auth_data* node) { struct auth_data* next; if(!node) { /* no smaller was found, use first (smallest) node as the * next one */ next = (struct auth_data*)rbtree_first(&z->data); } else { next = (struct auth_data*)rbtree_next(&node->node); } while(next && (rbnode_type*)next != RBTREE_NULL && next->rrsets == NULL) { /* the next name has empty rrsets, is an empty nonterminal * itself, see if there exists something below it */ next = (struct auth_data*)rbtree_next(&node->node); } if((rbnode_type*)next == RBTREE_NULL || !next) { /* there is no next node, so something below it cannot * exist */ return 0; } /* a next node exists, if there was something below the query, * this node has to be it. See if it is below the query name */ if(dname_strict_subdomain_c(next->name, qinfo->qname)) return 1; return 0; } /** create synth cname target name in buffer, or fail if too long */ static size_t synth_cname_buf(uint8_t* qname, size_t qname_len, size_t dname_len, uint8_t* dtarg, size_t dtarglen, uint8_t* buf, size_t buflen) { size_t newlen = qname_len + dtarglen - dname_len; if(newlen > buflen) { /* YXDOMAIN error */ return 0; } /* new name is concatenation of qname front (without DNAME owner) * and DNAME target name */ memcpy(buf, qname, qname_len-dname_len); memmove(buf+(qname_len-dname_len), dtarg, dtarglen); return newlen; } /** create synthetic CNAME rrset for in a DNAME answer in region, * false on alloc failure, cname==NULL when name too long. 
*/ static int create_synth_cname(uint8_t* qname, size_t qname_len, struct regional* region, struct auth_data* node, struct auth_rrset* dname, uint16_t dclass, struct ub_packed_rrset_key** cname) { uint8_t buf[LDNS_MAX_DOMAINLEN]; uint8_t* dtarg; size_t dtarglen, newlen; struct packed_rrset_data* d; /* get DNAME target name */ if(dname->data->count < 1) return 0; if(dname->data->rr_len[0] < 3) return 0; /* at least rdatalen +1 */ dtarg = dname->data->rr_data[0]+2; dtarglen = dname->data->rr_len[0]-2; if(sldns_read_uint16(dname->data->rr_data[0]) != dtarglen) return 0; /* rdatalen in DNAME rdata is malformed */ if(dname_valid(dtarg, dtarglen) != dtarglen) return 0; /* DNAME RR has malformed rdata */ if(qname_len == 0) return 0; /* too short */ if(qname_len <= node->namelen) return 0; /* qname too short for dname removal */ /* synthesize a CNAME */ newlen = synth_cname_buf(qname, qname_len, node->namelen, dtarg, dtarglen, buf, sizeof(buf)); if(newlen == 0) { /* YXDOMAIN error */ *cname = NULL; return 1; } *cname = (struct ub_packed_rrset_key*)regional_alloc(region, sizeof(struct ub_packed_rrset_key)); if(!*cname) return 0; /* out of memory */ memset(&(*cname)->entry, 0, sizeof((*cname)->entry)); (*cname)->entry.key = (*cname); (*cname)->rk.type = htons(LDNS_RR_TYPE_CNAME); (*cname)->rk.rrset_class = htons(dclass); (*cname)->rk.flags = 0; (*cname)->rk.dname = regional_alloc_init(region, qname, qname_len); if(!(*cname)->rk.dname) return 0; /* out of memory */ (*cname)->rk.dname_len = qname_len; (*cname)->entry.hash = rrset_key_hash(&(*cname)->rk); d = (struct packed_rrset_data*)regional_alloc_zero(region, sizeof(struct packed_rrset_data) + sizeof(size_t) + sizeof(uint8_t*) + sizeof(time_t) + sizeof(uint16_t) + newlen); if(!d) return 0; /* out of memory */ (*cname)->entry.data = d; d->ttl = 0; /* 0 for synthesized CNAME TTL */ d->count = 1; d->rrsig_count = 0; d->trust = rrset_trust_ans_noAA; d->rr_len = (size_t*)((uint8_t*)d + sizeof(struct packed_rrset_data)); 
d->rr_len[0] = newlen + sizeof(uint16_t); packed_rrset_ptr_fixup(d); d->rr_ttl[0] = d->ttl; sldns_write_uint16(d->rr_data[0], newlen); memmove(d->rr_data[0] + sizeof(uint16_t), buf, newlen); return 1; } /** add a synthesized CNAME to the answer section */ static int add_synth_cname(struct auth_zone* z, uint8_t* qname, size_t qname_len, struct regional* region, struct dns_msg* msg, struct auth_data* dname, struct auth_rrset* rrset) { struct ub_packed_rrset_key* cname; /* synthesize a CNAME */ if(!create_synth_cname(qname, qname_len, region, dname, rrset, z->dclass, &cname)) { /* out of memory */ return 0; } if(!cname) { /* cname cannot be create because of YXDOMAIN */ msg->rep->flags |= LDNS_RCODE_YXDOMAIN; return 1; } /* add cname to message */ if(!msg_grow_array(region, msg)) return 0; msg->rep->rrsets[msg->rep->rrset_count] = cname; msg->rep->rrset_count++; msg->rep->an_numrrsets++; msg_ttl(msg); return 1; } /** Change a dname to a different one, for wildcard namechange */ static void az_change_dnames(struct dns_msg* msg, uint8_t* oldname, uint8_t* newname, size_t newlen, int an_only) { size_t i; size_t start = 0, end = msg->rep->rrset_count; if(!an_only) start = msg->rep->an_numrrsets; if(an_only) end = msg->rep->an_numrrsets; for(i=start; irep->rrsets[i]->rk.dname, oldname) == 0) { msg->rep->rrsets[i]->rk.dname = newname; msg->rep->rrsets[i]->rk.dname_len = newlen; } } } /** find NSEC record covering the query */ static struct auth_rrset* az_find_nsec_cover(struct auth_zone* z, struct auth_data** node) { uint8_t* nm = (*node)->name; size_t nmlen = (*node)->namelen; struct auth_rrset* rrset; /* find the NSEC for the smallest-or-equal node */ /* if node == NULL, we did not find a smaller name. But the zone * name is the smallest name and should have an NSEC. 
So there is * no NSEC to return (for a properly signed zone) */ /* for empty nonterminals, the auth-data node should not exist, * and thus we don't need to go rbtree_previous here to find * a domain with an NSEC record */ /* but there could be glue, and if this is node, then it has no NSEC. * Go up to find nonglue (previous) NSEC-holding nodes */ while((rrset=az_domain_rrset(*node, LDNS_RR_TYPE_NSEC)) == NULL) { if(dname_is_root(nm)) return NULL; if(nmlen == z->namelen) return NULL; dname_remove_label(&nm, &nmlen); /* adjust *node for the nsec rrset to find in */ *node = az_find_name(z, nm, nmlen); } return rrset; } /** Find NSEC and add for wildcard denial */ static int az_nsec_wildcard_denial(struct auth_zone* z, struct regional* region, struct dns_msg* msg, uint8_t* cenm, size_t cenmlen) { struct query_info qinfo; int node_exact; struct auth_data* node; struct auth_rrset* nsec; uint8_t wc[LDNS_MAX_DOMAINLEN]; if(cenmlen+2 > sizeof(wc)) return 0; /* result would be too long */ wc[0] = 1; /* length of wildcard label */ wc[1] = (uint8_t)'*'; /* wildcard label */ memmove(wc+2, cenm, cenmlen); /* we have '*.ce' in wc wildcard name buffer */ /* get nsec cover for that */ qinfo.qname = wc; qinfo.qname_len = cenmlen+2; qinfo.qtype = 0; qinfo.qclass = 0; az_find_domain(z, &qinfo, &node_exact, &node); if((nsec=az_find_nsec_cover(z, &node)) != NULL) { if(!msg_add_rrset_ns(z, region, msg, node, nsec)) return 0; } return 1; } /** Find the NSEC3PARAM rrset (if any) and if true you have the parameters */ static int az_nsec3_param(struct auth_zone* z, int* algo, size_t* iter, uint8_t** salt, size_t* saltlen) { struct auth_data* apex; struct auth_rrset* param; size_t i; apex = az_find_name(z, z->name, z->namelen); if(!apex) return 0; param = az_domain_rrset(apex, LDNS_RR_TYPE_NSEC3PARAM); if(!param || param->data->count==0) return 0; /* no RRset or no RRs in rrset */ /* find out which NSEC3PARAM RR has supported parameters */ /* skip unknown flags (dynamic signer is 
recalculating nsec3 chain) */ for(i=0; idata->count; i++) { uint8_t* rdata = param->data->rr_data[i]+2; size_t rdatalen = param->data->rr_len[i]; if(rdatalen < 2+5) continue; /* too short */ if(!nsec3_hash_algo_size_supported((int)(rdata[0]))) continue; /* unsupported algo */ if(rdatalen < (size_t)(2+5+(size_t)rdata[4])) continue; /* salt missing */ if((rdata[1]&NSEC3_UNKNOWN_FLAGS)!=0) continue; /* unknown flags */ *algo = (int)(rdata[0]); *iter = sldns_read_uint16(rdata+2); *saltlen = rdata[4]; if(*saltlen == 0) *salt = NULL; else *salt = rdata+5; return 1; } /* no supported params */ return 0; } /** Hash a name with nsec3param into buffer, it has zone name appended. * return length of hash */ static size_t az_nsec3_hash(uint8_t* buf, size_t buflen, uint8_t* nm, size_t nmlen, int algo, size_t iter, uint8_t* salt, size_t saltlen) { size_t hlen = nsec3_hash_algo_size_supported(algo); /* buffer has domain name, nsec3hash, and 256 is for max saltlen * (salt has 0-255 length) */ unsigned char p[LDNS_MAX_DOMAINLEN+1+N3HASHBUFLEN+256]; size_t i; if(nmlen+saltlen > sizeof(p) || hlen+saltlen > sizeof(p)) return 0; if(hlen > buflen) return 0; /* somehow too large for destination buffer */ /* hashfunc(name, salt) */ memmove(p, nm, nmlen); query_dname_tolower(p); if(salt && saltlen > 0) memmove(p+nmlen, salt, saltlen); (void)secalgo_nsec3_hash(algo, p, nmlen+saltlen, (unsigned char*)buf); for(i=0; i 0) memmove(p+hlen, salt, saltlen); (void)secalgo_nsec3_hash(algo, p, hlen+saltlen, (unsigned char*)buf); } return hlen; } /** Hash name and return b32encoded hashname for lookup, zone name appended */ static int az_nsec3_hashname(struct auth_zone* z, uint8_t* hashname, size_t* hashnmlen, uint8_t* nm, size_t nmlen, int algo, size_t iter, uint8_t* salt, size_t saltlen) { uint8_t hash[N3HASHBUFLEN]; size_t hlen; int ret; hlen = az_nsec3_hash(hash, sizeof(hash), nm, nmlen, algo, iter, salt, saltlen); if(!hlen) return 0; /* b32 encode */ if(*hashnmlen < hlen*2+1+z->namelen) /* approx 
b32 as hexb16 */ return 0; ret = sldns_b32_ntop_extended_hex(hash, hlen, (char*)(hashname+1), (*hashnmlen)-1); if(ret<1) return 0; hashname[0] = (uint8_t)ret; ret++; if((*hashnmlen) - ret < z->namelen) return 0; memmove(hashname+ret, z->name, z->namelen); *hashnmlen = z->namelen+(size_t)ret; return 1; } /** Find the datanode that covers the nsec3hash-name */ static struct auth_data* az_nsec3_findnode(struct auth_zone* z, uint8_t* hashnm, size_t hashnmlen) { struct query_info qinfo; struct auth_data* node; int node_exact; qinfo.qclass = 0; qinfo.qtype = 0; qinfo.qname = hashnm; qinfo.qname_len = hashnmlen; /* because canonical ordering and b32 nsec3 ordering are the same. * this is a good lookup to find the nsec3 name. */ az_find_domain(z, &qinfo, &node_exact, &node); /* but we may have to skip non-nsec3 nodes */ /* this may be a lot, the way to speed that up is to have a * separate nsec3 tree with nsec3 nodes */ while(node && (rbnode_type*)node != RBTREE_NULL && !az_domain_rrset(node, LDNS_RR_TYPE_NSEC3)) { node = (struct auth_data*)rbtree_previous(&node->node); } if((rbnode_type*)node == RBTREE_NULL) node = NULL; return node; } /** Find cover for hashed(nm, nmlen) (or NULL) */ static struct auth_data* az_nsec3_find_cover(struct auth_zone* z, uint8_t* nm, size_t nmlen, int algo, size_t iter, uint8_t* salt, size_t saltlen) { struct auth_data* node; uint8_t hname[LDNS_MAX_DOMAINLEN]; size_t hlen = sizeof(hname); if(!az_nsec3_hashname(z, hname, &hlen, nm, nmlen, algo, iter, salt, saltlen)) return NULL; node = az_nsec3_findnode(z, hname, hlen); if(node) return node; /* we did not find any, perhaps because the NSEC3 hash is before * the first hash, we have to find the 'last hash' in the zone */ node = (struct auth_data*)rbtree_last(&z->data); while(node && (rbnode_type*)node != RBTREE_NULL && !az_domain_rrset(node, LDNS_RR_TYPE_NSEC3)) { node = (struct auth_data*)rbtree_previous(&node->node); } if((rbnode_type*)node == RBTREE_NULL) node = NULL; return node; } /** Find 
exact match for hashed(nm, nmlen) NSEC3 record or NULL */ static struct auth_data* az_nsec3_find_exact(struct auth_zone* z, uint8_t* nm, size_t nmlen, int algo, size_t iter, uint8_t* salt, size_t saltlen) { struct auth_data* node; uint8_t hname[LDNS_MAX_DOMAINLEN]; size_t hlen = sizeof(hname); if(!az_nsec3_hashname(z, hname, &hlen, nm, nmlen, algo, iter, salt, saltlen)) return NULL; node = az_find_name(z, hname, hlen); if(az_domain_rrset(node, LDNS_RR_TYPE_NSEC3)) return node; return NULL; } /** Return nextcloser name (as a ref into the qname). This is one label * more than the cenm (cename must be a suffix of qname) */ static void az_nsec3_get_nextcloser(uint8_t* cenm, uint8_t* qname, size_t qname_len, uint8_t** nx, size_t* nxlen) { int celabs = dname_count_labels(cenm); int qlabs = dname_count_labels(qname); int strip = qlabs - celabs -1; log_assert(dname_strict_subdomain(qname, qlabs, cenm, celabs)); *nx = qname; *nxlen = qname_len; if(strip>0) dname_remove_labels(nx, nxlen, strip); } /** Find the closest encloser that has exact NSEC3. * updated cenm to the new name. If it went up no-exact-ce is true. */ static struct auth_data* az_nsec3_find_ce(struct auth_zone* z, uint8_t** cenm, size_t* cenmlen, int* no_exact_ce, int algo, size_t iter, uint8_t* salt, size_t saltlen) { struct auth_data* node; while((node = az_nsec3_find_exact(z, *cenm, *cenmlen, algo, iter, salt, saltlen)) == NULL) { if(*cenmlen == z->namelen) { /* next step up would take us out of the zone. 
fail */ return NULL; } *no_exact_ce = 1; dname_remove_label(cenm, cenmlen); } return node; } /* Insert NSEC3 record in authority section, if NULL does nothing */ static int az_nsec3_insert(struct auth_zone* z, struct regional* region, struct dns_msg* msg, struct auth_data* node) { struct auth_rrset* nsec3; if(!node) return 1; /* no node, skip this */ nsec3 = az_domain_rrset(node, LDNS_RR_TYPE_NSEC3); if(!nsec3) return 1; /* if no nsec3 RR, skip it */ if(!msg_add_rrset_ns(z, region, msg, node, nsec3)) return 0; return 1; } /** add NSEC3 records to the zone for the nsec3 proof. * Specify with the flags with parts of the proof are required. * the ce is the exact matching name (for notype) but also delegation points. * qname is the one where the nextcloser name can be derived from. * If NSEC3 is not properly there (in the zone) nothing is added. * always enabled: include nsec3 proving about the Closest Encloser. * that is an exact match that should exist for it. * If that does not exist, a higher exact match + nxproof is enabled * (for some sort of opt-out empty nonterminal cases). * nodataproof: search for exact match and include that instead. * ceproof: include ce proof NSEC3 (omitted for wildcard replies). * nxproof: include denial of the qname. * wcproof: include denial of wildcard (wildcard.ce). */ static int az_add_nsec3_proof(struct auth_zone* z, struct regional* region, struct dns_msg* msg, uint8_t* cenm, size_t cenmlen, uint8_t* qname, size_t qname_len, int nodataproof, int ceproof, int nxproof, int wcproof) { int algo; size_t iter, saltlen; uint8_t* salt; int no_exact_ce = 0; struct auth_data* node; /* find parameters of nsec3 proof */ if(!az_nsec3_param(z, &algo, &iter, &salt, &saltlen)) return 1; /* no nsec3 */ if(nodataproof) { /* see if the node has a hash of itself for the nodata * proof nsec3, this has to be an exact match nsec3. 
*/ struct auth_data* match; match = az_nsec3_find_exact(z, qname, qname_len, algo, iter, salt, saltlen); if(match) { if(!az_nsec3_insert(z, region, msg, match)) return 0; /* only nodata NSEC3 needed, no CE or others. */ return 1; } } /* find ce that has an NSEC3 */ if(ceproof) { node = az_nsec3_find_ce(z, &cenm, &cenmlen, &no_exact_ce, algo, iter, salt, saltlen); if(no_exact_ce) nxproof = 1; if(!az_nsec3_insert(z, region, msg, node)) return 0; } if(nxproof) { uint8_t* nx; size_t nxlen; /* create nextcloser domain name */ az_nsec3_get_nextcloser(cenm, qname, qname_len, &nx, &nxlen); /* find nsec3 that matches or covers it */ node = az_nsec3_find_cover(z, nx, nxlen, algo, iter, salt, saltlen); if(!az_nsec3_insert(z, region, msg, node)) return 0; } if(wcproof) { /* create wildcard name *.ce */ uint8_t wc[LDNS_MAX_DOMAINLEN]; size_t wclen; if(cenmlen+2 > sizeof(wc)) return 0; /* result would be too long */ wc[0] = 1; /* length of wildcard label */ wc[1] = (uint8_t)'*'; /* wildcard label */ memmove(wc+2, cenm, cenmlen); wclen = cenmlen+2; /* find nsec3 that matches or covers it */ node = az_nsec3_find_cover(z, wc, wclen, algo, iter, salt, saltlen); if(!az_nsec3_insert(z, region, msg, node)) return 0; } return 1; } /** generate answer for positive answer */ static int az_generate_positive_answer(struct auth_zone* z, struct regional* region, struct dns_msg* msg, struct auth_data* node, struct auth_rrset* rrset) { if(!msg_add_rrset_an(z, region, msg, node, rrset)) return 0; /* see if we want additional rrs */ if(rrset->type == LDNS_RR_TYPE_MX) { if(!az_add_additionals_from(z, region, msg, rrset, 2)) return 0; } else if(rrset->type == LDNS_RR_TYPE_SRV) { if(!az_add_additionals_from(z, region, msg, rrset, 6)) return 0; } else if(rrset->type == LDNS_RR_TYPE_NS) { if(!az_add_additionals_from(z, region, msg, rrset, 0)) return 0; } return 1; } /** generate answer for type ANY answer */ static int az_generate_any_answer(struct auth_zone* z, struct regional* region, struct 
dns_msg* msg, struct auth_data* node) { struct auth_rrset* rrset; int added = 0; /* add a couple (at least one) RRs */ if((rrset=az_domain_rrset(node, LDNS_RR_TYPE_SOA)) != NULL) { if(!msg_add_rrset_an(z, region, msg, node, rrset)) return 0; added++; } if((rrset=az_domain_rrset(node, LDNS_RR_TYPE_MX)) != NULL) { if(!msg_add_rrset_an(z, region, msg, node, rrset)) return 0; added++; } if((rrset=az_domain_rrset(node, LDNS_RR_TYPE_A)) != NULL) { if(!msg_add_rrset_an(z, region, msg, node, rrset)) return 0; added++; } if((rrset=az_domain_rrset(node, LDNS_RR_TYPE_AAAA)) != NULL) { if(!msg_add_rrset_an(z, region, msg, node, rrset)) return 0; added++; } if(added == 0 && node && node->rrsets) { if(!msg_add_rrset_an(z, region, msg, node, node->rrsets)) return 0; } return 1; } /** follow cname chain and add more data to the answer section */ static int follow_cname_chain(struct auth_zone* z, uint16_t qtype, struct regional* region, struct dns_msg* msg, struct packed_rrset_data* d) { int maxchain = 0; /* see if we can add the target of the CNAME into the answer */ while(maxchain++ < MAX_CNAME_CHAIN) { struct auth_data* node; struct auth_rrset* rrset; size_t clen; /* d has cname rdata */ if(d->count == 0) break; /* no CNAME */ if(d->rr_len[0] < 2+1) break; /* too small */ if((clen=dname_valid(d->rr_data[0]+2, d->rr_len[0]-2))==0) break; /* malformed */ if(!dname_subdomain_c(d->rr_data[0]+2, z->name)) break; /* target out of zone */ if((node = az_find_name(z, d->rr_data[0]+2, clen))==NULL) break; /* no such target name */ if((rrset=az_domain_rrset(node, qtype))!=NULL) { /* done we found the target */ if(!msg_add_rrset_an(z, region, msg, node, rrset)) return 0; break; } if((rrset=az_domain_rrset(node, LDNS_RR_TYPE_CNAME))==NULL) break; /* no further CNAME chain, notype */ if(!msg_add_rrset_an(z, region, msg, node, rrset)) return 0; d = rrset->data; } return 1; } /** generate answer for cname answer */ static int az_generate_cname_answer(struct auth_zone* z, struct query_info* 
qinfo, struct regional* region, struct dns_msg* msg, struct auth_data* node, struct auth_rrset* rrset) { if(!msg_add_rrset_an(z, region, msg, node, rrset)) return 0; if(!rrset) return 1; if(!follow_cname_chain(z, qinfo->qtype, region, msg, rrset->data)) return 0; return 1; } /** generate answer for notype answer */ static int az_generate_notype_answer(struct auth_zone* z, struct regional* region, struct dns_msg* msg, struct auth_data* node) { struct auth_rrset* rrset; if(!az_add_negative_soa(z, region, msg)) return 0; /* DNSSEC denial NSEC */ if((rrset=az_domain_rrset(node, LDNS_RR_TYPE_NSEC))!=NULL) { if(!msg_add_rrset_ns(z, region, msg, node, rrset)) return 0; } else if(node) { /* DNSSEC denial NSEC3 */ if(!az_add_nsec3_proof(z, region, msg, node->name, node->namelen, msg->qinfo.qname, msg->qinfo.qname_len, 1, 1, 0, 0)) return 0; } return 1; } /** generate answer for referral answer */ static int az_generate_referral_answer(struct auth_zone* z, struct regional* region, struct dns_msg* msg, struct auth_data* ce, struct auth_rrset* rrset) { struct auth_rrset* ds, *nsec; /* turn off AA flag, referral is nonAA because it leaves the zone */ log_assert(ce); msg->rep->flags &= ~BIT_AA; if(!msg_add_rrset_ns(z, region, msg, ce, rrset)) return 0; /* add DS or deny it */ if((ds=az_domain_rrset(ce, LDNS_RR_TYPE_DS))!=NULL) { if(!msg_add_rrset_ns(z, region, msg, ce, ds)) return 0; } else { /* deny the DS */ if((nsec=az_domain_rrset(ce, LDNS_RR_TYPE_NSEC))!=NULL) { if(!msg_add_rrset_ns(z, region, msg, ce, nsec)) return 0; } else { if(!az_add_nsec3_proof(z, region, msg, ce->name, ce->namelen, msg->qinfo.qname, msg->qinfo.qname_len, 1, 1, 0, 0)) return 0; } } /* add additional rrs for type NS */ if(!az_add_additionals_from(z, region, msg, rrset, 0)) return 0; return 1; } /** generate answer for DNAME answer */ static int az_generate_dname_answer(struct auth_zone* z, struct query_info* qinfo, struct regional* region, struct dns_msg* msg, struct auth_data* ce, struct auth_rrset* 
rrset) { log_assert(ce); /* add the DNAME and then a CNAME */ if(!msg_add_rrset_an(z, region, msg, ce, rrset)) return 0; if(!add_synth_cname(z, qinfo->qname, qinfo->qname_len, region, msg, ce, rrset)) return 0; if(FLAGS_GET_RCODE(msg->rep->flags) == LDNS_RCODE_YXDOMAIN) return 1; if(msg->rep->rrset_count == 0 || !msg->rep->rrsets[msg->rep->rrset_count-1]) return 0; if(!follow_cname_chain(z, qinfo->qtype, region, msg, (struct packed_rrset_data*)msg->rep->rrsets[ msg->rep->rrset_count-1]->entry.data)) return 0; return 1; } /** generate answer for wildcard answer */ static int az_generate_wildcard_answer(struct auth_zone* z, struct query_info* qinfo, struct regional* region, struct dns_msg* msg, struct auth_data* ce, struct auth_data* wildcard, struct auth_data* node) { struct auth_rrset* rrset, *nsec; int insert_ce = 0; if((rrset=az_domain_rrset(wildcard, qinfo->qtype)) != NULL) { /* wildcard has type, add it */ if(!msg_add_rrset_an(z, region, msg, wildcard, rrset)) return 0; az_change_dnames(msg, wildcard->name, msg->qinfo.qname, msg->qinfo.qname_len, 1); } else if((rrset=az_domain_rrset(wildcard, LDNS_RR_TYPE_CNAME))!=NULL) { /* wildcard has cname instead, do that */ if(!msg_add_rrset_an(z, region, msg, wildcard, rrset)) return 0; az_change_dnames(msg, wildcard->name, msg->qinfo.qname, msg->qinfo.qname_len, 1); if(!follow_cname_chain(z, qinfo->qtype, region, msg, rrset->data)) return 0; } else if(qinfo->qtype == LDNS_RR_TYPE_ANY && wildcard->rrsets) { /* add ANY rrsets from wildcard node */ if(!az_generate_any_answer(z, region, msg, wildcard)) return 0; az_change_dnames(msg, wildcard->name, msg->qinfo.qname, msg->qinfo.qname_len, 1); } else { /* wildcard has nodata, notype answer */ /* call other notype routine for dnssec notype denials */ if(!az_generate_notype_answer(z, region, msg, wildcard)) return 0; /* because the notype, there is no positive data with an * RRSIG that indicates the wildcard position. Thus the * wildcard qname denial needs to have a CE nsec3. 
*/ insert_ce = 1; } /* ce and node for dnssec denial of wildcard original name */ if((nsec=az_find_nsec_cover(z, &node)) != NULL) { if(!msg_add_rrset_ns(z, region, msg, node, nsec)) return 0; } else if(ce) { uint8_t* wildup = wildcard->name; size_t wilduplen= wildcard->namelen; dname_remove_label(&wildup, &wilduplen); if(!az_add_nsec3_proof(z, region, msg, wildup, wilduplen, msg->qinfo.qname, msg->qinfo.qname_len, 0, insert_ce, 1, 0)) return 0; } /* fixup name of wildcard from *.zone to qname, use already allocated * pointer to msg qname */ az_change_dnames(msg, wildcard->name, msg->qinfo.qname, msg->qinfo.qname_len, 0); return 1; } /** generate answer for nxdomain answer */ static int az_generate_nxdomain_answer(struct auth_zone* z, struct regional* region, struct dns_msg* msg, struct auth_data* ce, struct auth_data* node) { struct auth_rrset* nsec; msg->rep->flags |= LDNS_RCODE_NXDOMAIN; if(!az_add_negative_soa(z, region, msg)) return 0; if((nsec=az_find_nsec_cover(z, &node)) != NULL) { if(!msg_add_rrset_ns(z, region, msg, node, nsec)) return 0; if(ce && !az_nsec_wildcard_denial(z, region, msg, ce->name, ce->namelen)) return 0; } else if(ce) { if(!az_add_nsec3_proof(z, region, msg, ce->name, ce->namelen, msg->qinfo.qname, msg->qinfo.qname_len, 0, 1, 1, 1)) return 0; } return 1; } /** Create answers when an exact match exists for the domain name */ static int az_generate_answer_with_node(struct auth_zone* z, struct query_info* qinfo, struct regional* region, struct dns_msg* msg, struct auth_data* node) { struct auth_rrset* rrset; /* positive answer, rrset we are looking for exists */ if((rrset=az_domain_rrset(node, qinfo->qtype)) != NULL) { return az_generate_positive_answer(z, region, msg, node, rrset); } /* CNAME? */ if((rrset=az_domain_rrset(node, LDNS_RR_TYPE_CNAME)) != NULL) { return az_generate_cname_answer(z, qinfo, region, msg, node, rrset); } /* type ANY ? 
*/ if(qinfo->qtype == LDNS_RR_TYPE_ANY) { return az_generate_any_answer(z, region, msg, node); } /* NOERROR/NODATA (no such type at domain name) */ return az_generate_notype_answer(z, region, msg, node); } /** Generate answer without an existing-node that we can use. * So it'll be a referral, DNAME or nxdomain */ static int az_generate_answer_nonexistnode(struct auth_zone* z, struct query_info* qinfo, struct regional* region, struct dns_msg* msg, struct auth_data* ce, struct auth_rrset* rrset, struct auth_data* node) { struct auth_data* wildcard; /* we do not have an exact matching name (that exists) */ /* see if we have a NS or DNAME in the ce */ if(ce && rrset && rrset->type == LDNS_RR_TYPE_NS) { return az_generate_referral_answer(z, region, msg, ce, rrset); } if(ce && rrset && rrset->type == LDNS_RR_TYPE_DNAME) { return az_generate_dname_answer(z, qinfo, region, msg, ce, rrset); } /* if there is an empty nonterminal, wildcard and nxdomain don't * happen, it is a notype answer */ if(az_empty_nonterminal(z, qinfo, node)) { return az_generate_notype_answer(z, region, msg, node); } /* see if we have a wildcard under the ce */ if((wildcard=az_find_wildcard(z, qinfo, ce)) != NULL) { return az_generate_wildcard_answer(z, qinfo, region, msg, ce, wildcard, node); } /* generate nxdomain answer */ return az_generate_nxdomain_answer(z, region, msg, ce, node); } /** Lookup answer in a zone. */ static int auth_zone_generate_answer(struct auth_zone* z, struct query_info* qinfo, struct regional* region, struct dns_msg** msg, int* fallback) { struct auth_data* node, *ce; struct auth_rrset* rrset; int node_exact, node_exists; /* does the zone want fallback in case of failure? */ *fallback = z->fallback_enabled; if(!(*msg=msg_create(region, qinfo))) return 0; /* lookup if there is a matching domain name for the query */ az_find_domain(z, qinfo, &node_exact, &node); /* see if node exists for generating answers from (i.e. 
not glue and * obscured by NS or DNAME or NSEC3-only), and also return the * closest-encloser from that, closest node that should be used * to generate answers from that is above the query */ node_exists = az_find_ce(z, qinfo, node, node_exact, &ce, &rrset); if(verbosity >= VERB_ALGO) { char zname[256], qname[256], nname[256], cename[256], tpstr[32], rrstr[32]; sldns_wire2str_dname_buf(qinfo->qname, qinfo->qname_len, qname, sizeof(qname)); sldns_wire2str_type_buf(qinfo->qtype, tpstr, sizeof(tpstr)); sldns_wire2str_dname_buf(z->name, z->namelen, zname, sizeof(zname)); if(node) sldns_wire2str_dname_buf(node->name, node->namelen, nname, sizeof(nname)); else snprintf(nname, sizeof(nname), "NULL"); if(ce) sldns_wire2str_dname_buf(ce->name, ce->namelen, cename, sizeof(cename)); else snprintf(cename, sizeof(cename), "NULL"); if(rrset) sldns_wire2str_type_buf(rrset->type, rrstr, sizeof(rrstr)); else snprintf(rrstr, sizeof(rrstr), "NULL"); log_info("auth_zone %s query %s %s, domain %s %s %s, " "ce %s, rrset %s", zname, qname, tpstr, nname, (node_exact?"exact":"notexact"), (node_exists?"exist":"notexist"), cename, rrstr); } if(node_exists) { /* the node is fine, generate answer from node */ return az_generate_answer_with_node(z, qinfo, region, *msg, node); } return az_generate_answer_nonexistnode(z, qinfo, region, *msg, ce, rrset, node); } int auth_zones_lookup(struct auth_zones* az, struct query_info* qinfo, struct regional* region, struct dns_msg** msg, int* fallback, uint8_t* dp_nm, size_t dp_nmlen) { int r; struct auth_zone* z; /* find the zone that should contain the answer. 
*/ lock_rw_rdlock(&az->lock); z = auth_zone_find(az, dp_nm, dp_nmlen, qinfo->qclass); if(!z) { lock_rw_unlock(&az->lock); /* no auth zone, fallback to internet */ *fallback = 1; return 0; } lock_rw_rdlock(&z->lock); lock_rw_unlock(&az->lock); /* if not for upstream queries, fallback */ if(!z->for_upstream) { lock_rw_unlock(&z->lock); *fallback = 1; return 0; } if(z->zone_expired) { *fallback = z->fallback_enabled; lock_rw_unlock(&z->lock); return 0; } /* see what answer that zone would generate */ r = auth_zone_generate_answer(z, qinfo, region, msg, fallback); lock_rw_unlock(&z->lock); return r; } /** encode auth answer */ static void auth_answer_encode(struct query_info* qinfo, struct module_env* env, struct edns_data* edns, struct comm_reply* repinfo, sldns_buffer* buf, struct regional* temp, struct dns_msg* msg) { uint16_t udpsize; udpsize = edns->udp_size; edns->edns_version = EDNS_ADVERTISED_VERSION; edns->udp_size = EDNS_ADVERTISED_SIZE; edns->ext_rcode = 0; edns->bits &= EDNS_DO; if(!inplace_cb_reply_local_call(env, qinfo, NULL, msg->rep, (int)FLAGS_GET_RCODE(msg->rep->flags), edns, repinfo, temp) || !reply_info_answer_encode(qinfo, msg->rep, *(uint16_t*)sldns_buffer_begin(buf), sldns_buffer_read_u16_at(buf, 2), buf, 0, 0, temp, udpsize, edns, (int)(edns->bits&EDNS_DO), 0)) { error_encode(buf, (LDNS_RCODE_SERVFAIL|BIT_AA), qinfo, *(uint16_t*)sldns_buffer_begin(buf), sldns_buffer_read_u16_at(buf, 2), edns); } } /** encode auth error answer */ static void auth_error_encode(struct query_info* qinfo, struct module_env* env, struct edns_data* edns, struct comm_reply* repinfo, sldns_buffer* buf, struct regional* temp, int rcode) { edns->edns_version = EDNS_ADVERTISED_VERSION; edns->udp_size = EDNS_ADVERTISED_SIZE; edns->ext_rcode = 0; edns->bits &= EDNS_DO; if(!inplace_cb_reply_local_call(env, qinfo, NULL, NULL, rcode, edns, repinfo, temp)) edns->opt_list = NULL; error_encode(buf, rcode|BIT_AA, qinfo, *(uint16_t*)sldns_buffer_begin(buf), 
sldns_buffer_read_u16_at(buf, 2), edns); } int auth_zones_answer(struct auth_zones* az, struct module_env* env, struct query_info* qinfo, struct edns_data* edns, struct comm_reply* repinfo, struct sldns_buffer* buf, struct regional* temp) { struct dns_msg* msg = NULL; struct auth_zone* z; int r; int fallback = 0; lock_rw_rdlock(&az->lock); if(!az->have_downstream) { /* no downstream auth zones */ lock_rw_unlock(&az->lock); return 0; } if(qinfo->qtype == LDNS_RR_TYPE_DS) { uint8_t* delname = qinfo->qname; size_t delnamelen = qinfo->qname_len; dname_remove_label(&delname, &delnamelen); z = auth_zones_find_zone(az, delname, delnamelen, qinfo->qclass); } else { z = auth_zones_find_zone(az, qinfo->qname, qinfo->qname_len, qinfo->qclass); } if(!z) { /* no zone above it */ lock_rw_unlock(&az->lock); return 0; } lock_rw_rdlock(&z->lock); lock_rw_unlock(&az->lock); if(!z->for_downstream) { lock_rw_unlock(&z->lock); return 0; } if(z->zone_expired) { if(z->fallback_enabled) { lock_rw_unlock(&z->lock); return 0; } lock_rw_unlock(&z->lock); lock_rw_wrlock(&az->lock); az->num_query_down++; lock_rw_unlock(&az->lock); auth_error_encode(qinfo, env, edns, repinfo, buf, temp, LDNS_RCODE_SERVFAIL); return 1; } /* answer it from zone z */ r = auth_zone_generate_answer(z, qinfo, temp, &msg, &fallback); lock_rw_unlock(&z->lock); if(!r && fallback) { /* fallback to regular answering (recursive) */ return 0; } lock_rw_wrlock(&az->lock); az->num_query_down++; lock_rw_unlock(&az->lock); /* encode answer */ if(!r) auth_error_encode(qinfo, env, edns, repinfo, buf, temp, LDNS_RCODE_SERVFAIL); else auth_answer_encode(qinfo, env, edns, repinfo, buf, temp, msg); return 1; } int auth_zones_can_fallback(struct auth_zones* az, uint8_t* nm, size_t nmlen, uint16_t dclass) { int r; struct auth_zone* z; lock_rw_rdlock(&az->lock); z = auth_zone_find(az, nm, nmlen, dclass); if(!z) { lock_rw_unlock(&az->lock); /* no such auth zone, fallback */ return 1; } lock_rw_rdlock(&z->lock); lock_rw_unlock(&az->lock); 
r = z->fallback_enabled || (!z->for_upstream); lock_rw_unlock(&z->lock); return r; } int auth_zone_parse_notify_serial(sldns_buffer* pkt, uint32_t *serial) { struct query_info q; uint16_t rdlen; memset(&q, 0, sizeof(q)); sldns_buffer_set_position(pkt, 0); if(!query_info_parse(&q, pkt)) return 0; if(LDNS_ANCOUNT(sldns_buffer_begin(pkt)) == 0) return 0; /* skip name of RR in answer section */ if(sldns_buffer_remaining(pkt) < 1) return 0; if(pkt_dname_len(pkt) == 0) return 0; /* check type */ if(sldns_buffer_remaining(pkt) < 10 /* type,class,ttl,rdatalen*/) return 0; if(sldns_buffer_read_u16(pkt) != LDNS_RR_TYPE_SOA) return 0; sldns_buffer_skip(pkt, 2); /* class */ sldns_buffer_skip(pkt, 4); /* ttl */ rdlen = sldns_buffer_read_u16(pkt); /* rdatalen */ if(sldns_buffer_remaining(pkt) < rdlen) return 0; if(rdlen < 22) return 0; /* bad soa length */ sldns_buffer_skip(pkt, (ssize_t)(rdlen-20)); *serial = sldns_buffer_read_u32(pkt); /* return true when has serial in answer section */ return 1; } /** see if addr appears in the list */ static int addr_in_list(struct auth_addr* list, struct sockaddr_storage* addr, socklen_t addrlen) { struct auth_addr* p; for(p=list; p; p=p->next) { if(sockaddr_cmp_addr(addr, addrlen, &p->addr, p->addrlen)==0) return 1; } return 0; } /** check if an address matches a master specification (or one of its * addresses in the addr list) */ static int addr_matches_master(struct auth_master* master, struct sockaddr_storage* addr, socklen_t addrlen, struct auth_master** fromhost) { struct sockaddr_storage a; socklen_t alen = 0; int net = 0; if(addr_in_list(master->list, addr, addrlen)) { *fromhost = master; return 1; } /* compare address (but not port number, that is the destination * port of the master, the port number of the received notify is * allowed to by any port on that master) */ if(extstrtoaddr(master->host, &a, &alen) && sockaddr_cmp_addr(addr, addrlen, &a, alen)==0) { *fromhost = master; return 1; } /* prefixes, addr/len, like 10.0.0.0/8 
*/ /* not http and has a / and there is one / */ if(master->allow_notify && !master->http && strchr(master->host, '/') != NULL && strchr(master->host, '/') == strrchr(master->host, '/') && netblockstrtoaddr(master->host, UNBOUND_DNS_PORT, &a, &alen, &net) && alen == addrlen) { if(addr_in_common(addr, (addr_is_ip6(addr, addrlen)?128:32), &a, net, alen) >= net) { *fromhost = NULL; /* prefix does not have destination to send the probe or transfer with */ return 1; /* matches the netblock */ } } return 0; } /** check access list for notifies */ static int az_xfr_allowed_notify(struct auth_xfer* xfr, struct sockaddr_storage* addr, socklen_t addrlen, struct auth_master** fromhost) { struct auth_master* p; for(p=xfr->allow_notify_list; p; p=p->next) { if(addr_matches_master(p, addr, addrlen, fromhost)) { return 1; } } return 0; } /** see if the serial means the zone has to be updated, i.e. the serial * is newer than the zone serial, or we have no zone */ static int xfr_serial_means_update(struct auth_xfer* xfr, uint32_t serial) { if(!xfr->have_zone) return 1; /* no zone, anything is better */ if(xfr->zone_expired) return 1; /* expired, the sent serial is better than expired data */ if(compare_serial(xfr->serial, serial) < 0) return 1; /* our serial is smaller than the sent serial, the data is newer, fetch it */ return 0; } /** note notify serial, updates the notify information in the xfr struct */ static void xfr_note_notify_serial(struct auth_xfer* xfr, int has_serial, uint32_t serial) { if(xfr->notify_received && xfr->notify_has_serial && has_serial) { /* see if this serial is newer */ if(compare_serial(xfr->notify_serial, serial) < 0) xfr->notify_serial = serial; } else if(xfr->notify_received && xfr->notify_has_serial && !has_serial) { /* remove serial, we have notify without serial */ xfr->notify_has_serial = 0; xfr->notify_serial = 0; } else if(xfr->notify_received && !xfr->notify_has_serial) { /* we already have notify without serial, keep it * that way; no serial 
check when current operation
		 * is done */
	} else {
		/* first notify that is noted: store it as received */
		xfr->notify_received = 1;
		xfr->notify_has_serial = has_serial;
		xfr->notify_serial = serial;
	}
}

/** process a notify serial, start new probe or note serial. xfr is locked.
 * On every path xfr->lock is unlocked before return: here, or by
 * xfr_start_probe when that returns true. */
static void
xfr_process_notify(struct auth_xfer* xfr, struct module_env* env,
	int has_serial, uint32_t serial, struct auth_master* fromhost)
{
	/* if the serial of notify is older than we have, don't fetch
	 * a zone, we already have it */
	if(has_serial && !xfr_serial_means_update(xfr, serial)) {
		lock_basic_unlock(&xfr->lock);
		return;
	}
	/* start new probe with this addr src, or note serial */
	if(!xfr_start_probe(xfr, env, fromhost)) {
		/* not started because already in progress, note the serial */
		xfr_note_notify_serial(xfr, has_serial, serial);
		lock_basic_unlock(&xfr->lock);
	}
	/* successful end of start_probe unlocked xfr->lock */
}

/**
 * Handle a received notify for a zone.
 * @param refused: set to 1 when the notify is refused, because there is
 *	no xfer structure for the zone or the address is not allowed.
 *	It is not written on the accepted path.
 * @return 1 if the notify was accepted and processed, 0 if refused.
 */
int auth_zones_notify(struct auth_zones* az, struct module_env* env,
	uint8_t* nm, size_t nmlen, uint16_t dclass,
	struct sockaddr_storage* addr, socklen_t addrlen, int has_serial,
	uint32_t serial, int* refused)
{
	struct auth_xfer* xfr;
	struct auth_master* fromhost = NULL;
	/* see which zone this is */
	lock_rw_rdlock(&az->lock);
	xfr = auth_xfer_find(az, nm, nmlen, dclass);
	if(!xfr) {
		lock_rw_unlock(&az->lock);
		/* no such zone, refuse the notify */
		*refused = 1;
		return 0;
	}
	/* take the xfr lock before letting go of the auth_zones lock */
	lock_basic_lock(&xfr->lock);
	lock_rw_unlock(&az->lock);

	/* check access list for notifies */
	if(!az_xfr_allowed_notify(xfr, addr, addrlen, &fromhost)) {
		lock_basic_unlock(&xfr->lock);
		/* notify not allowed, refuse the notify */
		*refused = 1;
		return 0;
	}

	/* process the notify; this unlocks xfr->lock */
	xfr_process_notify(xfr, env, has_serial, serial, fromhost);
	return 1;
}

/* Returns 0 when there is no xfer structure for the name and class. */
int auth_zones_startprobesequence(struct auth_zones* az,
	struct module_env* env, uint8_t* nm, size_t nmlen, uint16_t dclass)
{
	struct auth_xfer* xfr;
	lock_rw_rdlock(&az->lock);
	xfr = auth_xfer_find(az, nm, nmlen, dclass);
	if(!xfr) {
		lock_rw_unlock(&az->lock);
		return 0;
	}
	lock_basic_lock(&xfr->lock);
lock_rw_unlock(&az->lock); xfr_process_notify(xfr, env, 0, 0, NULL); return 1; } /** set a zone expired */ static void auth_xfer_set_expired(struct auth_xfer* xfr, struct module_env* env, int expired) { struct auth_zone* z; /* expire xfr */ lock_basic_lock(&xfr->lock); xfr->zone_expired = expired; lock_basic_unlock(&xfr->lock); /* find auth_zone */ lock_rw_rdlock(&env->auth_zones->lock); z = auth_zone_find(env->auth_zones, xfr->name, xfr->namelen, xfr->dclass); if(!z) { lock_rw_unlock(&env->auth_zones->lock); return; } lock_rw_wrlock(&z->lock); lock_rw_unlock(&env->auth_zones->lock); /* expire auth_zone */ z->zone_expired = expired; lock_rw_unlock(&z->lock); } /** find master (from notify or probe) in list of masters */ static struct auth_master* find_master_by_host(struct auth_master* list, char* host) { struct auth_master* p; for(p=list; p; p=p->next) { if(strcmp(p->host, host) == 0) return p; } return NULL; } /** delete the looked up auth_addrs for all the masters in the list */ static void xfr_masterlist_free_addrs(struct auth_master* list) { struct auth_master* m; for(m=list; m; m=m->next) { if(m->list) { auth_free_master_addrs(m->list); m->list = NULL; } } } /** copy a list of auth_addrs */ static struct auth_addr* auth_addr_list_copy(struct auth_addr* source) { struct auth_addr* list = NULL, *last = NULL; struct auth_addr* p; for(p=source; p; p=p->next) { struct auth_addr* a = (struct auth_addr*)memdup(p, sizeof(*p)); if(!a) { log_err("malloc failure"); auth_free_master_addrs(list); return NULL; } a->next = NULL; if(last) last->next = a; if(!list) list = a; last = a; } return list; } /** copy a master to a new structure, NULL on alloc failure */ static struct auth_master* auth_master_copy(struct auth_master* o) { struct auth_master* m; if(!o) return NULL; m = (struct auth_master*)memdup(o, sizeof(*o)); if(!m) { log_err("malloc failure"); return NULL; } m->next = NULL; if(m->host) { m->host = strdup(m->host); if(!m->host) { free(m); log_err("malloc failure"); 
return NULL; } } if(m->file) { m->file = strdup(m->file); if(!m->file) { free(m->host); free(m); log_err("malloc failure"); return NULL; } } if(m->list) { m->list = auth_addr_list_copy(m->list); if(!m->list) { free(m->file); free(m->host); free(m); return NULL; } } return m; } /** copy the master addresses from the task_probe lookups to the allow_notify * list of masters */ static void probe_copy_masters_for_allow_notify(struct auth_xfer* xfr) { struct auth_master* list = NULL, *last = NULL; struct auth_master* p; /* build up new list with copies */ for(p = xfr->task_probe->masters; p; p=p->next) { struct auth_master* m = auth_master_copy(p); if(!m) { auth_free_masters(list); /* failed because of malloc failure, use old list */ return; } m->next = NULL; if(last) last->next = m; if(!list) list = m; last = m; } /* success, replace list */ auth_free_masters(xfr->allow_notify_list); xfr->allow_notify_list = list; } /** start the lookups for task_transfer */ static void xfr_transfer_start_lookups(struct auth_xfer* xfr) { /* delete all the looked up addresses in the list */ xfr->task_transfer->scan_addr = NULL; xfr_masterlist_free_addrs(xfr->task_transfer->masters); /* start lookup at the first master */ xfr->task_transfer->lookup_target = xfr->task_transfer->masters; xfr->task_transfer->lookup_aaaa = 0; } /** move to the next lookup of hostname for task_transfer */ static void xfr_transfer_move_to_next_lookup(struct auth_xfer* xfr, struct module_env* env) { if(!xfr->task_transfer->lookup_target) return; /* already at end of list */ if(!xfr->task_transfer->lookup_aaaa && env->cfg->do_ip6) { /* move to lookup AAAA */ xfr->task_transfer->lookup_aaaa = 1; return; } xfr->task_transfer->lookup_target = xfr->task_transfer->lookup_target->next; xfr->task_transfer->lookup_aaaa = 0; if(!env->cfg->do_ip4 && xfr->task_transfer->lookup_target!=NULL) xfr->task_transfer->lookup_aaaa = 1; } /** start the lookups for task_probe */ static void xfr_probe_start_lookups(struct auth_xfer* 
xfr) { /* delete all the looked up addresses in the list */ xfr->task_probe->scan_addr = NULL; xfr_masterlist_free_addrs(xfr->task_probe->masters); /* start lookup at the first master */ xfr->task_probe->lookup_target = xfr->task_probe->masters; xfr->task_probe->lookup_aaaa = 0; } /** move to the next lookup of hostname for task_probe */ static void xfr_probe_move_to_next_lookup(struct auth_xfer* xfr, struct module_env* env) { if(!xfr->task_probe->lookup_target) return; /* already at end of list */ if(!xfr->task_probe->lookup_aaaa && env->cfg->do_ip6) { /* move to lookup AAAA */ xfr->task_probe->lookup_aaaa = 1; return; } xfr->task_probe->lookup_target = xfr->task_probe->lookup_target->next; xfr->task_probe->lookup_aaaa = 0; if(!env->cfg->do_ip4 && xfr->task_probe->lookup_target!=NULL) xfr->task_probe->lookup_aaaa = 1; } /** start the iteration of the task_transfer list of masters */ static void xfr_transfer_start_list(struct auth_xfer* xfr, struct auth_master* spec) { if(spec) { xfr->task_transfer->scan_specific = find_master_by_host( xfr->task_transfer->masters, spec->host); if(xfr->task_transfer->scan_specific) { xfr->task_transfer->scan_target = NULL; xfr->task_transfer->scan_addr = NULL; if(xfr->task_transfer->scan_specific->list) xfr->task_transfer->scan_addr = xfr->task_transfer->scan_specific->list; return; } } /* no specific (notified) host to scan */ xfr->task_transfer->scan_specific = NULL; xfr->task_transfer->scan_addr = NULL; /* pick up first scan target */ xfr->task_transfer->scan_target = xfr->task_transfer->masters; if(xfr->task_transfer->scan_target && xfr->task_transfer-> scan_target->list) xfr->task_transfer->scan_addr = xfr->task_transfer->scan_target->list; } /** start the iteration of the task_probe list of masters */ static void xfr_probe_start_list(struct auth_xfer* xfr, struct auth_master* spec) { if(spec) { xfr->task_probe->scan_specific = find_master_by_host( xfr->task_probe->masters, spec->host); if(xfr->task_probe->scan_specific) { 
xfr->task_probe->scan_target = NULL; xfr->task_probe->scan_addr = NULL; if(xfr->task_probe->scan_specific->list) xfr->task_probe->scan_addr = xfr->task_probe->scan_specific->list; return; } } /* no specific (notified) host to scan */ xfr->task_probe->scan_specific = NULL; xfr->task_probe->scan_addr = NULL; /* pick up first scan target */ xfr->task_probe->scan_target = xfr->task_probe->masters; if(xfr->task_probe->scan_target && xfr->task_probe->scan_target->list) xfr->task_probe->scan_addr = xfr->task_probe->scan_target->list; } /** pick up the master that is being scanned right now, task_transfer */ static struct auth_master* xfr_transfer_current_master(struct auth_xfer* xfr) { if(xfr->task_transfer->scan_specific) return xfr->task_transfer->scan_specific; return xfr->task_transfer->scan_target; } /** pick up the master that is being scanned right now, task_probe */ static struct auth_master* xfr_probe_current_master(struct auth_xfer* xfr) { if(xfr->task_probe->scan_specific) return xfr->task_probe->scan_specific; return xfr->task_probe->scan_target; } /** true if at end of list, task_transfer */ static int xfr_transfer_end_of_list(struct auth_xfer* xfr) { return !xfr->task_transfer->scan_specific && !xfr->task_transfer->scan_target; } /** true if at end of list, task_probe */ static int xfr_probe_end_of_list(struct auth_xfer* xfr) { return !xfr->task_probe->scan_specific && !xfr->task_probe->scan_target; } /** move to next master in list, task_transfer */ static void xfr_transfer_nextmaster(struct auth_xfer* xfr) { if(!xfr->task_transfer->scan_specific && !xfr->task_transfer->scan_target) return; if(xfr->task_transfer->scan_addr) { xfr->task_transfer->scan_addr = xfr->task_transfer->scan_addr->next; if(xfr->task_transfer->scan_addr) return; } if(xfr->task_transfer->scan_specific) { xfr->task_transfer->scan_specific = NULL; xfr->task_transfer->scan_target = xfr->task_transfer->masters; if(xfr->task_transfer->scan_target && xfr->task_transfer-> scan_target->list) 
xfr->task_transfer->scan_addr = xfr->task_transfer->scan_target->list; return; } if(!xfr->task_transfer->scan_target) return; xfr->task_transfer->scan_target = xfr->task_transfer->scan_target->next; if(xfr->task_transfer->scan_target && xfr->task_transfer-> scan_target->list) xfr->task_transfer->scan_addr = xfr->task_transfer->scan_target->list; return; } /** move to next master in list, task_probe */ static void xfr_probe_nextmaster(struct auth_xfer* xfr) { if(!xfr->task_probe->scan_specific && !xfr->task_probe->scan_target) return; if(xfr->task_probe->scan_addr) { xfr->task_probe->scan_addr = xfr->task_probe->scan_addr->next; if(xfr->task_probe->scan_addr) return; } if(xfr->task_probe->scan_specific) { xfr->task_probe->scan_specific = NULL; xfr->task_probe->scan_target = xfr->task_probe->masters; if(xfr->task_probe->scan_target && xfr->task_probe-> scan_target->list) xfr->task_probe->scan_addr = xfr->task_probe->scan_target->list; return; } if(!xfr->task_probe->scan_target) return; xfr->task_probe->scan_target = xfr->task_probe->scan_target->next; if(xfr->task_probe->scan_target && xfr->task_probe-> scan_target->list) xfr->task_probe->scan_addr = xfr->task_probe->scan_target->list; return; } /** create SOA probe packet for xfr */ static void xfr_create_soa_probe_packet(struct auth_xfer* xfr, sldns_buffer* buf, uint16_t id) { struct query_info qinfo; memset(&qinfo, 0, sizeof(qinfo)); qinfo.qname = xfr->name; qinfo.qname_len = xfr->namelen; qinfo.qtype = LDNS_RR_TYPE_SOA; qinfo.qclass = xfr->dclass; qinfo_query_encode(buf, &qinfo); sldns_buffer_write_u16_at(buf, 0, id); } /** create IXFR/AXFR packet for xfr */ static void xfr_create_ixfr_packet(struct auth_xfer* xfr, sldns_buffer* buf, uint16_t id, struct auth_master* master) { struct query_info qinfo; uint32_t serial; int have_zone; have_zone = xfr->have_zone; serial = xfr->serial; memset(&qinfo, 0, sizeof(qinfo)); qinfo.qname = xfr->name; qinfo.qname_len = xfr->namelen; xfr->task_transfer->got_xfr_serial = 0; 
xfr->task_transfer->rr_scan_num = 0;
	xfr->task_transfer->incoming_xfr_serial = 0;
	xfr->task_transfer->on_ixfr_is_axfr = 0;
	/* default to IXFR, changed to AXFR below when IXFR is not usable */
	xfr->task_transfer->on_ixfr = 1;
	qinfo.qtype = LDNS_RR_TYPE_IXFR;
	if(!have_zone || xfr->task_transfer->ixfr_fail || !master->ixfr) {
		/* no zone to apply differences to, an earlier IXFR failure,
		 * or the master does not have ixfr set: ask for AXFR. The
		 * ixfr_fail flag is cleared here. */
		qinfo.qtype = LDNS_RR_TYPE_AXFR;
		xfr->task_transfer->ixfr_fail = 0;
		xfr->task_transfer->on_ixfr = 0;
	}

	qinfo.qclass = xfr->dclass;
	qinfo_query_encode(buf, &qinfo);
	sldns_buffer_write_u16_at(buf, 0, id);

	/* append serial for IXFR */
	if(qinfo.qtype == LDNS_RR_TYPE_IXFR) {
		/* continue writing at the end of the encoded query */
		size_t end = sldns_buffer_limit(buf);
		sldns_buffer_clear(buf);
		sldns_buffer_set_position(buf, end);
		/* auth section count 1 */
		sldns_buffer_write_u16_at(buf, LDNS_NSCOUNT_OFF, 1);
		/* write SOA */
		sldns_buffer_write_u8(buf, 0xC0); /* compressed ptr to qname */
		sldns_buffer_write_u8(buf, 0x0C);
		sldns_buffer_write_u16(buf, LDNS_RR_TYPE_SOA);
		sldns_buffer_write_u16(buf, qinfo.qclass);
		sldns_buffer_write_u32(buf, 0); /* ttl */
		/* 22 = two root names of 1 byte and five 32bit values */
		sldns_buffer_write_u16(buf, 22); /* rdata length */
		sldns_buffer_write_u8(buf, 0); /* . */
		sldns_buffer_write_u8(buf, 0); /* .
*/
		/* only the serial carries information, the rest is zero */
		sldns_buffer_write_u32(buf, serial); /* serial */
		sldns_buffer_write_u32(buf, 0); /* refresh */
		sldns_buffer_write_u32(buf, 0); /* retry */
		sldns_buffer_write_u32(buf, 0); /* expire */
		sldns_buffer_write_u32(buf, 0); /* minimum */
		sldns_buffer_flip(buf);
	}
}

/** check if returned packet is OK.
 * Checks the header (ID equal to xfr->task_probe->id, QR set, opcode QUERY,
 * rcode NOERROR, one question) and that the question has the zone name, the
 * given qtype and the zone class. If serial is not NULL, the first answer
 * record must have the zone name, qtype and class, and the serial is read
 * from its rdata. The position of pkt is moved while checking. */
static int
check_packet_ok(sldns_buffer* pkt, uint16_t qtype, struct auth_xfer* xfr,
	uint32_t* serial)
{
	/* parse to see if packet worked, valid reply */

	/* check serial number of SOA */
	if(sldns_buffer_limit(pkt) < LDNS_HEADER_SIZE)
		return 0;

	/* check ID */
	if(LDNS_ID_WIRE(sldns_buffer_begin(pkt)) != xfr->task_probe->id)
		return 0;

	/* check flag bits and rcode */
	if(!LDNS_QR_WIRE(sldns_buffer_begin(pkt)))
		return 0;
	if(LDNS_OPCODE_WIRE(sldns_buffer_begin(pkt)) != LDNS_PACKET_QUERY)
		return 0;
	if(LDNS_RCODE_WIRE(sldns_buffer_begin(pkt)) != LDNS_RCODE_NOERROR)
		return 0;

	/* check qname */
	if(LDNS_QDCOUNT(sldns_buffer_begin(pkt)) != 1)
		return 0;
	sldns_buffer_skip(pkt, LDNS_HEADER_SIZE);
	if(sldns_buffer_remaining(pkt) < xfr->namelen)
		return 0;
	if(query_dname_compare(sldns_buffer_current(pkt), xfr->name) != 0)
		return 0;
	sldns_buffer_skip(pkt, (ssize_t)xfr->namelen);

	/* check qtype, qclass */
	if(sldns_buffer_remaining(pkt) < 4)
		return 0;
	if(sldns_buffer_read_u16(pkt) != qtype)
		return 0;
	if(sldns_buffer_read_u16(pkt) != xfr->dclass)
		return 0;

	if(serial) {
		uint16_t rdlen;
		/* read serial number, from answer section SOA */
		if(LDNS_ANCOUNT(sldns_buffer_begin(pkt)) == 0)
			return 0;
		/* read from first record SOA record */
		if(sldns_buffer_remaining(pkt) < 1)
			return 0;
		if(dname_pkt_compare(pkt, sldns_buffer_current(pkt),
			xfr->name) != 0)
			return 0;
		if(!pkt_dname_len(pkt))
			return 0;
		/* type, class, ttl, rdatalen */
		if(sldns_buffer_remaining(pkt) < 4+4+2)
			return 0;
		if(sldns_buffer_read_u16(pkt) != qtype)
			return 0;
		if(sldns_buffer_read_u16(pkt) != xfr->dclass)
			return 0;
		sldns_buffer_skip(pkt, 4); /* ttl */
		rdlen = sldns_buffer_read_u16(pkt);
		if(sldns_buffer_remaining(pkt) < rdlen)
			return 0;
if(sldns_buffer_remaining(pkt) < 1)
			return 0;
		if(!pkt_dname_len(pkt)) /* first name in the soa rdata */
			return 0;
		if(sldns_buffer_remaining(pkt) < 1)
			return 0;
		if(!pkt_dname_len(pkt)) /* second name in the soa rdata */
			return 0;
		/* five 32bit values follow the names, serial is the first */
		if(sldns_buffer_remaining(pkt) < 20)
			return 0;
		*serial = sldns_buffer_read_u32(pkt);
	}
	return 1;
}

/** read one line from chunks into buffer at current position.
 * Bytes are copied up to and including the first newline, continuing into
 * the next chunks if needed. Returns 1 if a newline was copied, or if the
 * chunks ran out after some text was seen. Returns 0 if there is no text
 * left, or if fewer than 2 bytes remain in buf (line too long). */
static int
chunkline_get_line(struct auth_chunk** chunk, size_t* chunk_pos,
	sldns_buffer* buf)
{
	int readsome = 0;
	while(*chunk) {
		/* more text in this chunk? */
		if(*chunk_pos < (*chunk)->len) {
			readsome = 1;
			while(*chunk_pos < (*chunk)->len) {
				char c = (char)((*chunk)->data[*chunk_pos]);
				(*chunk_pos)++;
				if(sldns_buffer_remaining(buf) < 2) {
					/* buffer too short */
					verbose(VERB_ALGO, "http chunkline, "
						"line too long");
					return 0;
				}
				sldns_buffer_write_u8(buf, (uint8_t)c);
				if(c == '\n') {
					/* we are done */
					return 1;
				}
			}
		}
		/* move to next chunk */
		*chunk = (*chunk)->next;
		*chunk_pos = 0;
	}
	/* no more text */
	if(readsome) return 1;
	return 0;
}

/** count number of open and closed parenthesis in a chunkline */
static int
chunkline_count_parens(sldns_buffer* buf, size_t start)
{
	size_t end = sldns_buffer_position(buf);
	size_t i;
	int count = 0;
	int squote = 0, dquote = 0;
	/* NOTE(review): the text from the loop condition below up to the
	 * "while(parens > 0)" loop of chunkline_get_line_collated appears
	 * to have been lost in extraction; restore it from the upstream
	 * file. The remaining tokens are kept as found. */
	for(i=start; i 0) {
		chunkline_remove_trailcomment(buf, pos);
		pos = sldns_buffer_position(buf);
		if(!chunkline_get_line(chunk, chunk_pos, buf)) {
			/* zero terminate what was read; at the last
			 * position if the buffer is completely full */
			if(sldns_buffer_position(buf) < sldns_buffer_limit(buf))
				sldns_buffer_write_u8_at(buf, sldns_buffer_position(buf), 0);
			else sldns_buffer_write_u8_at(buf, sldns_buffer_position(buf)-1, 0);
			sldns_buffer_flip(buf);
			return 0;
		}
		parens += chunkline_count_parens(buf, pos);
	}

	if(sldns_buffer_remaining(buf) < 1) {
		verbose(VERB_ALGO, "http chunkline: "
			"line too long");
		return 0;
	}
	/* zero terminate the collated line */
	sldns_buffer_write_u8_at(buf, sldns_buffer_position(buf), 0);
	sldns_buffer_flip(buf);
	return 1;
}

/** process $ORIGIN for http */
static int
http_parse_origin(sldns_buffer* buf, struct sldns_file_parse_state* pstate)
{
	char* line =
(char*)sldns_buffer_begin(buf); if(strncmp(line, "$ORIGIN", 7) == 0 && isspace((unsigned char)line[7])) { int s; pstate->origin_len = sizeof(pstate->origin); s = sldns_str2wire_dname_buf(sldns_strip_ws(line+8), pstate->origin, &pstate->origin_len); if(s) pstate->origin_len = 0; return 1; } return 0; } /** process $TTL for http */ static int http_parse_ttl(sldns_buffer* buf, struct sldns_file_parse_state* pstate) { char* line = (char*)sldns_buffer_begin(buf); if(strncmp(line, "$TTL", 4) == 0 && isspace((unsigned char)line[4])) { const char* end = NULL; pstate->default_ttl = sldns_str2period( sldns_strip_ws(line+5), &end); return 1; } return 0; } /** find noncomment RR line in chunks, collates lines if ( ) format */ static int chunkline_non_comment_RR(struct auth_chunk** chunk, size_t* chunk_pos, sldns_buffer* buf, struct sldns_file_parse_state* pstate) { while(chunkline_get_line_collated(chunk, chunk_pos, buf)) { if(chunkline_is_comment_line_or_empty(buf)) { /* a comment, go to next line */ continue; } if(http_parse_origin(buf, pstate)) { continue; /* $ORIGIN has been handled */ } if(http_parse_ttl(buf, pstate)) { continue; /* $TTL has been handled */ } return 1; } /* no noncomments, fail */ return 0; } /** check syntax of chunklist zonefile, parse first RR, return false on * failure and return a string in the scratch buffer (first RR string) * on failure. 
*/
static int
http_zonefile_syntax_check(struct auth_xfer* xfr, sldns_buffer* buf)
{
	uint8_t rr[LDNS_RR_BUF_SIZE];
	size_t rr_len, dname_len = 0;
	struct sldns_file_parse_state pstate;
	struct auth_chunk* chunk;
	size_t chunk_pos;
	int e;
	memset(&pstate, 0, sizeof(pstate));
	pstate.default_ttl = 3600;
	/* the zone name is the initial origin, if it fits */
	if(xfr->namelen < sizeof(pstate.origin)) {
		pstate.origin_len = xfr->namelen;
		memmove(pstate.origin, xfr->name, xfr->namelen);
	}
	chunk = xfr->task_transfer->chunks_first;
	chunk_pos = 0;
	/* get the first line that is an RR into buf */
	if(!chunkline_non_comment_RR(&chunk, &chunk_pos, buf, &pstate)) {
		return 0;
	}
	rr_len = sizeof(rr);
	e=sldns_str2wire_rr_buf((char*)sldns_buffer_begin(buf), rr, &rr_len,
		&dname_len, pstate.default_ttl,
		pstate.origin_len?pstate.origin:NULL, pstate.origin_len,
		pstate.prev_rr_len?pstate.prev_rr:NULL, pstate.prev_rr_len);
	if(e != 0) {
		log_err("parse failure on first RR[%d]: %s",
			LDNS_WIREPARSE_OFFSET(e),
			sldns_get_errorstr_parse(LDNS_WIREPARSE_ERROR(e)));
		return 0;
	}
	/* check that class is correct */
	if(sldns_wirerr_get_class(rr, rr_len, dname_len) != xfr->dclass) {
		log_err("parse failure: first record in downloaded zonefile "
			"from wrong RR class");
		return 0;
	}
	return 1;
}

/** sum sizes of chunklist: total of the len of all chunks in the list */
static size_t
chunklist_sum(struct auth_chunk* list)
{
	struct auth_chunk* p;
	size_t s = 0;
	for(p=list; p; p=p->next) {
		s += p->len;
	}
	return s;
}

/** remove newlines from collated line */
static void
chunkline_newline_removal(sldns_buffer* buf)
{
	size_t i, end=sldns_buffer_limit(buf);
	/* NOTE(review): the text from the loop condition below up to the
	 * sldns_str2wire_rr_buf() arguments of http_parse_add_rr appears
	 * to have been lost in extraction; restore it from the upstream
	 * file. The remaining tokens are kept as found. */
	for(i=0; idefault_ttl,
		pstate->origin_len?pstate->origin:NULL, pstate->origin_len,
		pstate->prev_rr_len?pstate->prev_rr:NULL, pstate->prev_rr_len);
	if(e != 0) {
		log_err("%s/%s parse failure RR[%d]: %s in '%s'",
			xfr->task_transfer->master->host,
			xfr->task_transfer->master->file,
			LDNS_WIREPARSE_OFFSET(e),
			sldns_get_errorstr_parse(LDNS_WIREPARSE_ERROR(e)),
			line);
		return 0;
	}
	if(rr_len == 0)
		return 1; /* empty line or so */

	/* set prev */
	if(dname_len < sizeof(pstate->prev_rr)) {
		memmove(pstate->prev_rr, rr, dname_len);
pstate->prev_rr_len = dname_len; } return az_insert_rr(z, rr, rr_len, dname_len, NULL); } /** RR list iterator, returns RRs from answer section one by one from the * dns packets in the chunklist */ static void chunk_rrlist_start(struct auth_xfer* xfr, struct auth_chunk** rr_chunk, int* rr_num, size_t* rr_pos) { *rr_chunk = xfr->task_transfer->chunks_first; *rr_num = 0; *rr_pos = 0; } /** RR list iterator, see if we are at the end of the list */ static int chunk_rrlist_end(struct auth_chunk* rr_chunk, int rr_num) { while(rr_chunk) { if(rr_chunk->len < LDNS_HEADER_SIZE) return 1; if(rr_num < (int)LDNS_ANCOUNT(rr_chunk->data)) return 0; /* no more RRs in this chunk */ /* continue with next chunk, see if it has RRs */ rr_chunk = rr_chunk->next; rr_num = 0; } return 1; } /** RR list iterator, move to next RR */ static void chunk_rrlist_gonext(struct auth_chunk** rr_chunk, int* rr_num, size_t* rr_pos, size_t rr_nextpos) { /* already at end of chunks? */ if(!*rr_chunk) return; /* move within this chunk */ if((*rr_chunk)->len >= LDNS_HEADER_SIZE && (*rr_num)+1 < (int)LDNS_ANCOUNT((*rr_chunk)->data)) { (*rr_num) += 1; *rr_pos = rr_nextpos; return; } /* no more RRs in this chunk */ /* continue with next chunk, see if it has RRs */ if(*rr_chunk) *rr_chunk = (*rr_chunk)->next; while(*rr_chunk) { *rr_num = 0; *rr_pos = 0; if((*rr_chunk)->len >= LDNS_HEADER_SIZE && LDNS_ANCOUNT((*rr_chunk)->data) > 0) { return; } *rr_chunk = (*rr_chunk)->next; } } /** RR iterator, get current RR information, false on parse error */ static int chunk_rrlist_get_current(struct auth_chunk* rr_chunk, int rr_num, size_t rr_pos, uint8_t** rr_dname, uint16_t* rr_type, uint16_t* rr_class, uint32_t* rr_ttl, uint16_t* rr_rdlen, uint8_t** rr_rdata, size_t* rr_nextpos) { sldns_buffer pkt; /* integrity checks on position */ if(!rr_chunk) return 0; if(rr_chunk->len < LDNS_HEADER_SIZE) return 0; if(rr_num >= (int)LDNS_ANCOUNT(rr_chunk->data)) return 0; if(rr_pos >= rr_chunk->len) return 0; /* fetch rr 
information */ sldns_buffer_init_frm_data(&pkt, rr_chunk->data, rr_chunk->len); if(rr_pos == 0) { size_t i; /* skip question section */ sldns_buffer_set_position(&pkt, LDNS_HEADER_SIZE); for(i=0; idata); i++) { if(pkt_dname_len(&pkt) == 0) return 0; if(sldns_buffer_remaining(&pkt) < 4) return 0; sldns_buffer_skip(&pkt, 4); /* type and class */ } } else { sldns_buffer_set_position(&pkt, rr_pos); } *rr_dname = sldns_buffer_current(&pkt); if(pkt_dname_len(&pkt) == 0) return 0; if(sldns_buffer_remaining(&pkt) < 10) return 0; *rr_type = sldns_buffer_read_u16(&pkt); *rr_class = sldns_buffer_read_u16(&pkt); *rr_ttl = sldns_buffer_read_u32(&pkt); *rr_rdlen = sldns_buffer_read_u16(&pkt); if(sldns_buffer_remaining(&pkt) < (*rr_rdlen)) return 0; *rr_rdata = sldns_buffer_current(&pkt); sldns_buffer_skip(&pkt, (ssize_t)(*rr_rdlen)); *rr_nextpos = sldns_buffer_position(&pkt); return 1; } /** print log message where we are in parsing the zone transfer */ static void log_rrlist_position(const char* label, struct auth_chunk* rr_chunk, uint8_t* rr_dname, uint16_t rr_type, size_t rr_counter) { sldns_buffer pkt; size_t dlen; uint8_t buf[256]; char str[256]; char typestr[32]; sldns_buffer_init_frm_data(&pkt, rr_chunk->data, rr_chunk->len); sldns_buffer_set_position(&pkt, (size_t)(rr_dname - sldns_buffer_begin(&pkt))); if((dlen=pkt_dname_len(&pkt)) == 0) return; if(dlen >= sizeof(buf)) return; dname_pkt_copy(&pkt, buf, rr_dname); dname_str(buf, str); (void)sldns_wire2str_type_buf(rr_type, typestr, sizeof(typestr)); verbose(VERB_ALGO, "%s at[%d] %s %s", label, (int)rr_counter, str, typestr); } /** check that start serial is OK for ixfr. 
we are at rr_counter == 0,
 * and we are going to check rr_counter == 1 (has to be type SOA) serial.
 * The iterator arguments are passed by value, so moving them forward
 * here does not change the position of the caller.
 * transfer_serial is the serial of the first SOA of the transfer,
 * xfr_serial is the serial of the zone in memory. */
static int
ixfr_start_serial(struct auth_chunk* rr_chunk, int rr_num, size_t rr_pos,
	uint8_t* rr_dname, uint16_t rr_type, uint16_t rr_class,
	uint32_t rr_ttl, uint16_t rr_rdlen, uint8_t* rr_rdata,
	size_t rr_nextpos, uint32_t transfer_serial, uint32_t xfr_serial)
{
	uint32_t startserial;
	/* move forward on RR */
	chunk_rrlist_gonext(&rr_chunk, &rr_num, &rr_pos, rr_nextpos);
	if(chunk_rrlist_end(rr_chunk, rr_num)) {
		/* no second SOA */
		verbose(VERB_OPS, "IXFR has no second SOA record");
		return 0;
	}
	if(!chunk_rrlist_get_current(rr_chunk, rr_num, rr_pos,
		&rr_dname, &rr_type, &rr_class, &rr_ttl, &rr_rdlen,
		&rr_rdata, &rr_nextpos)) {
		verbose(VERB_OPS, "IXFR cannot parse second SOA record");
		/* failed to parse RR */
		return 0;
	}
	if(rr_type != LDNS_RR_TYPE_SOA) {
		verbose(VERB_OPS, "IXFR second record is not type SOA");
		return 0;
	}
	if(rr_rdlen < 22) {
		verbose(VERB_OPS, "IXFR, second SOA has short rdlength");
		return 0; /* bad SOA rdlen */
	}
	/* the serial starts 20 bytes before the end of the rdata */
	startserial = sldns_read_uint32(rr_rdata+rr_rdlen-20);
	if(startserial == transfer_serial) {
		/* empty AXFR, not an IXFR */
		verbose(VERB_OPS, "IXFR second serial same as first");
		return 0;
	}
	if(startserial != xfr_serial) {
		/* wrong start serial, it does not match the serial in
		 * memory */
		verbose(VERB_OPS, "IXFR is from serial %u to %u but %u "
			"in memory, rejecting the zone transfer",
			(unsigned)startserial, (unsigned)transfer_serial,
			(unsigned)xfr_serial);
		return 0;
	}
	/* everything OK in second SOA serial */
	return 1;
}

/** apply IXFR to zone in memory. z is locked.
false on failure(mallocfail).
 * Also false if the transfer is malformed, or if it did not apply cleanly
 * (an RR was removed that does not exist or added that already exists).
 * In that last case the changes have been made to the zone and xfr->serial
 * may have been set to the new serial. */
static int
apply_ixfr(struct auth_xfer* xfr, struct auth_zone* z,
	struct sldns_buffer* scratch_buffer)
{
	struct auth_chunk* rr_chunk;
	int rr_num;
	size_t rr_pos;
	uint8_t* rr_dname, *rr_rdata;
	uint16_t rr_type, rr_class, rr_rdlen;
	uint32_t rr_ttl;
	size_t rr_nextpos;
	/* number of times the SOA with the new serial has been seen */
	int have_transfer_serial = 0;
	uint32_t transfer_serial = 0;
	size_t rr_counter = 0;
	/* true while in a delete part, false while in an add part */
	int delmode = 0;
	int softfail = 0;

	/* start RR iterator over chunklist of packets */
	chunk_rrlist_start(xfr, &rr_chunk, &rr_num, &rr_pos);
	while(!chunk_rrlist_end(rr_chunk, rr_num)) {
		if(!chunk_rrlist_get_current(rr_chunk, rr_num, rr_pos,
			&rr_dname, &rr_type, &rr_class, &rr_ttl, &rr_rdlen,
			&rr_rdata, &rr_nextpos)) {
			/* failed to parse RR */
			return 0;
		}
		if(verbosity>=7) log_rrlist_position("apply ixfr",
			rr_chunk, rr_dname, rr_type, rr_counter);
		/* twiddle add/del mode and check for start and end */
		if(rr_counter == 0 && rr_type != LDNS_RR_TYPE_SOA)
			return 0;
		if(rr_counter == 1 && rr_type != LDNS_RR_TYPE_SOA) {
			/* this is an AXFR returned from the IXFR master */
			/* but that should already have been detected, by
			 * on_ixfr_is_axfr */
			return 0;
		}
		if(rr_type == LDNS_RR_TYPE_SOA) {
			uint32_t serial;
			if(rr_rdlen < 22) return 0; /* bad SOA rdlen */
			serial = sldns_read_uint32(rr_rdata+rr_rdlen-20);
			if(have_transfer_serial == 0) {
				/* first SOA: it has the new serial */
				have_transfer_serial = 1;
				transfer_serial = serial;
				delmode = 1; /* gets negated below */
				/* check second RR before going any further */
				if(!ixfr_start_serial(rr_chunk, rr_num, rr_pos,
					rr_dname, rr_type, rr_class, rr_ttl,
					rr_rdlen, rr_rdata, rr_nextpos,
					transfer_serial, xfr->serial)) {
					return 0;
				}
			} else if(transfer_serial == serial) {
				have_transfer_serial++;
				if(rr_counter == 1) {
					/* empty AXFR, with SOA; SOA; */
					/* should have been detected by
					 * on_ixfr_is_axfr */
					return 0;
				}
				if(have_transfer_serial == 3) {
					/* see serial three times for end */
					/* eg. IXFR:
					 *  SOA 3 start
					 *  SOA 1 second RR, followed by del
					 *  SOA 2 followed by add
					 *  SOA 2 followed by del
					 *  SOA 3 followed by add
					 *  SOA 3 end */
					/* ended by SOA record */
					xfr->serial = transfer_serial;
					break;
				}
			}
			/* twiddle add/del mode */
			/* switch from delete part to add part and back again
			 * just before the soa, it gets deleted and added too
			 * this means we switch to delete mode for the final
			 * SOA(so skip that one) */
			delmode = !delmode;
		}
		/* process this RR */
		/* if the RR is deleted twice or added twice, then we
		 * softfail, and continue with the rest of the IXFR, so
		 * that we serve something fairly nice during the refetch */
		if(verbosity>=7) log_rrlist_position((delmode?"del":"add"),
			rr_chunk, rr_dname, rr_type, rr_counter);
		if(delmode) {
			/* delete this RR */
			int nonexist = 0;
			if(!az_remove_rr_decompress(z, rr_chunk->data,
				rr_chunk->len, scratch_buffer, rr_dname,
				rr_type, rr_class, rr_ttl, rr_rdata, rr_rdlen,
				&nonexist)) {
				/* failed, malloc error or so */
				return 0;
			}
			if(nonexist) {
				/* it was removal of a nonexisting RR */
				if(verbosity>=4) log_rrlist_position(
					"IXFR error nonexistent RR",
					rr_chunk, rr_dname, rr_type, rr_counter);
				softfail = 1;
			}
		} else if(rr_counter != 0) {
			/* skip first SOA RR for addition, it is added in
			 * the addition part near the end of the ixfr, when
			 * that serial is seen the second time. */
			int duplicate = 0;
			/* add this RR */
			if(!az_insert_rr_decompress(z, rr_chunk->data,
				rr_chunk->len, scratch_buffer, rr_dname,
				rr_type, rr_class, rr_ttl, rr_rdata, rr_rdlen,
				&duplicate)) {
				/* failed, malloc error or so */
				return 0;
			}
			if(duplicate) {
				/* it was a duplicate */
				if(verbosity>=4) log_rrlist_position(
					"IXFR error duplicate RR",
					rr_chunk, rr_dname, rr_type, rr_counter);
				softfail = 1;
			}
		}

		rr_counter++;
		chunk_rrlist_gonext(&rr_chunk, &rr_num, &rr_pos, rr_nextpos);
	}
	if(softfail) {
		verbose(VERB_ALGO, "IXFR did not apply cleanly, fetching full zone");
		return 0;
	}
	return 1;
}

/** apply AXFR to zone in memory. z is locked.
false on failure(mallocfail).
 * Also false if an RR cannot be parsed, the first SOA has a short rdata,
 * or there is no closing SOA. The old zone data is deleted before the
 * RRs are inserted, so on failure the zone is left empty or partially
 * filled, with xfr->have_zone and xfr->serial set to 0. */
static int
apply_axfr(struct auth_xfer* xfr, struct auth_zone* z,
	struct sldns_buffer* scratch_buffer)
{
	struct auth_chunk* rr_chunk;
	int rr_num;
	size_t rr_pos;
	uint8_t* rr_dname, *rr_rdata;
	uint16_t rr_type, rr_class, rr_rdlen;
	uint32_t rr_ttl;
	uint32_t serial = 0;
	size_t rr_nextpos;
	size_t rr_counter = 0;
	int have_end_soa = 0;

	/* clear the data tree */
	traverse_postorder(&z->data, auth_data_del, NULL);
	rbtree_init(&z->data, &auth_data_cmp);
	/* clear the RPZ policies */
	if(z->rpz)
		rpz_clear(z->rpz);

	xfr->have_zone = 0;
	xfr->serial = 0;

	/* insert all RRs in to the zone */
	/* insert the SOA only once, skip the last one */
	/* start RR iterator over chunklist of packets */
	chunk_rrlist_start(xfr, &rr_chunk, &rr_num, &rr_pos);
	while(!chunk_rrlist_end(rr_chunk, rr_num)) {
		if(!chunk_rrlist_get_current(rr_chunk, rr_num, rr_pos,
			&rr_dname, &rr_type, &rr_class, &rr_ttl, &rr_rdlen,
			&rr_rdata, &rr_nextpos)) {
			/* failed to parse RR */
			return 0;
		}
		if(verbosity>=7) log_rrlist_position("apply_axfr",
			rr_chunk, rr_dname, rr_type, rr_counter);
		if(rr_type == LDNS_RR_TYPE_SOA) {
			if(rr_counter != 0) {
				/* end of the axfr */
				have_end_soa = 1;
				break;
			}
			if(rr_rdlen < 22) return 0; /* bad SOA rdlen */
			/* serial starts 20 bytes before the rdata end */
			serial = sldns_read_uint32(rr_rdata+rr_rdlen-20);
		}

		/* add this RR */
		if(!az_insert_rr_decompress(z, rr_chunk->data, rr_chunk->len,
			scratch_buffer, rr_dname, rr_type, rr_class, rr_ttl,
			rr_rdata, rr_rdlen, NULL)) {
			/* failed, malloc error or so */
			return 0;
		}

		rr_counter++;
		chunk_rrlist_gonext(&rr_chunk, &rr_num, &rr_pos, rr_nextpos);
	}
	if(!have_end_soa) {
		log_err("no end SOA record for AXFR");
		return 0;
	}

	xfr->serial = serial;
	xfr->have_zone = 1;
	return 1;
}

/** apply HTTP to zone in memory. z is locked. false on failure(mallocfail) */
static int
apply_http(struct auth_xfer* xfr, struct auth_zone* z,
	struct sldns_buffer* scratch_buffer)
{
	/* parse data in chunks */
	/* parse RR's and read into memory.
ignore $INCLUDE from the * downloaded file*/ struct sldns_file_parse_state pstate; struct auth_chunk* chunk; size_t chunk_pos; memset(&pstate, 0, sizeof(pstate)); pstate.default_ttl = 3600; if(xfr->namelen < sizeof(pstate.origin)) { pstate.origin_len = xfr->namelen; memmove(pstate.origin, xfr->name, xfr->namelen); } if(verbosity >= VERB_ALGO) verbose(VERB_ALGO, "http download %s of size %d", xfr->task_transfer->master->file, (int)chunklist_sum(xfr->task_transfer->chunks_first)); if(xfr->task_transfer->chunks_first && verbosity >= VERB_ALGO) { char preview[1024]; if(xfr->task_transfer->chunks_first->len+1 > sizeof(preview)) { memmove(preview, xfr->task_transfer->chunks_first->data, sizeof(preview)-1); preview[sizeof(preview)-1]=0; } else { memmove(preview, xfr->task_transfer->chunks_first->data, xfr->task_transfer->chunks_first->len); preview[xfr->task_transfer->chunks_first->len]=0; } log_info("auth zone http downloaded content preview: %s", preview); } /* perhaps a little syntax check before we try to apply the data? 
*/ if(!http_zonefile_syntax_check(xfr, scratch_buffer)) { log_err("http download %s/%s does not contain a zonefile, " "but got '%s'", xfr->task_transfer->master->host, xfr->task_transfer->master->file, sldns_buffer_begin(scratch_buffer)); return 0; } /* clear the data tree */ traverse_postorder(&z->data, auth_data_del, NULL); rbtree_init(&z->data, &auth_data_cmp); /* clear the RPZ policies */ if(z->rpz) rpz_clear(z->rpz); xfr->have_zone = 0; xfr->serial = 0; chunk = xfr->task_transfer->chunks_first; chunk_pos = 0; pstate.lineno = 0; while(chunkline_get_line_collated(&chunk, &chunk_pos, scratch_buffer)) { /* process this line */ pstate.lineno++; chunkline_newline_removal(scratch_buffer); if(chunkline_is_comment_line_or_empty(scratch_buffer)) { continue; } /* parse line and add RR */ if(http_parse_origin(scratch_buffer, &pstate)) { continue; /* $ORIGIN has been handled */ } if(http_parse_ttl(scratch_buffer, &pstate)) { continue; /* $TTL has been handled */ } if(!http_parse_add_rr(xfr, z, scratch_buffer, &pstate)) { verbose(VERB_ALGO, "error parsing line [%s:%d] %s", xfr->task_transfer->master->file, pstate.lineno, sldns_buffer_begin(scratch_buffer)); return 0; } } return 1; } /** write http chunks to zonefile to create downloaded file */ static int auth_zone_write_chunks(struct auth_xfer* xfr, const char* fname) { FILE* out; struct auth_chunk* p; out = fopen(fname, "w"); if(!out) { log_err("could not open %s: %s", fname, strerror(errno)); return 0; } for(p = xfr->task_transfer->chunks_first; p ; p = p->next) { if(!write_out(out, (char*)p->data, p->len)) { log_err("could not write http download to %s", fname); fclose(out); return 0; } } fclose(out); return 1; } /** write to zonefile after zone has been updated */ static void xfr_write_after_update(struct auth_xfer* xfr, struct module_env* env) { struct config_file* cfg = env->cfg; struct auth_zone* z; char tmpfile[1024]; char* zfilename; lock_basic_unlock(&xfr->lock); /* get lock again, so it is a readlock and 
concurrently queries * can be answered */ lock_rw_rdlock(&env->auth_zones->lock); z = auth_zone_find(env->auth_zones, xfr->name, xfr->namelen, xfr->dclass); if(!z) { lock_rw_unlock(&env->auth_zones->lock); /* the zone is gone, ignore xfr results */ lock_basic_lock(&xfr->lock); return; } lock_rw_rdlock(&z->lock); lock_basic_lock(&xfr->lock); lock_rw_unlock(&env->auth_zones->lock); if(z->zonefile == NULL || z->zonefile[0] == 0) { lock_rw_unlock(&z->lock); /* no write needed, no zonefile set */ return; } zfilename = z->zonefile; if(cfg->chrootdir && cfg->chrootdir[0] && strncmp(zfilename, cfg->chrootdir, strlen(cfg->chrootdir)) == 0) zfilename += strlen(cfg->chrootdir); if(verbosity >= VERB_ALGO) { char nm[255+1]; dname_str(z->name, nm); verbose(VERB_ALGO, "write zonefile %s for %s", zfilename, nm); } /* write to tempfile first */ if((size_t)strlen(zfilename) + 16 > sizeof(tmpfile)) { verbose(VERB_ALGO, "tmpfilename too long, cannot update " " zonefile %s", zfilename); lock_rw_unlock(&z->lock); return; } snprintf(tmpfile, sizeof(tmpfile), "%s.tmp%u", zfilename, (unsigned)getpid()); if(xfr->task_transfer->master->http) { /* use the stored chunk list to write them */ if(!auth_zone_write_chunks(xfr, tmpfile)) { unlink(tmpfile); lock_rw_unlock(&z->lock); return; } } else if(!auth_zone_write_file(z, tmpfile)) { unlink(tmpfile); lock_rw_unlock(&z->lock); return; } if(rename(tmpfile, zfilename) < 0) { log_err("could not rename(%s, %s): %s", tmpfile, zfilename, strerror(errno)); unlink(tmpfile); lock_rw_unlock(&z->lock); return; } lock_rw_unlock(&z->lock); } /** process chunk list and update zone in memory, * return false if it did not work */ static int xfr_process_chunk_list(struct auth_xfer* xfr, struct module_env* env, int* ixfr_fail) { struct auth_zone* z; /* obtain locks and structures */ /* release xfr lock, then, while holding az->lock grab both * z->lock and xfr->lock */ lock_basic_unlock(&xfr->lock); lock_rw_rdlock(&env->auth_zones->lock); z = 
auth_zone_find(env->auth_zones, xfr->name, xfr->namelen, xfr->dclass); if(!z) { lock_rw_unlock(&env->auth_zones->lock); /* the zone is gone, ignore xfr results */ lock_basic_lock(&xfr->lock); return 0; } lock_rw_wrlock(&z->lock); lock_basic_lock(&xfr->lock); lock_rw_unlock(&env->auth_zones->lock); /* apply data */ if(xfr->task_transfer->master->http) { if(!apply_http(xfr, z, env->scratch_buffer)) { lock_rw_unlock(&z->lock); verbose(VERB_ALGO, "http from %s: could not store data", xfr->task_transfer->master->host); return 0; } } else if(xfr->task_transfer->on_ixfr && !xfr->task_transfer->on_ixfr_is_axfr) { if(!apply_ixfr(xfr, z, env->scratch_buffer)) { lock_rw_unlock(&z->lock); verbose(VERB_ALGO, "xfr from %s: could not store IXFR" " data", xfr->task_transfer->master->host); *ixfr_fail = 1; return 0; } } else { if(!apply_axfr(xfr, z, env->scratch_buffer)) { lock_rw_unlock(&z->lock); verbose(VERB_ALGO, "xfr from %s: could not store AXFR" " data", xfr->task_transfer->master->host); return 0; } } xfr->zone_expired = 0; z->zone_expired = 0; if(!xfr_find_soa(z, xfr)) { lock_rw_unlock(&z->lock); verbose(VERB_ALGO, "xfr from %s: no SOA in zone after update" " (or malformed RR)", xfr->task_transfer->master->host); return 0; } if(xfr->have_zone) xfr->lease_time = *env->now; if(z->rpz) rpz_finish_config(z->rpz); /* unlock */ lock_rw_unlock(&z->lock); if(verbosity >= VERB_QUERY && xfr->have_zone) { char zname[256]; dname_str(xfr->name, zname); verbose(VERB_QUERY, "auth zone %s updated to serial %u", zname, (unsigned)xfr->serial); } /* see if we need to write to a zonefile */ xfr_write_after_update(xfr, env); return 1; } /** disown task_transfer. 
caller must hold xfr.lock */ static void xfr_transfer_disown(struct auth_xfer* xfr) { /* remove timer (from this worker's event base) */ comm_timer_delete(xfr->task_transfer->timer); xfr->task_transfer->timer = NULL; /* remove the commpoint */ comm_point_delete(xfr->task_transfer->cp); xfr->task_transfer->cp = NULL; /* we don't own this item anymore */ xfr->task_transfer->worker = NULL; xfr->task_transfer->env = NULL; } /** lookup a host name for its addresses, if needed */ static int xfr_transfer_lookup_host(struct auth_xfer* xfr, struct module_env* env) { struct sockaddr_storage addr; socklen_t addrlen = 0; struct auth_master* master = xfr->task_transfer->lookup_target; struct query_info qinfo; uint16_t qflags = BIT_RD; uint8_t dname[LDNS_MAX_DOMAINLEN+1]; struct edns_data edns; sldns_buffer* buf = env->scratch_buffer; if(!master) return 0; if(extstrtoaddr(master->host, &addr, &addrlen)) { /* not needed, host is in IP addr format */ return 0; } if(master->allow_notify) return 0; /* allow-notifies are not transferred from, no lookup is needed */ /* use mesh_new_callback to probe for non-addr hosts, * and then wait for them to be looked up (in cache, or query) */ qinfo.qname_len = sizeof(dname); if(sldns_str2wire_dname_buf(master->host, dname, &qinfo.qname_len) != 0) { log_err("cannot parse host name of master %s", master->host); return 0; } qinfo.qname = dname; qinfo.qclass = xfr->dclass; qinfo.qtype = LDNS_RR_TYPE_A; if(xfr->task_transfer->lookup_aaaa) qinfo.qtype = LDNS_RR_TYPE_AAAA; qinfo.local_alias = NULL; if(verbosity >= VERB_ALGO) { char buf1[512]; char buf2[LDNS_MAX_DOMAINLEN+1]; dname_str(xfr->name, buf2); snprintf(buf1, sizeof(buf1), "auth zone %s: master lookup" " for task_transfer", buf2); log_query_info(VERB_ALGO, buf1, &qinfo); } edns.edns_present = 1; edns.ext_rcode = 0; edns.edns_version = 0; edns.bits = EDNS_DO; edns.opt_list = NULL; if(sldns_buffer_capacity(buf) < 65535) edns.udp_size = (uint16_t)sldns_buffer_capacity(buf); else edns.udp_size = 
65535; /* unlock xfr during mesh_new_callback() because the callback can be * called straight away */ lock_basic_unlock(&xfr->lock); if(!mesh_new_callback(env->mesh, &qinfo, qflags, &edns, buf, 0, &auth_xfer_transfer_lookup_callback, xfr)) { lock_basic_lock(&xfr->lock); log_err("out of memory lookup up master %s", master->host); return 0; } lock_basic_lock(&xfr->lock); return 1; } /** initiate TCP to the target and fetch zone. * returns true if that was successfully started, and timeout setup. */ static int xfr_transfer_init_fetch(struct auth_xfer* xfr, struct module_env* env) { struct sockaddr_storage addr; socklen_t addrlen = 0; struct auth_master* master = xfr->task_transfer->master; char *auth_name = NULL; struct timeval t; int timeout; if(!master) return 0; if(master->allow_notify) return 0; /* only for notify */ /* get master addr */ if(xfr->task_transfer->scan_addr) { addrlen = xfr->task_transfer->scan_addr->addrlen; memmove(&addr, &xfr->task_transfer->scan_addr->addr, addrlen); } else { if(!authextstrtoaddr(master->host, &addr, &addrlen, &auth_name)) { /* the ones that are not in addr format are supposed * to be looked up. 
The lookup has failed however, * so skip them */ char zname[255+1]; dname_str(xfr->name, zname); log_err("%s: failed lookup, cannot transfer from master %s", zname, master->host); return 0; } } /* remove previous TCP connection (if any) */ if(xfr->task_transfer->cp) { comm_point_delete(xfr->task_transfer->cp); xfr->task_transfer->cp = NULL; } if(!xfr->task_transfer->timer) { xfr->task_transfer->timer = comm_timer_create(env->worker_base, auth_xfer_transfer_timer_callback, xfr); if(!xfr->task_transfer->timer) { log_err("malloc failure"); return 0; } } timeout = AUTH_TRANSFER_TIMEOUT; #ifndef S_SPLINT_S t.tv_sec = timeout/1000; t.tv_usec = (timeout%1000)*1000; #endif if(master->http) { /* perform http fetch */ /* store http port number into sockaddr, * unless someone used unbound's host@port notation */ xfr->task_transfer->on_ixfr = 0; if(strchr(master->host, '@') == NULL) sockaddr_store_port(&addr, addrlen, master->port); xfr->task_transfer->cp = outnet_comm_point_for_http( env->outnet, auth_xfer_transfer_http_callback, xfr, &addr, addrlen, -1, master->ssl, master->host, master->file); if(!xfr->task_transfer->cp) { char zname[255+1], as[256]; dname_str(xfr->name, zname); addr_to_str(&addr, addrlen, as, sizeof(as)); verbose(VERB_ALGO, "cannot create http cp " "connection for %s to %s", zname, as); return 0; } comm_timer_set(xfr->task_transfer->timer, &t); if(verbosity >= VERB_ALGO) { char zname[255+1], as[256]; dname_str(xfr->name, zname); addr_to_str(&addr, addrlen, as, sizeof(as)); verbose(VERB_ALGO, "auth zone %s transfer next HTTP fetch from %s started", zname, as); } return 1; } /* perform AXFR/IXFR */ /* set the packet to be written */ /* create new ID */ xfr->task_transfer->id = (uint16_t)(ub_random(env->rnd)&0xffff); xfr_create_ixfr_packet(xfr, env->scratch_buffer, xfr->task_transfer->id, master); /* connect on fd */ xfr->task_transfer->cp = outnet_comm_point_for_tcp(env->outnet, auth_xfer_transfer_tcp_callback, xfr, &addr, addrlen, env->scratch_buffer, -1, 
auth_name != NULL, auth_name); if(!xfr->task_transfer->cp) { char zname[255+1], as[256]; dname_str(xfr->name, zname); addr_to_str(&addr, addrlen, as, sizeof(as)); verbose(VERB_ALGO, "cannot create tcp cp connection for " "xfr %s to %s", zname, as); return 0; } comm_timer_set(xfr->task_transfer->timer, &t); if(verbosity >= VERB_ALGO) { char zname[255+1], as[256]; dname_str(xfr->name, zname); addr_to_str(&addr, addrlen, as, sizeof(as)); verbose(VERB_ALGO, "auth zone %s transfer next %s fetch from %s started", zname, (xfr->task_transfer->on_ixfr?"IXFR":"AXFR"), as); } return 1; } /** perform next lookup, next transfer TCP, or end and resume wait time task */ static void xfr_transfer_nexttarget_or_end(struct auth_xfer* xfr, struct module_env* env) { log_assert(xfr->task_transfer->worker == env->worker); /* are we performing lookups? */ while(xfr->task_transfer->lookup_target) { if(xfr_transfer_lookup_host(xfr, env)) { /* wait for lookup to finish, * note that the hostname may be in unbound's cache * and we may then get an instant cache response, * and that calls the callback just like a full * lookup and lookup failures also call callback */ if(verbosity >= VERB_ALGO) { char zname[255+1]; dname_str(xfr->name, zname); verbose(VERB_ALGO, "auth zone %s transfer next target lookup", zname); } lock_basic_unlock(&xfr->lock); return; } xfr_transfer_move_to_next_lookup(xfr, env); } /* initiate TCP and fetch the zone from the master */ /* and set timeout on it */ while(!xfr_transfer_end_of_list(xfr)) { xfr->task_transfer->master = xfr_transfer_current_master(xfr); if(xfr_transfer_init_fetch(xfr, env)) { /* successfully started, wait for callback */ lock_basic_unlock(&xfr->lock); return; } /* failed to fetch, next master */ xfr_transfer_nextmaster(xfr); } if(verbosity >= VERB_ALGO) { char zname[255+1]; dname_str(xfr->name, zname); verbose(VERB_ALGO, "auth zone %s transfer failed, wait", zname); } /* we failed to fetch the zone, move to wait task * use the shorter retry timeout 
*/ xfr_transfer_disown(xfr); /* pick up the nextprobe task and wait */ if(xfr->task_nextprobe->worker == NULL) xfr_set_timeout(xfr, env, 1, 0); lock_basic_unlock(&xfr->lock); } /** add addrs from A or AAAA rrset to the master */ static void xfr_master_add_addrs(struct auth_master* m, struct ub_packed_rrset_key* rrset, uint16_t rrtype) { size_t i; struct packed_rrset_data* data; if(!m || !rrset) return; if(rrtype != LDNS_RR_TYPE_A && rrtype != LDNS_RR_TYPE_AAAA) return; data = (struct packed_rrset_data*)rrset->entry.data; for(i=0; icount; i++) { struct auth_addr* a; size_t len = data->rr_len[i] - 2; uint8_t* rdata = data->rr_data[i]+2; if(rrtype == LDNS_RR_TYPE_A && len != INET_SIZE) continue; /* wrong length for A */ if(rrtype == LDNS_RR_TYPE_AAAA && len != INET6_SIZE) continue; /* wrong length for AAAA */ /* add and alloc it */ a = (struct auth_addr*)calloc(1, sizeof(*a)); if(!a) { log_err("out of memory"); return; } if(rrtype == LDNS_RR_TYPE_A) { struct sockaddr_in* sa; a->addrlen = (socklen_t)sizeof(*sa); sa = (struct sockaddr_in*)&a->addr; sa->sin_family = AF_INET; sa->sin_port = (in_port_t)htons(UNBOUND_DNS_PORT); memmove(&sa->sin_addr, rdata, INET_SIZE); } else { struct sockaddr_in6* sa; a->addrlen = (socklen_t)sizeof(*sa); sa = (struct sockaddr_in6*)&a->addr; sa->sin6_family = AF_INET6; sa->sin6_port = (in_port_t)htons(UNBOUND_DNS_PORT); memmove(&sa->sin6_addr, rdata, INET6_SIZE); } if(verbosity >= VERB_ALGO) { char s[64]; addr_to_str(&a->addr, a->addrlen, s, sizeof(s)); verbose(VERB_ALGO, "auth host %s lookup %s", m->host, s); } /* append to list */ a->next = m->list; m->list = a; } } /** callback for task_transfer lookup of host name, of A or AAAA */ void auth_xfer_transfer_lookup_callback(void* arg, int rcode, sldns_buffer* buf, enum sec_status ATTR_UNUSED(sec), char* ATTR_UNUSED(why_bogus), int ATTR_UNUSED(was_ratelimited)) { struct auth_xfer* xfr = (struct auth_xfer*)arg; struct module_env* env; log_assert(xfr->task_transfer); 
lock_basic_lock(&xfr->lock); env = xfr->task_transfer->env; if(!env || env->outnet->want_to_quit) { lock_basic_unlock(&xfr->lock); return; /* stop on quit */ } /* process result */ if(rcode == LDNS_RCODE_NOERROR) { uint16_t wanted_qtype = LDNS_RR_TYPE_A; struct regional* temp = env->scratch; struct query_info rq; struct reply_info* rep; if(xfr->task_transfer->lookup_aaaa) wanted_qtype = LDNS_RR_TYPE_AAAA; memset(&rq, 0, sizeof(rq)); rep = parse_reply_in_temp_region(buf, temp, &rq); if(rep && rq.qtype == wanted_qtype && FLAGS_GET_RCODE(rep->flags) == LDNS_RCODE_NOERROR) { /* parsed successfully */ struct ub_packed_rrset_key* answer = reply_find_answer_rrset(&rq, rep); if(answer) { xfr_master_add_addrs(xfr->task_transfer-> lookup_target, answer, wanted_qtype); } else { if(verbosity >= VERB_ALGO) { char zname[255+1]; dname_str(xfr->name, zname); verbose(VERB_ALGO, "auth zone %s host %s type %s transfer lookup has nodata", zname, xfr->task_transfer->lookup_target->host, (xfr->task_transfer->lookup_aaaa?"AAAA":"A")); } } } else { if(verbosity >= VERB_ALGO) { char zname[255+1]; dname_str(xfr->name, zname); verbose(VERB_ALGO, "auth zone %s host %s type %s transfer lookup has no answer", zname, xfr->task_transfer->lookup_target->host, (xfr->task_transfer->lookup_aaaa?"AAAA":"A")); } } regional_free_all(temp); } else { if(verbosity >= VERB_ALGO) { char zname[255+1]; dname_str(xfr->name, zname); verbose(VERB_ALGO, "auth zone %s host %s type %s transfer lookup failed", zname, xfr->task_transfer->lookup_target->host, (xfr->task_transfer->lookup_aaaa?"AAAA":"A")); } } if(xfr->task_transfer->lookup_target->list && xfr->task_transfer->lookup_target == xfr_transfer_current_master(xfr)) xfr->task_transfer->scan_addr = xfr->task_transfer->lookup_target->list; /* move to lookup AAAA after A lookup, move to next hostname lookup, * or move to fetch the zone, or, if nothing to do, end task_transfer */ xfr_transfer_move_to_next_lookup(xfr, env); xfr_transfer_nexttarget_or_end(xfr, env); 
} /** check if xfer (AXFR or IXFR) packet is OK. * return false if we lost connection (SERVFAIL, or unreadable). * return false if we need to move from IXFR to AXFR, with gonextonfail * set to false, so the same master is tried again, but with AXFR. * return true if fine to link into data. * return true with transferdone=true when the transfer has ended. */ static int check_xfer_packet(sldns_buffer* pkt, struct auth_xfer* xfr, int* gonextonfail, int* transferdone) { uint8_t* wire = sldns_buffer_begin(pkt); int i; if(sldns_buffer_limit(pkt) < LDNS_HEADER_SIZE) { verbose(VERB_ALGO, "xfr to %s failed, packet too small", xfr->task_transfer->master->host); return 0; } if(!LDNS_QR_WIRE(wire)) { verbose(VERB_ALGO, "xfr to %s failed, packet has no QR flag", xfr->task_transfer->master->host); return 0; } if(LDNS_TC_WIRE(wire)) { verbose(VERB_ALGO, "xfr to %s failed, packet has TC flag", xfr->task_transfer->master->host); return 0; } /* check ID */ if(LDNS_ID_WIRE(wire) != xfr->task_transfer->id) { verbose(VERB_ALGO, "xfr to %s failed, packet wrong ID", xfr->task_transfer->master->host); return 0; } if(LDNS_RCODE_WIRE(wire) != LDNS_RCODE_NOERROR) { char rcode[32]; sldns_wire2str_rcode_buf((int)LDNS_RCODE_WIRE(wire), rcode, sizeof(rcode)); /* if we are doing IXFR, check for fallback */ if(xfr->task_transfer->on_ixfr) { if(LDNS_RCODE_WIRE(wire) == LDNS_RCODE_NOTIMPL || LDNS_RCODE_WIRE(wire) == LDNS_RCODE_SERVFAIL || LDNS_RCODE_WIRE(wire) == LDNS_RCODE_REFUSED || LDNS_RCODE_WIRE(wire) == LDNS_RCODE_FORMERR) { verbose(VERB_ALGO, "xfr to %s, fallback " "from IXFR to AXFR (with rcode %s)", xfr->task_transfer->master->host, rcode); xfr->task_transfer->ixfr_fail = 1; *gonextonfail = 0; return 0; } } verbose(VERB_ALGO, "xfr to %s failed, packet with rcode %s", xfr->task_transfer->master->host, rcode); return 0; } if(LDNS_OPCODE_WIRE(wire) != LDNS_PACKET_QUERY) { verbose(VERB_ALGO, "xfr to %s failed, packet with bad opcode", xfr->task_transfer->master->host); return 0; } 
if(LDNS_QDCOUNT(wire) > 1) { verbose(VERB_ALGO, "xfr to %s failed, packet has qdcount %d", xfr->task_transfer->master->host, (int)LDNS_QDCOUNT(wire)); return 0; } /* check qname */ sldns_buffer_set_position(pkt, LDNS_HEADER_SIZE); for(i=0; i<(int)LDNS_QDCOUNT(wire); i++) { size_t pos = sldns_buffer_position(pkt); uint16_t qtype, qclass; if(pkt_dname_len(pkt) == 0) { verbose(VERB_ALGO, "xfr to %s failed, packet with " "malformed dname", xfr->task_transfer->master->host); return 0; } if(dname_pkt_compare(pkt, sldns_buffer_at(pkt, pos), xfr->name) != 0) { verbose(VERB_ALGO, "xfr to %s failed, packet with " "wrong qname", xfr->task_transfer->master->host); return 0; } if(sldns_buffer_remaining(pkt) < 4) { verbose(VERB_ALGO, "xfr to %s failed, packet with " "truncated query RR", xfr->task_transfer->master->host); return 0; } qtype = sldns_buffer_read_u16(pkt); qclass = sldns_buffer_read_u16(pkt); if(qclass != xfr->dclass) { verbose(VERB_ALGO, "xfr to %s failed, packet with " "wrong qclass", xfr->task_transfer->master->host); return 0; } if(xfr->task_transfer->on_ixfr) { if(qtype != LDNS_RR_TYPE_IXFR) { verbose(VERB_ALGO, "xfr to %s failed, packet " "with wrong qtype, expected IXFR", xfr->task_transfer->master->host); return 0; } } else { if(qtype != LDNS_RR_TYPE_AXFR) { verbose(VERB_ALGO, "xfr to %s failed, packet " "with wrong qtype, expected AXFR", xfr->task_transfer->master->host); return 0; } } } /* check parse of RRs in packet, store first SOA serial * to be able to detect last SOA (with that serial) to see if done */ /* also check for IXFR 'zone up to date' reply */ for(i=0; i<(int)LDNS_ANCOUNT(wire); i++) { size_t pos = sldns_buffer_position(pkt); uint16_t tp, rdlen; if(pkt_dname_len(pkt) == 0) { verbose(VERB_ALGO, "xfr to %s failed, packet with " "malformed dname in answer section", xfr->task_transfer->master->host); return 0; } if(sldns_buffer_remaining(pkt) < 10) { verbose(VERB_ALGO, "xfr to %s failed, packet with " "truncated RR", 
xfr->task_transfer->master->host); return 0; } tp = sldns_buffer_read_u16(pkt); (void)sldns_buffer_read_u16(pkt); /* class */ (void)sldns_buffer_read_u32(pkt); /* ttl */ rdlen = sldns_buffer_read_u16(pkt); if(sldns_buffer_remaining(pkt) < rdlen) { verbose(VERB_ALGO, "xfr to %s failed, packet with " "truncated RR rdata", xfr->task_transfer->master->host); return 0; } /* RR parses (haven't checked rdata itself), now look at * SOA records to see serial number */ if(xfr->task_transfer->rr_scan_num == 0 && tp != LDNS_RR_TYPE_SOA) { verbose(VERB_ALGO, "xfr to %s failed, packet with " "malformed zone transfer, no start SOA", xfr->task_transfer->master->host); return 0; } if(xfr->task_transfer->rr_scan_num == 1 && tp != LDNS_RR_TYPE_SOA) { /* second RR is not a SOA record, this is not an IXFR * the master is replying with an AXFR */ xfr->task_transfer->on_ixfr_is_axfr = 1; } if(tp == LDNS_RR_TYPE_SOA) { uint32_t serial; if(rdlen < 22) { verbose(VERB_ALGO, "xfr to %s failed, packet " "with SOA with malformed rdata", xfr->task_transfer->master->host); return 0; } if(dname_pkt_compare(pkt, sldns_buffer_at(pkt, pos), xfr->name) != 0) { verbose(VERB_ALGO, "xfr to %s failed, packet " "with SOA with wrong dname", xfr->task_transfer->master->host); return 0; } /* read serial number of SOA */ serial = sldns_buffer_read_u32_at(pkt, sldns_buffer_position(pkt)+rdlen-20); /* check for IXFR 'zone has SOA x' reply */ if(xfr->task_transfer->on_ixfr && xfr->task_transfer->rr_scan_num == 0 && LDNS_ANCOUNT(wire)==1) { verbose(VERB_ALGO, "xfr to %s ended, " "IXFR reply that zone has serial %u," " fallback from IXFR to AXFR", xfr->task_transfer->master->host, (unsigned)serial); xfr->task_transfer->ixfr_fail = 1; *gonextonfail = 0; return 0; } /* if first SOA, store serial number */ if(xfr->task_transfer->got_xfr_serial == 0) { xfr->task_transfer->got_xfr_serial = 1; xfr->task_transfer->incoming_xfr_serial = serial; verbose(VERB_ALGO, "xfr %s: contains " "SOA serial %u", 
xfr->task_transfer->master->host, (unsigned)serial); /* see if end of AXFR */ } else if(!xfr->task_transfer->on_ixfr || xfr->task_transfer->on_ixfr_is_axfr) { /* second SOA with serial is the end * for AXFR */ *transferdone = 1; verbose(VERB_ALGO, "xfr %s: last AXFR packet", xfr->task_transfer->master->host); /* for IXFR, count SOA records with that serial */ } else if(xfr->task_transfer->incoming_xfr_serial == serial && xfr->task_transfer->got_xfr_serial == 1) { xfr->task_transfer->got_xfr_serial++; /* if not first soa, if serial==firstserial, the * third time we are at the end, for IXFR */ } else if(xfr->task_transfer->incoming_xfr_serial == serial && xfr->task_transfer->got_xfr_serial == 2) { verbose(VERB_ALGO, "xfr %s: last IXFR packet", xfr->task_transfer->master->host); *transferdone = 1; /* continue parse check, if that succeeds, * transfer is done */ } } xfr->task_transfer->rr_scan_num++; /* skip over RR rdata to go to the next RR */ sldns_buffer_skip(pkt, (ssize_t)rdlen); } /* check authority section */ /* we skip over the RRs checking packet format */ for(i=0; i<(int)LDNS_NSCOUNT(wire); i++) { uint16_t rdlen; if(pkt_dname_len(pkt) == 0) { verbose(VERB_ALGO, "xfr to %s failed, packet with " "malformed dname in authority section", xfr->task_transfer->master->host); return 0; } if(sldns_buffer_remaining(pkt) < 10) { verbose(VERB_ALGO, "xfr to %s failed, packet with " "truncated RR", xfr->task_transfer->master->host); return 0; } (void)sldns_buffer_read_u16(pkt); /* type */ (void)sldns_buffer_read_u16(pkt); /* class */ (void)sldns_buffer_read_u32(pkt); /* ttl */ rdlen = sldns_buffer_read_u16(pkt); if(sldns_buffer_remaining(pkt) < rdlen) { verbose(VERB_ALGO, "xfr to %s failed, packet with " "truncated RR rdata", xfr->task_transfer->master->host); return 0; } /* skip over RR rdata to go to the next RR */ sldns_buffer_skip(pkt, (ssize_t)rdlen); } /* check additional section */ for(i=0; i<(int)LDNS_ARCOUNT(wire); i++) { uint16_t rdlen; if(pkt_dname_len(pkt) == 0) 
{ verbose(VERB_ALGO, "xfr to %s failed, packet with " "malformed dname in additional section", xfr->task_transfer->master->host); return 0; } if(sldns_buffer_remaining(pkt) < 10) { verbose(VERB_ALGO, "xfr to %s failed, packet with " "truncated RR", xfr->task_transfer->master->host); return 0; } (void)sldns_buffer_read_u16(pkt); /* type */ (void)sldns_buffer_read_u16(pkt); /* class */ (void)sldns_buffer_read_u32(pkt); /* ttl */ rdlen = sldns_buffer_read_u16(pkt); if(sldns_buffer_remaining(pkt) < rdlen) { verbose(VERB_ALGO, "xfr to %s failed, packet with " "truncated RR rdata", xfr->task_transfer->master->host); return 0; } /* skip over RR rdata to go to the next RR */ sldns_buffer_skip(pkt, (ssize_t)rdlen); } return 1; } /** Link the data from this packet into the worklist of transferred data */ static int xfer_link_data(sldns_buffer* pkt, struct auth_xfer* xfr) { /* alloc it */ struct auth_chunk* e; e = (struct auth_chunk*)calloc(1, sizeof(*e)); if(!e) return 0; e->next = NULL; e->len = sldns_buffer_limit(pkt); e->data = memdup(sldns_buffer_begin(pkt), e->len); if(!e->data) { free(e); return 0; } /* alloc succeeded, link into list */ if(!xfr->task_transfer->chunks_first) xfr->task_transfer->chunks_first = e; if(xfr->task_transfer->chunks_last) xfr->task_transfer->chunks_last->next = e; xfr->task_transfer->chunks_last = e; return 1; } /** task transfer. the list of data is complete. process it and if failed * move to next master, if succeeded, end the task transfer */ static void process_list_end_transfer(struct auth_xfer* xfr, struct module_env* env) { int ixfr_fail = 0; if(xfr_process_chunk_list(xfr, env, &ixfr_fail)) { /* it worked! 
*/ auth_chunks_delete(xfr->task_transfer); /* we fetched the zone, move to wait task */ xfr_transfer_disown(xfr); if(xfr->notify_received && (!xfr->notify_has_serial || (xfr->notify_has_serial && xfr_serial_means_update(xfr, xfr->notify_serial)))) { uint32_t sr = xfr->notify_serial; int has_sr = xfr->notify_has_serial; /* we received a notify while probe/transfer was * in progress. start a new probe and transfer */ xfr->notify_received = 0; xfr->notify_has_serial = 0; xfr->notify_serial = 0; if(!xfr_start_probe(xfr, env, NULL)) { /* if we couldn't start it, already in * progress; restore notify serial, * while xfr still locked */ xfr->notify_received = 1; xfr->notify_has_serial = has_sr; xfr->notify_serial = sr; lock_basic_unlock(&xfr->lock); } return; } else { /* pick up the nextprobe task and wait (normail wait time) */ if(xfr->task_nextprobe->worker == NULL) xfr_set_timeout(xfr, env, 0, 0); } lock_basic_unlock(&xfr->lock); return; } /* processing failed */ /* when done, delete data from list */ auth_chunks_delete(xfr->task_transfer); if(ixfr_fail) { xfr->task_transfer->ixfr_fail = 1; } else { xfr_transfer_nextmaster(xfr); } xfr_transfer_nexttarget_or_end(xfr, env); } /** callback for the task_transfer timer */ void auth_xfer_transfer_timer_callback(void* arg) { struct auth_xfer* xfr = (struct auth_xfer*)arg; struct module_env* env; int gonextonfail = 1; log_assert(xfr->task_transfer); lock_basic_lock(&xfr->lock); env = xfr->task_transfer->env; if(!env || env->outnet->want_to_quit) { lock_basic_unlock(&xfr->lock); return; /* stop on quit */ } verbose(VERB_ALGO, "xfr stopped, connection timeout to %s", xfr->task_transfer->master->host); /* see if IXFR caused the failure, if so, try AXFR */ if(xfr->task_transfer->on_ixfr) { xfr->task_transfer->ixfr_possible_timeout_count++; if(xfr->task_transfer->ixfr_possible_timeout_count >= NUM_TIMEOUTS_FALLBACK_IXFR) { verbose(VERB_ALGO, "xfr to %s, fallback " "from IXFR to AXFR (because of timeouts)", 
xfr->task_transfer->master->host); xfr->task_transfer->ixfr_fail = 1; gonextonfail = 0; } } /* delete transferred data from list */ auth_chunks_delete(xfr->task_transfer); comm_point_delete(xfr->task_transfer->cp); xfr->task_transfer->cp = NULL; if(gonextonfail) xfr_transfer_nextmaster(xfr); xfr_transfer_nexttarget_or_end(xfr, env); } /** callback for task_transfer tcp connections */ int auth_xfer_transfer_tcp_callback(struct comm_point* c, void* arg, int err, struct comm_reply* ATTR_UNUSED(repinfo)) { struct auth_xfer* xfr = (struct auth_xfer*)arg; struct module_env* env; int gonextonfail = 1; int transferdone = 0; log_assert(xfr->task_transfer); lock_basic_lock(&xfr->lock); env = xfr->task_transfer->env; if(!env || env->outnet->want_to_quit) { lock_basic_unlock(&xfr->lock); return 0; /* stop on quit */ } /* stop the timer */ comm_timer_disable(xfr->task_transfer->timer); if(err != NETEVENT_NOERROR) { /* connection failed, closed, or timeout */ /* stop this transfer, cleanup * and continue task_transfer*/ verbose(VERB_ALGO, "xfr stopped, connection lost to %s", xfr->task_transfer->master->host); /* see if IXFR caused the failure, if so, try AXFR */ if(xfr->task_transfer->on_ixfr) { xfr->task_transfer->ixfr_possible_timeout_count++; if(xfr->task_transfer->ixfr_possible_timeout_count >= NUM_TIMEOUTS_FALLBACK_IXFR) { verbose(VERB_ALGO, "xfr to %s, fallback " "from IXFR to AXFR (because of timeouts)", xfr->task_transfer->master->host); xfr->task_transfer->ixfr_fail = 1; gonextonfail = 0; } } failed: /* delete transferred data from list */ auth_chunks_delete(xfr->task_transfer); comm_point_delete(xfr->task_transfer->cp); xfr->task_transfer->cp = NULL; if(gonextonfail) xfr_transfer_nextmaster(xfr); xfr_transfer_nexttarget_or_end(xfr, env); return 0; } /* note that IXFR worked without timeout */ if(xfr->task_transfer->on_ixfr) xfr->task_transfer->ixfr_possible_timeout_count = 0; /* handle returned packet */ /* if it fails, cleanup and end this transfer */ /* if it needs 
to fallback from IXFR to AXFR, do that */
	if(!check_xfer_packet(c->buffer, xfr, &gonextonfail, &transferdone)) {
		goto failed;
	}

	/* if it is good, link it into the list of data */
	/* if the link into list of data fails (malloc fail) cleanup and end */
	if(!xfer_link_data(c->buffer, xfr)) {
		verbose(VERB_ALGO, "xfr stopped to %s, malloc failed",
			xfr->task_transfer->master->host);
		goto failed;
	}
	/* if the transfer is done now, disconnect and process the list */
	if(transferdone) {
		comm_point_delete(xfr->task_transfer->cp);
		xfr->task_transfer->cp = NULL;
		process_list_end_transfer(xfr, env);
		return 0;
	}

	/* if we want to read more messages, setup the commpoint to read
	 * a DNS packet, and the timeout */
	lock_basic_unlock(&xfr->lock);
	c->tcp_is_reading = 1;
	sldns_buffer_clear(c->buffer);
	comm_point_start_listening(c, -1, AUTH_TRANSFER_TIMEOUT);
	return 0;
}

/** callback for task_transfer http connections.
 * Takes xfr->lock on entry.  Data present in c->buffer is linked into the
 * transfer's chunk list; on NETEVENT_DONE the commpoint is deleted and the
 * collected list is processed; on any other error (or a failed link) the
 * collected chunks are dropped and the next master is tried.
 * @param c: the commpoint the event arrived on.
 * @param arg: the struct auth_xfer for this transfer.
 * @param err: netevent error code for this event.
 * @param repinfo: may be NULL; when set, repinfo->c is cleared before the
 *	commpoint is deleted.
 * @return 0 on every path.
 * NOTE(review): the failure and done paths return without an explicit
 * unlock here; presumably the helpers called last release xfr->lock -
 * confirm against their definitions. */
int
auth_xfer_transfer_http_callback(struct comm_point* c, void* arg, int err,
	struct comm_reply* repinfo)
{
	struct auth_xfer* xfr = (struct auth_xfer*)arg;
	struct module_env* env;
	log_assert(xfr->task_transfer);
	lock_basic_lock(&xfr->lock);
	env = xfr->task_transfer->env;
	if(!env || env->outnet->want_to_quit) {
		lock_basic_unlock(&xfr->lock);
		return 0; /* stop on quit */
	}
	verbose(VERB_ALGO, "auth zone transfer http callback");
	/* stop the timer */
	comm_timer_disable(xfr->task_transfer->timer);

	if(err != NETEVENT_NOERROR && err != NETEVENT_DONE) {
		/* connection failed, closed, or timeout */
		/* stop this transfer, cleanup
		 * and continue task_transfer*/
		verbose(VERB_ALGO, "http stopped, connection lost to %s",
			xfr->task_transfer->master->host);
	failed:
		/* delete transferred data from list */
		auth_chunks_delete(xfr->task_transfer);
		if(repinfo) repinfo->c = NULL; /* signal cp deleted to
				the routine calling this callback */
		comm_point_delete(xfr->task_transfer->cp);
		xfr->task_transfer->cp = NULL;
		xfr_transfer_nextmaster(xfr);
		xfr_transfer_nexttarget_or_end(xfr, env);
		return 0;
	}

	/* if it is good, link it into the list of data */
	/* if the link into list of data fails (malloc fail) cleanup and end */
	/* an event may carry no payload, so only queue non-empty buffers */
	if(sldns_buffer_limit(c->buffer) > 0) {
		verbose(VERB_ALGO, "auth zone http queued up %d bytes",
			(int)sldns_buffer_limit(c->buffer));
		if(!xfer_link_data(c->buffer, xfr)) {
			verbose(VERB_ALGO, "http stopped to %s, malloc failed",
				xfr->task_transfer->master->host);
			goto failed;
		}
	}
	/* if the transfer is done now, disconnect and process the list */
	if(err == NETEVENT_DONE) {
		if(repinfo) repinfo->c = NULL; /* signal cp deleted to
				the routine calling this callback */
		comm_point_delete(xfr->task_transfer->cp);
		xfr->task_transfer->cp = NULL;
		process_list_end_transfer(xfr, env);
		return 0;
	}

	/* if we want to read more messages, setup the commpoint to read
	 * a DNS packet, and the timeout */
	lock_basic_unlock(&xfr->lock);
	c->tcp_is_reading = 1;
	sldns_buffer_clear(c->buffer);
	comm_point_start_listening(c, -1, AUTH_TRANSFER_TIMEOUT);
	return 0;
}


/** start transfer task by this worker , xfr is locked.
 * The task must be unowned and have an empty chunk list (asserted).
 * Records env->worker and env as the owner, sets up the master list and
 * the hostname lookups, then moves on to the first target.
 * @param xfr: the transfer structure.
 * @param env: module environment of the worker picking up the task.
 * @param master: passed on to xfr_transfer_start_list; callers in this
 *	file pass NULL when there is no specific master to start with. */
static void
xfr_start_transfer(struct auth_xfer* xfr, struct module_env* env,
	struct auth_master* master)
{
	log_assert(xfr->task_transfer != NULL);
	log_assert(xfr->task_transfer->worker == NULL);
	log_assert(xfr->task_transfer->chunks_first == NULL);
	log_assert(xfr->task_transfer->chunks_last == NULL);
	xfr->task_transfer->worker = env->worker;
	xfr->task_transfer->env = env;

	/* init transfer process */
	/* find that master in the transfer's list of masters? */
	xfr_transfer_start_list(xfr, master);
	/* start lookup for hostnames in transfer master list */
	xfr_transfer_start_lookups(xfr);

	/* initiate TCP, and set timeout on it */
	xfr_transfer_nexttarget_or_end(xfr, env);
}

/** disown task_probe.
caller must hold xfr.lock */
static void
xfr_probe_disown(struct auth_xfer* xfr)
{
	/* remove timer (from this worker's event base) */
	comm_timer_delete(xfr->task_probe->timer);
	xfr->task_probe->timer = NULL;
	/* remove the commpoint */
	comm_point_delete(xfr->task_probe->cp);
	xfr->task_probe->cp = NULL;
	/* we don't own this item anymore */
	xfr->task_probe->worker = NULL;
	xfr->task_probe->env = NULL;
}

/** send the UDP probe to the master, this is part of task_probe.
 * @param xfr: the transfer structure (task_probe is used).
 * @param env: module environment; its scratch_buffer is overwritten with
 *	the probe packet.
 * @param timeout: timer value in msec.  A fresh query ID is drawn only when
 *	this equals AUTH_PROBE_TIMEOUT, i.e. not for the doubled-timeout
 *	retries.
 * @return 1 if the packet was sent and the timer armed, 0 if the current
 *	master is not a probe target or on any failure. */
static int
xfr_probe_send_probe(struct auth_xfer* xfr, struct module_env* env,
	int timeout)
{
	struct sockaddr_storage addr;
	socklen_t addrlen = 0;
	struct timeval t;
	/* pick master */
	struct auth_master* master = xfr_probe_current_master(xfr);
	char *auth_name = NULL;
	if(!master) return 0;
	if(master->allow_notify) return 0; /* only for notify */
	if(master->http) return 0; /* only masters get SOA UDP probe,
		not urls, if those are in this list */

	/* get master addr */
	if(xfr->task_probe->scan_addr) {
		addrlen = xfr->task_probe->scan_addr->addrlen;
		memmove(&addr, &xfr->task_probe->scan_addr->addr, addrlen);
	} else {
		if(!authextstrtoaddr(master->host, &addr, &addrlen, &auth_name)) {
			/* the ones that are not in addr format are supposed
			 * to be looked up.  The lookup has failed however,
			 * so skip them */
			char zname[255+1];
			dname_str(xfr->name, zname);
			log_err("%s: failed lookup, cannot probe to master %s",
				zname, master->host);
			return 0;
		}
		/* an auth name was parsed from the host string; if the
		 * parsed port equals the configured ssl_port, rewrite it
		 * to the configured plain port for the UDP probe */
		if (auth_name != NULL) {
			if (addr.ss_family == AF_INET
			&&  (int)ntohs(((struct sockaddr_in *)&addr)->sin_port)
			    == env->cfg->ssl_port)
				((struct sockaddr_in *)&addr)->sin_port
					= htons((uint16_t)env->cfg->port);
			else if (addr.ss_family == AF_INET6
			&&  (int)ntohs(((struct sockaddr_in6 *)&addr)->sin6_port)
			    == env->cfg->ssl_port)
				((struct sockaddr_in6 *)&addr)->sin6_port
					= htons((uint16_t)env->cfg->port);
		}
	}

	/* create packet */
	/* create new ID for new probes, but not on timeout retries,
	 * this means we'll accept replies to previous retries to same ip */
	if(timeout == AUTH_PROBE_TIMEOUT)
		xfr->task_probe->id = (uint16_t)(ub_random(env->rnd)&0xffff);
	xfr_create_soa_probe_packet(xfr, env->scratch_buffer,
		xfr->task_probe->id);
	/* we need to remove the cp if we have a different ip4/ip6 type now */
	if(xfr->task_probe->cp &&
		((xfr->task_probe->cp_is_ip6 && !addr_is_ip6(&addr, addrlen)) ||
		(!xfr->task_probe->cp_is_ip6 && addr_is_ip6(&addr, addrlen)))
		) {
		comm_point_delete(xfr->task_probe->cp);
		xfr->task_probe->cp = NULL;
	}
	if(!xfr->task_probe->cp) {
		if(addr_is_ip6(&addr, addrlen))
			xfr->task_probe->cp_is_ip6 = 1;
		else 	xfr->task_probe->cp_is_ip6 = 0;
		xfr->task_probe->cp = outnet_comm_point_for_udp(env->outnet,
			auth_xfer_probe_udp_callback, xfr, &addr, addrlen);
		if(!xfr->task_probe->cp) {
			char zname[255+1], as[256];
			dname_str(xfr->name, zname);
			addr_to_str(&addr, addrlen, as, sizeof(as));
			verbose(VERB_ALGO, "cannot create udp cp for "
				"probe %s to %s", zname, as);
			return 0;
		}
	}
	if(!xfr->task_probe->timer) {
		xfr->task_probe->timer = comm_timer_create(env->worker_base,
			auth_xfer_probe_timer_callback, xfr);
		if(!xfr->task_probe->timer) {
			log_err("malloc failure");
			return 0;
		}
	}

	/* send udp packet */
	/* NOTE(review): the '-' and '+' lines below are unified-diff markers
	 * carried in this text (old and new argument list of the call) */
	if(!comm_point_send_udp_msg(xfr->task_probe->cp, env->scratch_buffer,
-		(struct sockaddr*)&addr, addrlen)) {
+		(struct sockaddr*)&addr, addrlen, 0)) {
		char zname[255+1], as[256];
		dname_str(xfr->name, zname);
		addr_to_str(&addr, addrlen, as, sizeof(as));
		verbose(VERB_ALGO, "failed to send soa probe for %s to %s",
			zname, as);
		return 0;
	}
	if(verbosity >= VERB_ALGO) {
		char zname[255+1], as[256];
		dname_str(xfr->name, zname);
		addr_to_str(&addr, addrlen, as, sizeof(as));
		verbose(VERB_ALGO, "auth zone %s soa probe sent to %s", zname,
			as);
	}
	xfr->task_probe->timeout = timeout;
	/* timeout is in msec, split into seconds and microseconds */
#ifndef S_SPLINT_S
	t.tv_sec = timeout/1000;
	t.tv_usec = (timeout%1000)*1000;
#endif
	comm_timer_set(xfr->task_probe->timer, &t);

	return 1;
}

/** callback for task_probe timer.
 * Takes xfr->lock.  While the stored timeout is at most
 * AUTH_PROBE_TIMEOUT_STOP the probe is resent with a doubled timeout;
 * otherwise (or when resending fails) the commpoint is dropped and the
 * next master is tried.
 * @param arg: the struct auth_xfer. */
void
auth_xfer_probe_timer_callback(void* arg)
{
	struct auth_xfer* xfr = (struct auth_xfer*)arg;
	struct module_env* env;
	log_assert(xfr->task_probe);
	lock_basic_lock(&xfr->lock);
	env = xfr->task_probe->env;
	if(!env || env->outnet->want_to_quit) {
		lock_basic_unlock(&xfr->lock);
		return; /* stop on quit */
	}

	if(verbosity >= VERB_ALGO) {
		char zname[255+1];
		dname_str(xfr->name, zname);
		verbose(VERB_ALGO, "auth zone %s soa probe timeout", zname);
	}
	if(xfr->task_probe->timeout <= AUTH_PROBE_TIMEOUT_STOP) {
		/* try again with bigger timeout */
		if(xfr_probe_send_probe(xfr, env, xfr->task_probe->timeout*2)) {
			lock_basic_unlock(&xfr->lock);
			return;
		}
	}
	/* delete commpoint so a new one is created, with a fresh port nr */
	comm_point_delete(xfr->task_probe->cp);
	xfr->task_probe->cp = NULL;

	/* too many timeouts (or fail to send), move to next or end */
	xfr_probe_nextmaster(xfr);
	/* xfr_probe_send_or_end unlocks xfr->lock on all of its paths */
	xfr_probe_send_or_end(xfr, env);
}

/** callback for task_probe udp packets.
 * Takes xfr->lock.  A valid SOA reply whose serial means an update hands
 * over to the transfer task (if that task is unowned); an unchanged serial
 * records a new lease; in every non-update case the next master is probed.
 * @param c: commpoint with the reply in c->buffer.
 * @param arg: the struct auth_xfer.
 * @param err: netevent error code.
 * @param repinfo: reply info; repinfo->c is set to NULL unconditionally.
 * @return 0 always, no reply is sent to the packet. */
int
auth_xfer_probe_udp_callback(struct comm_point* c, void* arg, int err,
	struct comm_reply* repinfo)
{
	struct auth_xfer* xfr = (struct auth_xfer*)arg;
	struct module_env* env;
	log_assert(xfr->task_probe);
	lock_basic_lock(&xfr->lock);
	env = xfr->task_probe->env;
	if(!env || env->outnet->want_to_quit) {
		lock_basic_unlock(&xfr->lock);
		return 0; /* stop on quit */
	}

	/* the comm_point_udp_callback is in a for loop for NUM_UDP_PER_SELECT
	 * and we set rep.c=NULL to stop it from looking inside the commpoint*/
	repinfo->c = NULL;
	/* stop the timer */
	comm_timer_disable(xfr->task_probe->timer);

	/* see if we got a packet and what that means */
	if(err == NETEVENT_NOERROR) {
		uint32_t serial = 0;
		if(check_packet_ok(c->buffer, LDNS_RR_TYPE_SOA, xfr,
			&serial)) {
			/* successful lookup */
			if(verbosity >= VERB_ALGO) {
				char buf[256];
				dname_str(xfr->name, buf);
				verbose(VERB_ALGO, "auth zone %s: soa probe "
					"serial is %u", buf, (unsigned)serial);
			}
			/* see if this serial indicates that the zone has
			 * to be updated */
			if(xfr_serial_means_update(xfr, serial)) {
				/* if updated, start the transfer task, if needed */
				verbose(VERB_ALGO, "auth_zone updated, start transfer");
				if(xfr->task_transfer->worker == NULL) {
					struct auth_master* master =
						xfr_probe_current_master(xfr);
					/* if we have download URLs use them
					 * in preference to this master we
					 * just probed the SOA from */
					if(xfr->task_transfer->masters &&
						xfr->task_transfer->masters->http)
						master = NULL;
					xfr_probe_disown(xfr);
					/* NOTE(review): no unlock here;
					 * presumably the transfer start path
					 * releases xfr->lock - confirm */
					xfr_start_transfer(xfr, env, master);
					return 0;

				}
				/* other tasks are running, we don't do this anymore */
				xfr_probe_disown(xfr);
				lock_basic_unlock(&xfr->lock);
				/* return, we don't send a reply to this udp packet,
				 * and we setup the tasks to do next */
				return 0;
			} else {
				verbose(VERB_ALGO, "auth_zone master reports unchanged soa serial");
				/* if we cannot find updates amongst the
				 * masters, this means we then have a new lease
				 * on the zone */
				xfr->task_probe->have_new_lease = 1;
			}
		} else {
			if(verbosity >= VERB_ALGO) {
				char buf[256];
				dname_str(xfr->name, buf);
				verbose(VERB_ALGO, "auth zone %s: bad reply to soa probe", buf);
			}
		}
	} else {
		if(verbosity >= VERB_ALGO) {
			char buf[256];
			dname_str(xfr->name, buf);
			verbose(VERB_ALGO, "auth zone %s: soa probe failed", buf);
		}
	}

	/* failed lookup or not an update */
	/* delete commpoint so a new one is created, with a fresh port nr */
	comm_point_delete(xfr->task_probe->cp);
	xfr->task_probe->cp = NULL;

	/* if the result was not a successful probe, we need
	 * to send the next one */
	xfr_probe_nextmaster(xfr);
	xfr_probe_send_or_end(xfr, env);
	return 0;
}

/** lookup a host name for its addresses, if needed.
 * Uses task_probe->lookup_target as the host; queries type A, or AAAA when
 * task_probe->lookup_aaaa is set.  xfr->lock is released around the
 * mesh_new_callback() call and held again on return.
 * @param xfr: the transfer structure, locked by the caller.
 * @param env: module environment; scratch_buffer and mesh are used.
 * @return 1 if a lookup was started and the caller has to wait for the
 *	callback, 0 if no lookup is needed or it could not be started. */
static int
xfr_probe_lookup_host(struct auth_xfer* xfr, struct module_env* env)
{
	struct sockaddr_storage addr;
	socklen_t addrlen = 0;
	struct auth_master* master = xfr->task_probe->lookup_target;
	struct query_info qinfo;
	uint16_t qflags = BIT_RD;
	uint8_t dname[LDNS_MAX_DOMAINLEN+1];
	struct edns_data edns;
	sldns_buffer* buf = env->scratch_buffer;
	if(!master) return 0;
	if(extstrtoaddr(master->host, &addr, &addrlen)) {
		/* not needed, host is in IP addr format */
		return 0;
	}
	/* exactly one '/' in an allow_notify entry: treated as IP/prefix */
	if(master->allow_notify && !master->http &&
		strchr(master->host, '/') != NULL &&
		strchr(master->host, '/') == strrchr(master->host, '/')) {
		return 0; /* is IP/prefix format, not something to look up */
	}

	/* use mesh_new_callback to probe for non-addr hosts,
	 * and then wait for them to be looked up (in cache, or query) */
	qinfo.qname_len = sizeof(dname);
	if(sldns_str2wire_dname_buf(master->host, dname, &qinfo.qname_len)
		!= 0) {
		log_err("cannot parse host name of master %s", master->host);
		return 0;
	}
	qinfo.qname = dname;
	qinfo.qclass = xfr->dclass;
	qinfo.qtype = LDNS_RR_TYPE_A;
	if(xfr->task_probe->lookup_aaaa)
		qinfo.qtype = LDNS_RR_TYPE_AAAA;
	qinfo.local_alias = NULL;
	if(verbosity >= VERB_ALGO) {
		char buf1[512];
		char buf2[LDNS_MAX_DOMAINLEN+1];
		dname_str(xfr->name, buf2);
		snprintf(buf1, sizeof(buf1), "auth zone %s: master lookup"
			" for task_probe", buf2);
		log_query_info(VERB_ALGO, buf1, &qinfo);
	}
	edns.edns_present = 1;
	edns.ext_rcode = 0;
	edns.edns_version = 0;
	edns.bits = EDNS_DO;
	edns.opt_list = NULL;
	/* advertise the scratch buffer capacity, capped at 65535 */
	if(sldns_buffer_capacity(buf) < 65535)
		edns.udp_size = (uint16_t)sldns_buffer_capacity(buf);
	else	edns.udp_size = 65535;

	/* unlock xfr during mesh_new_callback() because the callback can be
	 * called straight away */
	lock_basic_unlock(&xfr->lock);
	if(!mesh_new_callback(env->mesh, &qinfo, qflags, &edns, buf, 0,
		&auth_xfer_probe_lookup_callback, xfr)) {
		lock_basic_lock(&xfr->lock);
		log_err("out of memory lookup up master %s", master->host);
		return 0;
	}
	lock_basic_lock(&xfr->lock);
	return 1;
}

/** move to sending the probe packets, next if fails. task_probe.
 * Called with xfr->lock held; the lock is released on every return path.
 * Order of work: pending hostname lookups, refresh of the allow_notify
 * address list, the only_lookup early exit, then probing masters until one
 * probe is sent or the list ends, and finally setting the wait timer.
 * @param xfr: the transfer structure.
 * @param env: module environment. */
static void
xfr_probe_send_or_end(struct auth_xfer* xfr, struct module_env* env)
{
	/* are we doing hostname lookups? */
	while(xfr->task_probe->lookup_target) {
		if(xfr_probe_lookup_host(xfr, env)) {
			/* wait for lookup to finish,
			 * note that the hostname may be in unbound's cache
			 * and we may then get an instant cache response,
			 * and that calls the callback just like a full
			 * lookup and lookup failures also call callback */
			if(verbosity >= VERB_ALGO) {
				char zname[255+1];
				dname_str(xfr->name, zname);
				verbose(VERB_ALGO, "auth zone %s probe next target lookup", zname);
			}
			lock_basic_unlock(&xfr->lock);
			return;
		}
		xfr_probe_move_to_next_lookup(xfr, env);
	}
	/* probe of list has ended.  Create or refresh the list of
	 * allow_notify addrs */
	probe_copy_masters_for_allow_notify(xfr);
	if(verbosity >= VERB_ALGO) {
		char zname[255+1];
		dname_str(xfr->name, zname);
		verbose(VERB_ALGO, "auth zone %s probe: notify addrs updated", zname);
	}
	if(xfr->task_probe->only_lookup) {
		/* only wanted lookups for copy, stop probe and start wait */
		xfr->task_probe->only_lookup = 0;
		if(verbosity >= VERB_ALGO) {
			char zname[255+1];
			dname_str(xfr->name, zname);
			verbose(VERB_ALGO, "auth zone %s probe: finished only_lookup", zname);
		}
		xfr_probe_disown(xfr);
		if(xfr->task_nextprobe->worker == NULL)
			xfr_set_timeout(xfr, env, 0, 0);
		lock_basic_unlock(&xfr->lock);
		return;
	}

	/* send probe packets */
	while(!xfr_probe_end_of_list(xfr)) {
		if(xfr_probe_send_probe(xfr, env, AUTH_PROBE_TIMEOUT)) {
			/* successfully sent probe, wait for callback */
			lock_basic_unlock(&xfr->lock);
			return;
		}
		/* failed to send probe, next master */
		xfr_probe_nextmaster(xfr);
	}

	/* done with probe sequence, wait */
	if(xfr->task_probe->have_new_lease) {
		/* if zone not updated, start the wait timer again */
		if(verbosity >= VERB_ALGO) {
			char zname[255+1];
			dname_str(xfr->name, zname);
			verbose(VERB_ALGO, "auth_zone %s unchanged, new lease, wait", zname);
		}
		xfr_probe_disown(xfr);
		if(xfr->have_zone)
			xfr->lease_time = *env->now;
		if(xfr->task_nextprobe->worker == NULL)
			xfr_set_timeout(xfr, env, 0, 0);
	} else {
		if(verbosity >= VERB_ALGO) {
			char zname[255+1];
			dname_str(xfr->name, zname);
			verbose(VERB_ALGO, "auth zone %s soa probe failed, wait to retry", zname);
		}
		/* we failed to send this as well, move to the wait task,
		 * use the shorter retry timeout */
		xfr_probe_disown(xfr);
		/* pick up the nextprobe task and wait */
		if(xfr->task_nextprobe->worker == NULL)
			xfr_set_timeout(xfr, env, 1, 0);
	}

	lock_basic_unlock(&xfr->lock);
}

/** callback for task_probe lookup of host name, of A or AAAA.
 * Takes xfr->lock.  On a NOERROR result with a matching answer rrset the
 * addresses are added to the lookup target; every outcome then advances to
 * the next lookup and continues the probe sequence.
 * @param arg: the struct auth_xfer.
 * @param rcode: result code of the lookup.
 * @param buf: buffer holding the reply packet.
 * The remaining parameters are unused. */
void auth_xfer_probe_lookup_callback(void* arg, int rcode, sldns_buffer* buf,
	enum sec_status ATTR_UNUSED(sec), char* ATTR_UNUSED(why_bogus),
	int ATTR_UNUSED(was_ratelimited))
{
	struct auth_xfer* xfr = (struct auth_xfer*)arg;
	struct module_env* env;
	log_assert(xfr->task_probe);
	lock_basic_lock(&xfr->lock);
	env = xfr->task_probe->env;
	if(!env || env->outnet->want_to_quit) {
		lock_basic_unlock(&xfr->lock);
		return; /* stop on quit */
	}

	/* process result */
	if(rcode == LDNS_RCODE_NOERROR) {
		uint16_t wanted_qtype = LDNS_RR_TYPE_A;
		struct regional* temp = env->scratch;
		struct query_info rq;
		struct reply_info* rep;
		if(xfr->task_probe->lookup_aaaa)
			wanted_qtype = LDNS_RR_TYPE_AAAA;
		memset(&rq, 0, sizeof(rq));
		rep = parse_reply_in_temp_region(buf, temp, &rq);
		if(rep && rq.qtype == wanted_qtype &&
			FLAGS_GET_RCODE(rep->flags) == LDNS_RCODE_NOERROR) {
			/* parsed successfully */
			struct ub_packed_rrset_key* answer =
				reply_find_answer_rrset(&rq, rep);
			if(answer) {
				xfr_master_add_addrs(xfr->task_probe->
					lookup_target, answer, wanted_qtype);
			} else {
				if(verbosity >= VERB_ALGO) {
					char zname[255+1];
					dname_str(xfr->name, zname);
					verbose(VERB_ALGO, "auth zone %s host %s type %s probe lookup has nodata", zname, xfr->task_probe->lookup_target->host, (xfr->task_probe->lookup_aaaa?"AAAA":"A"));
				}
			}
		} else {
			if(verbosity >= VERB_ALGO) {
				char zname[255+1];
				dname_str(xfr->name, zname);
				verbose(VERB_ALGO, "auth zone %s host %s type %s probe lookup has no address", zname, xfr->task_probe->lookup_target->host, (xfr->task_probe->lookup_aaaa?"AAAA":"A"));
			}
		}
		/* the reply was parsed into the scratch region, release it */
		regional_free_all(temp);
	} else {
		if(verbosity >= VERB_ALGO) {
			char zname[255+1];
			dname_str(xfr->name, zname);
			verbose(VERB_ALGO, "auth zone %s host %s type %s probe lookup failed", zname, xfr->task_probe->lookup_target->host, (xfr->task_probe->lookup_aaaa?"AAAA":"A"));
		}
	}
	/* if the looked up host is the master currently being probed and it
	 * has an address list, scan that list */
	if(xfr->task_probe->lookup_target->list &&
		xfr->task_probe->lookup_target == xfr_probe_current_master(xfr))
		xfr->task_probe->scan_addr = xfr->task_probe->lookup_target->list;

	/* move to lookup AAAA after A lookup, move to next hostname lookup,
	 * or move to send the probes, or, if nothing to do, end task_probe */
	xfr_probe_move_to_next_lookup(xfr, env);
	xfr_probe_send_or_end(xfr, env);
}

/** disown task_nextprobe.  caller must hold xfr.lock */
static void
xfr_nextprobe_disown(struct auth_xfer* xfr)
{
	/* delete the timer, because the next worker to pick this up may
	 * not have the same event base */
	comm_timer_delete(xfr->task_nextprobe->timer);
	xfr->task_nextprobe->timer = NULL;
	xfr->task_nextprobe->next_probe = 0;
	/* we don't own this item anymore */
	xfr->task_nextprobe->worker = NULL;
	xfr->task_nextprobe->env = NULL;
}

/** xfer nextprobe timeout callback, this is part of task_nextprobe.
 * Takes xfr->lock, marks the zone expired when the lease plus expiry has
 * passed, gives up the nextprobe task and tries to start a probe.
 * @param arg: the struct auth_xfer. */
void
auth_xfer_timer(void* arg)
{
	struct auth_xfer* xfr = (struct auth_xfer*)arg;
	struct module_env* env;
	log_assert(xfr->task_nextprobe);
	lock_basic_lock(&xfr->lock);
	env = xfr->task_nextprobe->env;
	if(!env || env->outnet->want_to_quit) {
		lock_basic_unlock(&xfr->lock);
		return; /* stop on quit */
	}

	/* see if zone has expired, and if so, also set auth_zone expired */
	if(xfr->have_zone && !xfr->zone_expired &&
	   *env->now >= xfr->lease_time + xfr->expiry) {
		/* xfr->lock is dropped around auth_xfer_set_expired() */
		lock_basic_unlock(&xfr->lock);
		auth_xfer_set_expired(xfr, env, 1);
		lock_basic_lock(&xfr->lock);
	}

	xfr_nextprobe_disown(xfr);

	if(!xfr_start_probe(xfr, env, NULL)) {
		/* not started because already in progress */
		lock_basic_unlock(&xfr->lock);
	}
}

/** return true if there are probe (SOA UDP query) targets in the master list.
 * An entry counts when it is not allow_notify and has a host set. */
static int
have_probe_targets(struct auth_master* list)
{
	struct auth_master* p;
	for(p=list; p; p = p->next) {
		if(!p->allow_notify && p->host)
			return 1;
	}
	return 0;
}

/** start task_probe if possible, if no masters for probe start task_transfer
 * returns true if task has been started, and false if the task is already
 * in progress.
*/
static int
xfr_start_probe(struct auth_xfer* xfr, struct module_env* env,
	struct auth_master* spec)
{
	/* see if we need to start a probe (or maybe it is already in
	 * progress (due to notify)) */
	if(xfr->task_probe->worker == NULL) {
		if(!have_probe_targets(xfr->task_probe->masters) &&
			!(xfr->task_probe->only_lookup &&
			xfr->task_probe->masters != NULL)) {
			/* useless to pick up task_probe, no masters to
			 * probe. Instead attempt to pick up task transfer */
			if(xfr->task_transfer->worker == NULL) {
				xfr_start_transfer(xfr, env, spec);
				return 1;
			}
			/* task transfer already in progress */
			return 0;
		}

		/* pick up the probe task ourselves */
		xfr->task_probe->worker = env->worker;
		xfr->task_probe->env = env;
		xfr->task_probe->cp = NULL;

		/* start the task */
		/* have not seen a new lease yet, this scan */
		xfr->task_probe->have_new_lease = 0;
		/* if this was a timeout, no specific first master to scan */
		/* otherwise, spec is nonNULL the notified master, scan
		 * first and also transfer first from it */
		xfr_probe_start_list(xfr, spec);
		/* setup to start the lookup of hostnames of masters afresh */
		xfr_probe_start_lookups(xfr);
		/* send the probe packet or next send, or end task */
		xfr_probe_send_or_end(xfr, env);
		return 1;
	}
	return 0;
}

/** for task_nextprobe.
 * determine next timeout for auth_xfer. Also (re)sets timer.
 * The failure backoff starts at 3, doubles on each further failure and is
 * capped at AUTH_TRANSFER_MAX_BACKOFF; a non-failure call resets it to 0.
 * On timer allocation failure an error is logged and no timer is set.
 * @param xfr: task structure
 * @param env: module environment, with worker and time.
 * @param failure: set true if timer should be set for failure retry.
 * @param lookup_only: only perform lookups when timer done, 0 sec timeout
 */
static void
xfr_set_timeout(struct auth_xfer* xfr, struct module_env* env,
	int failure, int lookup_only)
{
	struct timeval tv;
	log_assert(xfr->task_nextprobe != NULL);
	log_assert(xfr->task_nextprobe->worker == NULL ||
		xfr->task_nextprobe->worker == env->worker);
	/* normally, nextprobe = startoflease + refresh,
	 * but if expiry is sooner, use that one.
	 * after a failure, use the retry timer instead. */
	xfr->task_nextprobe->next_probe = *env->now;
	if(xfr->lease_time && !failure)
		xfr->task_nextprobe->next_probe = xfr->lease_time;

	if(!failure) {
		xfr->task_nextprobe->backoff = 0;
	} else {
		if(xfr->task_nextprobe->backoff == 0)
				xfr->task_nextprobe->backoff = 3;
		else	xfr->task_nextprobe->backoff *= 2;
		if(xfr->task_nextprobe->backoff > AUTH_TRANSFER_MAX_BACKOFF)
			xfr->task_nextprobe->backoff =
				AUTH_TRANSFER_MAX_BACKOFF;
	}

	if(xfr->have_zone) {
		time_t wait = xfr->refresh;
		if(failure) wait = xfr->retry;
		if(xfr->expiry < wait)
			xfr->task_nextprobe->next_probe += xfr->expiry;
		else	xfr->task_nextprobe->next_probe += wait;
		if(failure)
			xfr->task_nextprobe->next_probe +=
				xfr->task_nextprobe->backoff;
		/* put the timer exactly on expiry, if possible */
		if(xfr->lease_time && xfr->lease_time+xfr->expiry <
			xfr->task_nextprobe->next_probe &&
			xfr->lease_time+xfr->expiry > *env->now)
			xfr->task_nextprobe->next_probe =
				xfr->lease_time+xfr->expiry;
	} else {
		/* no zone data yet: only the backoff delays the next try */
		xfr->task_nextprobe->next_probe +=
			xfr->task_nextprobe->backoff;
	}

	if(!xfr->task_nextprobe->timer) {
		xfr->task_nextprobe->timer = comm_timer_create(
			env->worker_base, auth_xfer_timer, xfr);
		if(!xfr->task_nextprobe->timer) {
			/* failed to malloc memory. likely zone transfer
			 * also fails for that. skip the timeout */
			char zname[255+1];
			dname_str(xfr->name, zname);
			log_err("cannot allocate timer, no refresh for %s",
				zname);
			return;
		}
	}
	xfr->task_nextprobe->worker = env->worker;
	xfr->task_nextprobe->env = env;
	/* relative wait in seconds, zero when next_probe is in the past */
	if(*(xfr->task_nextprobe->env->now) <= xfr->task_nextprobe->next_probe)
		tv.tv_sec = xfr->task_nextprobe->next_probe -
			*(xfr->task_nextprobe->env->now);
	else	tv.tv_sec = 0;
	if(tv.tv_sec != 0 && lookup_only && xfr->task_probe->masters) {
		/* don't lookup_only, if lookup timeout is 0 anyway,
		 * or if we don't have masters to lookup */
		tv.tv_sec = 0;
		if(xfr->task_probe->worker == NULL)
			xfr->task_probe->only_lookup = 1;
	}
	if(verbosity >= VERB_ALGO) {
		char zname[255+1];
		dname_str(xfr->name, zname);
		verbose(VERB_ALGO, "auth zone %s timeout in %d seconds",
			zname, (int)tv.tv_sec);
	}
	tv.tv_usec = 0;
	comm_timer_set(xfr->task_nextprobe->timer, &tv);
}

/** initial pick up of worker timeouts, ties events to worker event loop.
 * Holds az->lock for writing while walking az->xtree, and each xfr lock
 * while that entry is handled.
 * @param az: auth zones structure.
 * @param env: module environment of the worker, with time. */
void
auth_xfer_pickup_initial(struct auth_zones* az, struct module_env* env)
{
	struct auth_xfer* x;
	lock_rw_wrlock(&az->lock);
	RBTREE_FOR(x, struct auth_xfer*, &az->xtree) {
		lock_basic_lock(&x->lock);
		/* set lease_time, because we now have timestamp in env,
		 * (not earlier during startup and apply_cfg), and this
		 * notes the start time when the data was acquired */
		if(x->have_zone)
			x->lease_time = *env->now;
		if(x->task_nextprobe && x->task_nextprobe->worker == NULL) {
			xfr_set_timeout(x, env, 0, 1);
		}
		lock_basic_unlock(&x->lock);
	}
	lock_rw_unlock(&az->lock);
}

/** disown every task that currently has a worker, for all transfers in
 * az->xtree; chunks collected by a transfer task are deleted first.
 * @param az: auth zones structure. */
void auth_zones_cleanup(struct auth_zones* az)
{
	struct auth_xfer* x;
	lock_rw_wrlock(&az->lock);
	RBTREE_FOR(x, struct auth_xfer*, &az->xtree) {
		lock_basic_lock(&x->lock);
		if(x->task_nextprobe && x->task_nextprobe->worker != NULL) {
			xfr_nextprobe_disown(x);
		}
		if(x->task_probe && x->task_probe->worker != NULL) {
			xfr_probe_disown(x);
		}
		if(x->task_transfer && x->task_transfer->worker != NULL) {
			auth_chunks_delete(x->task_transfer);
			xfr_transfer_disown(x);
		}
		lock_basic_unlock(&x->lock);
	}
	lock_rw_unlock(&az->lock);
}

/**
 * malloc the xfer and tasks
 * @param z: auth_zone with name of zone.
 * @return the new structure with its lock initialised and held, or NULL on
 *	allocation failure (everything allocated so far is freed).
 */
static struct auth_xfer*
auth_xfer_new(struct auth_zone* z)
{
	struct auth_xfer* xfr;
	xfr = (struct auth_xfer*)calloc(1, sizeof(*xfr));
	if(!xfr) return NULL;
	xfr->name = memdup(z->name, z->namelen);
	if(!xfr->name) {
		free(xfr);
		return NULL;
	}
	xfr->node.key = xfr;
	xfr->namelen = z->namelen;
	xfr->namelabs = z->namelabs;
	xfr->dclass = z->dclass;

	xfr->task_nextprobe = (struct auth_nextprobe*)calloc(1,
		sizeof(struct auth_nextprobe));
	if(!xfr->task_nextprobe) {
		free(xfr->name);
		free(xfr);
		return NULL;
	}
	xfr->task_probe = (struct auth_probe*)calloc(1,
		sizeof(struct auth_probe));
	if(!xfr->task_probe) {
		free(xfr->task_nextprobe);
		free(xfr->name);
		free(xfr);
		return NULL;
	}
	xfr->task_transfer = (struct auth_transfer*)calloc(1,
		sizeof(struct auth_transfer));
	if(!xfr->task_transfer) {
		free(xfr->task_probe);
		free(xfr->task_nextprobe);
		free(xfr->name);
		free(xfr);
		return NULL;
	}

	lock_basic_init(&xfr->lock);
	/* register the fields that are guarded by xfr->lock */
	lock_protect(&xfr->lock, &xfr->name, sizeof(xfr->name));
	lock_protect(&xfr->lock, &xfr->namelen, sizeof(xfr->namelen));
	lock_protect(&xfr->lock, xfr->name, xfr->namelen);
	lock_protect(&xfr->lock, &xfr->namelabs, sizeof(xfr->namelabs));
	lock_protect(&xfr->lock, &xfr->dclass, sizeof(xfr->dclass));
	lock_protect(&xfr->lock, &xfr->notify_received, sizeof(xfr->notify_received));
	lock_protect(&xfr->lock, &xfr->notify_serial, sizeof(xfr->notify_serial));
	lock_protect(&xfr->lock, &xfr->zone_expired, sizeof(xfr->zone_expired));
	lock_protect(&xfr->lock, &xfr->have_zone, sizeof(xfr->have_zone));
	lock_protect(&xfr->lock, &xfr->serial, sizeof(xfr->serial));
	lock_protect(&xfr->lock, &xfr->retry, sizeof(xfr->retry));
	lock_protect(&xfr->lock, &xfr->refresh, sizeof(xfr->refresh));
	lock_protect(&xfr->lock, &xfr->expiry, sizeof(xfr->expiry));
	lock_protect(&xfr->lock, &xfr->lease_time, sizeof(xfr->lease_time));
	lock_protect(&xfr->lock, &xfr->task_nextprobe->worker,
		sizeof(xfr->task_nextprobe->worker));
	lock_protect(&xfr->lock, &xfr->task_probe->worker,
		sizeof(xfr->task_probe->worker));
	lock_protect(&xfr->lock, &xfr->task_transfer->worker,
		sizeof(xfr->task_transfer->worker));
	/* hand the structure back locked */
	lock_basic_lock(&xfr->lock);
	return xfr;
}

/** Create auth_xfer structure.
 * Allocates the structure for zone z and inserts it into az->xtree.
 * The returned structure is locked (see auth_xfer_new).
 * NOTE(review): have_zone, the SOA values and the timeout are not set
 * here; presumably the caller fills those in - confirm.
 * @param az: auth zones structure that holds the tree.
 * @param z: the zone to create the transfer structure for.
 * @return the new structure, or NULL on alloc failure. */
struct auth_xfer*
auth_xfer_create(struct auth_zones* az, struct auth_zone* z)
{
	struct auth_xfer* xfr;

	/* malloc it */
	xfr = auth_xfer_new(z);
	if(!xfr) {
		log_err("malloc failure");
		return NULL;
	}
	/* insert in tree */
	(void)rbtree_insert(&az->xtree, &xfr->node);
	return xfr;
}

/** create new auth_master structure.
 * @param list: address of the pointer to the slot where the new element is
 *	stored; updated to point at the new element's next pointer.
 * @return the zeroed new element, or NULL on malloc failure (logged, list
 *	untouched). */
static struct auth_master*
auth_master_new(struct auth_master*** list)
{
	struct auth_master *m;
	m = (struct auth_master*)calloc(1, sizeof(*m));
	if(!m) {
		log_err("malloc failure");
		return NULL;
	}
	/* set first pointer to m, or next pointer of previous element to m */
	(**list) = m;
	/* store m's next pointer as future point to store at */
	(*list) = &(m->next);
	return m;
}

/** dup_prefix : create string from initial part of other string, malloced.
 * At most num characters are copied and the result is NUL terminated.
 * Returns NULL (after logging) on malloc failure. */
static char*
dup_prefix(char* str, size_t num)
{
	char* result;
	size_t len = strlen(str);
	if(len < num) num = len; /* not more than strlen */
	result = (char*)malloc(num+1);
	if(!result) {
		log_err("malloc failure");
		return result;
	}
	memmove(result, str, num);
	result[num] = 0;
	return result;
}

/** dup string and print error on error */
static char*
dup_all(char* str)
{
	char* result = strdup(str);
	if(!result) {
		log_err("malloc failure");
		return NULL;
	}
	return result;
}

/** find first of two characters.
 * Returns the earliest occurrence of a or b in s, or NULL if neither
 * character occurs. */
static char*
str_find_first_of_chars(char* s, char a, char b)
{
	char* ra = strchr(s, a);
	char* rb = strchr(s, b);
	if(!ra) return rb;
	if(!rb) return ra;
	if(ra < rb) return ra;
	return rb;
}

/** parse
URL into host and file parts, false on malloc or parse error */ static int parse_url(char* url, char** host, char** file, int* port, int* ssl) { char* p = url; /* parse http://www.example.com/file.htm * or http://127.0.0.1 (index.html) * or https://[::1@1234]/a/b/c/d */ *ssl = 1; *port = AUTH_HTTPS_PORT; /* parse http:// or https:// */ if(strncmp(p, "http://", 7) == 0) { p += 7; *ssl = 0; *port = AUTH_HTTP_PORT; } else if(strncmp(p, "https://", 8) == 0) { p += 8; } else if(strstr(p, "://") && strchr(p, '/') > strstr(p, "://") && strchr(p, ':') >= strstr(p, "://")) { char* uri = dup_prefix(p, (size_t)(strstr(p, "://")-p)); log_err("protocol %s:// not supported (for url %s)", uri?uri:"", p); free(uri); return 0; } /* parse hostname part */ if(p[0] == '[') { char* end = strchr(p, ']'); p++; /* skip over [ */ if(end) { *host = dup_prefix(p, (size_t)(end-p)); if(!*host) return 0; p = end+1; /* skip over ] */ } else { *host = dup_all(p); if(!*host) return 0; p = end; } } else { char* end = str_find_first_of_chars(p, ':', '/'); if(end) { *host = dup_prefix(p, (size_t)(end-p)); if(!*host) return 0; } else { *host = dup_all(p); if(!*host) return 0; } p = end; /* at next : or / or NULL */ } /* parse port number */ if(p && p[0] == ':') { char* end = NULL; *port = strtol(p+1, &end, 10); p = end; } /* parse filename part */ while(p && *p == '/') p++; if(!p || p[0] == 0) *file = strdup("index.html"); else *file = strdup(p); if(!*file) { log_err("malloc failure"); return 0; } return 1; } int xfer_set_masters(struct auth_master** list, struct config_auth* c, int with_http) { struct auth_master* m; struct config_strlist* p; /* list points to the first, or next pointer for the new element */ while(*list) { list = &( (*list)->next ); } if(with_http) for(p = c->urls; p; p = p->next) { m = auth_master_new(&list); m->http = 1; if(!parse_url(p->str, &m->host, &m->file, &m->port, &m->ssl)) return 0; } for(p = c->masters; p; p = p->next) { m = auth_master_new(&list); m->ixfr = 1; /* this 
flag is not configurable */ m->host = strdup(p->str); if(!m->host) { log_err("malloc failure"); return 0; } } for(p = c->allow_notify; p; p = p->next) { m = auth_master_new(&list); m->allow_notify = 1; m->host = strdup(p->str); if(!m->host) { log_err("malloc failure"); return 0; } } return 1; } #define SERIAL_BITS 32 int compare_serial(uint32_t a, uint32_t b) { const uint32_t cutoff = ((uint32_t) 1 << (SERIAL_BITS - 1)); if (a == b) { return 0; } else if ((a < b && b - a < cutoff) || (a > b && a - b > cutoff)) { return -1; } else { return 1; } } Index: head/contrib/unbound/services/outside_network.c =================================================================== --- head/contrib/unbound/services/outside_network.c (revision 368750) +++ head/contrib/unbound/services/outside_network.c (revision 368751) @@ -1,3437 +1,3430 @@ /* * services/outside_network.c - implement sending of queries and wait answer. * * Copyright (c) 2007, NLnet Labs. All rights reserved. * * This software is open source. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of the NLNET LABS nor the names of its contributors may * be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /** * \file * * This file has functions to send queries to authoritative servers and * wait for the pending answer events. */ #include "config.h" #include #ifdef HAVE_SYS_TYPES_H # include #endif #include #include "services/outside_network.h" #include "services/listen_dnsport.h" #include "services/cache/infra.h" #include "iterator/iterator.h" #include "util/data/msgparse.h" #include "util/data/msgreply.h" #include "util/data/msgencode.h" #include "util/data/dname.h" #include "util/netevent.h" #include "util/log.h" #include "util/net_help.h" #include "util/random.h" #include "util/fptr_wlist.h" #include "util/edns.h" #include "sldns/sbuffer.h" #include "dnstap/dnstap.h" #ifdef HAVE_OPENSSL_SSL_H #include #endif #ifdef HAVE_X509_VERIFY_PARAM_SET1_HOST #include #endif #ifdef HAVE_NETDB_H #include #endif #include /** number of times to retry making a random ID that is unique. */ #define MAX_ID_RETRY 1000 /** number of times to retry finding interface, port that can be opened. 
*/ #define MAX_PORT_RETRY 10000 /** number of retries on outgoing UDP queries */ #define OUTBOUND_UDP_RETRY 1 /** initiate TCP transaction for serviced query */ static void serviced_tcp_initiate(struct serviced_query* sq, sldns_buffer* buff); /** with a fd available, randomize and send UDP */ static int randomize_and_send_udp(struct pending* pend, sldns_buffer* packet, int timeout); /** remove waiting tcp from the outnet waiting list */ static void waiting_list_remove(struct outside_network* outnet, struct waiting_tcp* w); int pending_cmp(const void* key1, const void* key2) { struct pending *p1 = (struct pending*)key1; struct pending *p2 = (struct pending*)key2; if(p1->id < p2->id) return -1; if(p1->id > p2->id) return 1; log_assert(p1->id == p2->id); return sockaddr_cmp(&p1->addr, p1->addrlen, &p2->addr, p2->addrlen); } int serviced_cmp(const void* key1, const void* key2) { struct serviced_query* q1 = (struct serviced_query*)key1; struct serviced_query* q2 = (struct serviced_query*)key2; int r; if(q1->qbuflen < q2->qbuflen) return -1; if(q1->qbuflen > q2->qbuflen) return 1; log_assert(q1->qbuflen == q2->qbuflen); log_assert(q1->qbuflen >= 15 /* 10 header, root, type, class */); /* alternate casing of qname is still the same query */ if((r = memcmp(q1->qbuf, q2->qbuf, 10)) != 0) return r; if((r = memcmp(q1->qbuf+q1->qbuflen-4, q2->qbuf+q2->qbuflen-4, 4)) != 0) return r; if(q1->dnssec != q2->dnssec) { if(q1->dnssec < q2->dnssec) return -1; return 1; } if((r = query_dname_compare(q1->qbuf+10, q2->qbuf+10)) != 0) return r; if((r = edns_opt_list_compare(q1->opt_list, q2->opt_list)) != 0) return r; return sockaddr_cmp(&q1->addr, q1->addrlen, &q2->addr, q2->addrlen); } /** compare if the reuse element has the same address, port and same ssl-is * used-for-it characteristic */ static int reuse_cmp_addrportssl(const void* key1, const void* key2) { struct reuse_tcp* r1 = (struct reuse_tcp*)key1; struct reuse_tcp* r2 = (struct reuse_tcp*)key2; int r; /* compare address and 
port */ r = sockaddr_cmp(&r1->addr, r1->addrlen, &r2->addr, r2->addrlen); if(r != 0) return r; /* compare if SSL-enabled */ if(r1->is_ssl && !r2->is_ssl) return 1; if(!r1->is_ssl && r2->is_ssl) return -1; return 0; } int reuse_cmp(const void* key1, const void* key2) { int r; r = reuse_cmp_addrportssl(key1, key2); if(r != 0) return r; /* compare ptr value */ if(key1 < key2) return -1; if(key1 > key2) return 1; return 0; } int reuse_id_cmp(const void* key1, const void* key2) { struct waiting_tcp* w1 = (struct waiting_tcp*)key1; struct waiting_tcp* w2 = (struct waiting_tcp*)key2; if(w1->id < w2->id) return -1; if(w1->id > w2->id) return 1; return 0; } /** delete waiting_tcp entry. Does not unlink from waiting list. * @param w: to delete. */ static void waiting_tcp_delete(struct waiting_tcp* w) { if(!w) return; if(w->timer) comm_timer_delete(w->timer); free(w); } /** * Pick random outgoing-interface of that family, and bind it. * port set to 0 so OS picks a port number for us. * if it is the ANY address, do not bind. * @param w: tcp structure with destination address. * @param s: socket fd. * @return false on error, socket closed. 
*/ static int pick_outgoing_tcp(struct waiting_tcp* w, int s) { struct port_if* pi = NULL; int num; #ifdef INET6 if(addr_is_ip6(&w->addr, w->addrlen)) num = w->outnet->num_ip6; else #endif num = w->outnet->num_ip4; if(num == 0) { log_err("no TCP outgoing interfaces of family"); log_addr(VERB_OPS, "for addr", &w->addr, w->addrlen); sock_close(s); return 0; } #ifdef INET6 if(addr_is_ip6(&w->addr, w->addrlen)) pi = &w->outnet->ip6_ifs[ub_random_max(w->outnet->rnd, num)]; else #endif pi = &w->outnet->ip4_ifs[ub_random_max(w->outnet->rnd, num)]; log_assert(pi); if(addr_is_any(&pi->addr, pi->addrlen)) { /* binding to the ANY interface is for listening sockets */ return 1; } /* set port to 0 */ if(addr_is_ip6(&pi->addr, pi->addrlen)) ((struct sockaddr_in6*)&pi->addr)->sin6_port = 0; else ((struct sockaddr_in*)&pi->addr)->sin_port = 0; if(bind(s, (struct sockaddr*)&pi->addr, pi->addrlen) != 0) { log_err("outgoing tcp: bind: %s", sock_strerror(errno)); sock_close(s); return 0; } log_addr(VERB_ALGO, "tcp bound to src", &pi->addr, pi->addrlen); return 1; } /** get TCP file descriptor for address, returns -1 on failure, * tcp_mss is 0 or maxseg size to set for TCP packets. */ int outnet_get_tcp_fd(struct sockaddr_storage* addr, socklen_t addrlen, int tcp_mss, int dscp) { int s; int af; char* err; #ifdef SO_REUSEADDR int on = 1; #endif #ifdef INET6 if(addr_is_ip6(addr, addrlen)){ s = socket(PF_INET6, SOCK_STREAM, IPPROTO_TCP); af = AF_INET6; } else { #else { #endif af = AF_INET; s = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); } if(s == -1) { log_err_addr("outgoing tcp: socket", sock_strerror(errno), addr, addrlen); return -1; } #ifdef SO_REUSEADDR if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on, (socklen_t)sizeof(on)) < 0) { verbose(VERB_ALGO, "outgoing tcp:" " setsockopt(.. SO_REUSEADDR ..) 
failed"); } #endif err = set_ip_dscp(s, af, dscp); if(err != NULL) { verbose(VERB_ALGO, "outgoing tcp:" "error setting IP DiffServ codepoint on socket"); } if(tcp_mss > 0) { #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG) if(setsockopt(s, IPPROTO_TCP, TCP_MAXSEG, (void*)&tcp_mss, (socklen_t)sizeof(tcp_mss)) < 0) { verbose(VERB_ALGO, "outgoing tcp:" " setsockopt(.. TCP_MAXSEG ..) failed"); } #else verbose(VERB_ALGO, "outgoing tcp:" " setsockopt(TCP_MAXSEG) unsupported"); #endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */ } return s; } /** connect tcp connection to addr, 0 on failure */ int outnet_tcp_connect(int s, struct sockaddr_storage* addr, socklen_t addrlen) { if(connect(s, (struct sockaddr*)addr, addrlen) == -1) { #ifndef USE_WINSOCK #ifdef EINPROGRESS if(errno != EINPROGRESS) { #endif if(tcp_connect_errno_needs_log( (struct sockaddr*)addr, addrlen)) log_err_addr("outgoing tcp: connect", strerror(errno), addr, addrlen); close(s); return 0; #ifdef EINPROGRESS } #endif #else /* USE_WINSOCK */ if(WSAGetLastError() != WSAEINPROGRESS && WSAGetLastError() != WSAEWOULDBLOCK) { closesocket(s); return 0; } #endif } return 1; } /** log reuse item addr and ptr with message */ static void log_reuse_tcp(enum verbosity_value v, const char* msg, struct reuse_tcp* reuse) { uint16_t port; char addrbuf[128]; if(verbosity < v) return; addr_to_str(&reuse->addr, reuse->addrlen, addrbuf, sizeof(addrbuf)); port = ntohs(((struct sockaddr_in*)&reuse->addr)->sin_port); verbose(v, "%s %s#%u fd %d", msg, addrbuf, (unsigned)port, reuse->pending->c->fd); } /** pop the first element from the writewait list */ static struct waiting_tcp* reuse_write_wait_pop(struct reuse_tcp* reuse) { struct waiting_tcp* w = reuse->write_wait_first; if(!w) return NULL; log_assert(w->write_wait_queued); log_assert(!w->write_wait_prev); reuse->write_wait_first = w->write_wait_next; if(w->write_wait_next) w->write_wait_next->write_wait_prev = NULL; else reuse->write_wait_last = NULL; w->write_wait_queued 
= 0; return w; } /** remove the element from the writewait list */ static void reuse_write_wait_remove(struct reuse_tcp* reuse, struct waiting_tcp* w) { if(!w) return; if(!w->write_wait_queued) return; if(w->write_wait_prev) w->write_wait_prev->write_wait_next = w->write_wait_next; else reuse->write_wait_first = w->write_wait_next; if(w->write_wait_next) w->write_wait_next->write_wait_prev = w->write_wait_prev; else reuse->write_wait_last = w->write_wait_prev; w->write_wait_queued = 0; } /** push the element after the last on the writewait list */ static void reuse_write_wait_push_back(struct reuse_tcp* reuse, struct waiting_tcp* w) { if(!w) return; log_assert(!w->write_wait_queued); if(reuse->write_wait_last) { reuse->write_wait_last->write_wait_next = w; w->write_wait_prev = reuse->write_wait_last; } else { reuse->write_wait_first = w; } reuse->write_wait_last = w; w->write_wait_queued = 1; } /** insert element in tree by id */ void reuse_tree_by_id_insert(struct reuse_tcp* reuse, struct waiting_tcp* w) { log_assert(w->id_node.key == NULL); w->id_node.key = w; rbtree_insert(&reuse->tree_by_id, &w->id_node); } /** find element in tree by id */ struct waiting_tcp* reuse_tcp_by_id_find(struct reuse_tcp* reuse, uint16_t id) { struct waiting_tcp key_w; rbnode_type* n; memset(&key_w, 0, sizeof(key_w)); key_w.id_node.key = &key_w; key_w.id = id; n = rbtree_search(&reuse->tree_by_id, &key_w); if(!n) return NULL; return (struct waiting_tcp*)n->key; } /** return ID value of rbnode in tree_by_id */ static uint16_t tree_by_id_get_id(rbnode_type* node) { struct waiting_tcp* w = (struct waiting_tcp*)node->key; return w->id; } /** insert into reuse tcp tree and LRU, false on failure (duplicate) */ static int reuse_tcp_insert(struct outside_network* outnet, struct pending_tcp* pend_tcp) { log_reuse_tcp(VERB_CLIENT, "reuse_tcp_insert", &pend_tcp->reuse); if(pend_tcp->reuse.item_on_lru_list) return 1; pend_tcp->reuse.node.key = &pend_tcp->reuse; pend_tcp->reuse.pending = pend_tcp; 
if(!rbtree_insert(&outnet->tcp_reuse, &pend_tcp->reuse.node)) { /* this is a duplicate connection, close this one */ verbose(VERB_CLIENT, "reuse_tcp_insert: duplicate connection"); pend_tcp->reuse.node.key = NULL; return 0; } /* insert into LRU, first is newest */ pend_tcp->reuse.lru_prev = NULL; if(outnet->tcp_reuse_first) { pend_tcp->reuse.lru_next = outnet->tcp_reuse_first; outnet->tcp_reuse_first->lru_prev = &pend_tcp->reuse; } else { pend_tcp->reuse.lru_next = NULL; outnet->tcp_reuse_last = &pend_tcp->reuse; } outnet->tcp_reuse_first = &pend_tcp->reuse; pend_tcp->reuse.item_on_lru_list = 1; return 1; } /** find reuse tcp stream to destination for query, or NULL if none */ static struct reuse_tcp* reuse_tcp_find(struct outside_network* outnet, struct sockaddr_storage* addr, socklen_t addrlen, int use_ssl) { struct waiting_tcp key_w; struct pending_tcp key_p; struct comm_point c; rbnode_type* result = NULL, *prev; verbose(VERB_CLIENT, "reuse_tcp_find"); memset(&key_w, 0, sizeof(key_w)); memset(&key_p, 0, sizeof(key_p)); memset(&c, 0, sizeof(c)); key_p.query = &key_w; key_p.c = &c; key_p.reuse.pending = &key_p; key_p.reuse.node.key = &key_p.reuse; if(use_ssl) key_p.reuse.is_ssl = 1; if(addrlen > (socklen_t)sizeof(key_p.reuse.addr)) return NULL; memmove(&key_p.reuse.addr, addr, addrlen); key_p.reuse.addrlen = addrlen; verbose(VERB_CLIENT, "reuse_tcp_find: num reuse streams %u", (unsigned)outnet->tcp_reuse.count); if(outnet->tcp_reuse.root == NULL || outnet->tcp_reuse.root == RBTREE_NULL) return NULL; if(rbtree_find_less_equal(&outnet->tcp_reuse, &key_p.reuse.node, &result)) { /* exact match */ /* but the key is on stack, and ptr is compared, impossible */ log_assert(&key_p.reuse != (struct reuse_tcp*)result); log_assert(&key_p != ((struct reuse_tcp*)result)->pending); } /* not found, return null */ if(!result || result == RBTREE_NULL) return NULL; verbose(VERB_CLIENT, "reuse_tcp_find check inexact match"); /* inexact match, find one of possibly several connections 
to the * same destination address, with the correct port, ssl, and * also less than max number of open queries, or else, fail to open * a new one */ /* rewind to start of sequence of same address,port,ssl */ prev = rbtree_previous(result); while(prev && prev != RBTREE_NULL && reuse_cmp_addrportssl(prev->key, &key_p.reuse) == 0) { result = prev; prev = rbtree_previous(result); } /* loop to find first one that has correct characteristics */ while(result && result != RBTREE_NULL && reuse_cmp_addrportssl(result->key, &key_p.reuse) == 0) { if(((struct reuse_tcp*)result)->tree_by_id.count < MAX_REUSE_TCP_QUERIES) { /* same address, port, ssl-yes-or-no, and has * space for another query */ return (struct reuse_tcp*)result; } result = rbtree_next(result); } return NULL; } /** use the buffer to setup writing the query */ static void outnet_tcp_take_query_setup(int s, struct pending_tcp* pend, struct waiting_tcp* w) { struct timeval tv; verbose(VERB_CLIENT, "outnet_tcp_take_query_setup: setup packet to write " "len %d timeout %d msec", (int)w->pkt_len, w->timeout); pend->c->tcp_write_pkt = w->pkt; pend->c->tcp_write_pkt_len = w->pkt_len; pend->c->tcp_write_and_read = 1; pend->c->tcp_write_byte_count = 0; pend->c->tcp_is_reading = 0; comm_point_start_listening(pend->c, s, -1); /* set timer on the waiting_tcp entry, this is the write timeout * for the written packet. 
The timer on pend->c is the timer * for when there is no written packet and we have readtimeouts */ #ifndef S_SPLINT_S tv.tv_sec = w->timeout/1000; tv.tv_usec = (w->timeout%1000)*1000; #endif /* if the waiting_tcp was previously waiting for a buffer in the * outside_network.tcpwaitlist, then the timer is reset now that * we start writing it */ comm_timer_set(w->timer, &tv); } /** use next free buffer to service a tcp query */ static int outnet_tcp_take_into_use(struct waiting_tcp* w) { struct pending_tcp* pend = w->outnet->tcp_free; int s; log_assert(pend); log_assert(w->pkt); log_assert(w->pkt_len > 0); log_assert(w->addrlen > 0); pend->c->tcp_do_toggle_rw = 0; pend->c->tcp_do_close = 0; /* open socket */ s = outnet_get_tcp_fd(&w->addr, w->addrlen, w->outnet->tcp_mss, w->outnet->ip_dscp); if(s == -1) return 0; if(!pick_outgoing_tcp(w, s)) return 0; fd_set_nonblock(s); #ifdef USE_OSX_MSG_FASTOPEN /* API for fast open is different here. We use a connectx() function and then writes can happen as normal even using SSL.*/ /* connectx requires that the len be set in the sockaddr struct*/ struct sockaddr_in *addr_in = (struct sockaddr_in *)&w->addr; addr_in->sin_len = w->addrlen; sa_endpoints_t endpoints; endpoints.sae_srcif = 0; endpoints.sae_srcaddr = NULL; endpoints.sae_srcaddrlen = 0; endpoints.sae_dstaddr = (struct sockaddr *)&w->addr; endpoints.sae_dstaddrlen = w->addrlen; if (connectx(s, &endpoints, SAE_ASSOCID_ANY, CONNECT_DATA_IDEMPOTENT | CONNECT_RESUME_ON_READ_WRITE, NULL, 0, NULL, NULL) == -1) { /* if fails, failover to connect for OSX 10.10 */ #ifdef EINPROGRESS if(errno != EINPROGRESS) { #else if(1) { #endif if(connect(s, (struct sockaddr*)&w->addr, w->addrlen) == -1) { #else /* USE_OSX_MSG_FASTOPEN*/ #ifdef USE_MSG_FASTOPEN pend->c->tcp_do_fastopen = 1; /* Only do TFO for TCP in which case no connect() is required here. 
Don't combine client TFO with SSL, since OpenSSL can't currently support doing a handshake on fd that already isn't connected*/ if (w->outnet->sslctx && w->ssl_upstream) { if(connect(s, (struct sockaddr*)&w->addr, w->addrlen) == -1) { #else /* USE_MSG_FASTOPEN*/ if(connect(s, (struct sockaddr*)&w->addr, w->addrlen) == -1) { #endif /* USE_MSG_FASTOPEN*/ #endif /* USE_OSX_MSG_FASTOPEN*/ #ifndef USE_WINSOCK #ifdef EINPROGRESS if(errno != EINPROGRESS) { #else if(1) { #endif if(tcp_connect_errno_needs_log( (struct sockaddr*)&w->addr, w->addrlen)) log_err_addr("outgoing tcp: connect", strerror(errno), &w->addr, w->addrlen); close(s); #else /* USE_WINSOCK */ if(WSAGetLastError() != WSAEINPROGRESS && WSAGetLastError() != WSAEWOULDBLOCK) { closesocket(s); #endif return 0; } } #ifdef USE_MSG_FASTOPEN } #endif /* USE_MSG_FASTOPEN */ #ifdef USE_OSX_MSG_FASTOPEN } } #endif /* USE_OSX_MSG_FASTOPEN */ if(w->outnet->sslctx && w->ssl_upstream) { pend->c->ssl = outgoing_ssl_fd(w->outnet->sslctx, s); if(!pend->c->ssl) { pend->c->fd = s; comm_point_close(pend->c); return 0; } verbose(VERB_ALGO, "the query is using TLS encryption, for %s", (w->tls_auth_name?w->tls_auth_name:"an unauthenticated connection")); #ifdef USE_WINSOCK comm_point_tcp_win_bio_cb(pend->c, pend->c->ssl); #endif pend->c->ssl_shake_state = comm_ssl_shake_write; if(!set_auth_name_on_ssl(pend->c->ssl, w->tls_auth_name, w->outnet->tls_use_sni)) { pend->c->fd = s; #ifdef HAVE_SSL SSL_free(pend->c->ssl); #endif pend->c->ssl = NULL; comm_point_close(pend->c); return 0; } } w->next_waiting = (void*)pend; w->outnet->num_tcp_outgoing++; w->outnet->tcp_free = pend->next_free; pend->next_free = NULL; pend->query = w; pend->reuse.outnet = w->outnet; pend->c->repinfo.addrlen = w->addrlen; pend->c->tcp_more_read_again = &pend->reuse.cp_more_read_again; pend->c->tcp_more_write_again = &pend->reuse.cp_more_write_again; pend->reuse.cp_more_read_again = 0; pend->reuse.cp_more_write_again = 0; memcpy(&pend->c->repinfo.addr, &w->addr, 
w->addrlen); pend->reuse.pending = pend; if(pend->c->ssl) pend->reuse.is_ssl = 1; else pend->reuse.is_ssl = 0; /* insert in reuse by address tree if not already inserted there */ (void)reuse_tcp_insert(w->outnet, pend); reuse_tree_by_id_insert(&pend->reuse, w); outnet_tcp_take_query_setup(s, pend, w); return 1; } /** Touch the lru of a reuse_tcp element, it is in use. * This moves it to the front of the list, where it is not likely to * be closed. Items at the back of the list are closed to make space. */ static void reuse_tcp_lru_touch(struct outside_network* outnet, struct reuse_tcp* reuse) { if(!reuse->item_on_lru_list) return; /* not on the list, no lru to modify */ if(!reuse->lru_prev) return; /* already first in the list */ /* remove at current position */ /* since it is not first, there is a previous element */ reuse->lru_prev->lru_next = reuse->lru_next; if(reuse->lru_next) reuse->lru_next->lru_prev = reuse->lru_prev; else outnet->tcp_reuse_last = reuse->lru_prev; /* insert at the front */ reuse->lru_prev = NULL; reuse->lru_next = outnet->tcp_reuse_first; /* since it is not first, it is not the only element and * lru_next is thus not NULL and thus reuse is now not the last in * the list, so outnet->tcp_reuse_last does not need to be modified */ outnet->tcp_reuse_first = reuse; } /** call callback on waiting_tcp, if not NULL */ static void waiting_tcp_callback(struct waiting_tcp* w, struct comm_point* c, int error, struct comm_reply* reply_info) { if(w->cb) { fptr_ok(fptr_whitelist_pending_tcp(w->cb)); (void)(*w->cb)(c, w->cb_arg, error, reply_info); } } /** see if buffers can be used to service TCP queries */ static void use_free_buffer(struct outside_network* outnet) { struct waiting_tcp* w; while(outnet->tcp_free && outnet->tcp_wait_first && !outnet->want_to_quit) { struct reuse_tcp* reuse = NULL; w = outnet->tcp_wait_first; outnet->tcp_wait_first = w->next_waiting; if(outnet->tcp_wait_last == w) outnet->tcp_wait_last = NULL; w->on_tcp_waiting_list = 0; 
reuse = reuse_tcp_find(outnet, &w->addr, w->addrlen, w->ssl_upstream); if(reuse) { log_reuse_tcp(VERB_CLIENT, "use free buffer for waiting tcp: " "found reuse", reuse); reuse_tcp_lru_touch(outnet, reuse); comm_timer_disable(w->timer); w->next_waiting = (void*)reuse->pending; reuse_tree_by_id_insert(reuse, w); if(reuse->pending->query) { /* on the write wait list */ reuse_write_wait_push_back(reuse, w); } else { /* write straight away */ /* stop the timer on read of the fd */ comm_point_stop_listening(reuse->pending->c); reuse->pending->query = w; outnet_tcp_take_query_setup( reuse->pending->c->fd, reuse->pending, w); } } else { struct pending_tcp* pend = w->outnet->tcp_free; rbtree_init(&pend->reuse.tree_by_id, reuse_id_cmp); pend->reuse.pending = pend; memcpy(&pend->reuse.addr, &w->addr, w->addrlen); pend->reuse.addrlen = w->addrlen; if(!outnet_tcp_take_into_use(w)) { waiting_tcp_callback(w, NULL, NETEVENT_CLOSED, NULL); waiting_tcp_delete(w); } } } } /** add waiting_tcp element to the outnet tcp waiting list */ static void outnet_add_tcp_waiting(struct outside_network* outnet, struct waiting_tcp* w) { struct timeval tv; if(w->on_tcp_waiting_list) return; w->next_waiting = NULL; if(outnet->tcp_wait_last) outnet->tcp_wait_last->next_waiting = w; else outnet->tcp_wait_first = w; outnet->tcp_wait_last = w; w->on_tcp_waiting_list = 1; #ifndef S_SPLINT_S tv.tv_sec = w->timeout/1000; tv.tv_usec = (w->timeout%1000)*1000; #endif comm_timer_set(w->timer, &tv); } /** delete element from tree by id */ static void reuse_tree_by_id_delete(struct reuse_tcp* reuse, struct waiting_tcp* w) { log_assert(w->id_node.key != NULL); rbtree_delete(&reuse->tree_by_id, w); w->id_node.key = NULL; } /** move writewait list to go for another connection. */ static void reuse_move_writewait_away(struct outside_network* outnet, struct pending_tcp* pend) { /* the writewait list has not been written yet, so if the * stream was closed, they have not actually been failed, only * the queries written. 
Other queries can get written to another * stream. For upstreams that do not support multiple queries * and answers, the stream can get closed, and then the queries * can get written on a new socket */ struct waiting_tcp* w; if(pend->query && pend->query->error_count == 0 && pend->c->tcp_write_pkt == pend->query->pkt && pend->c->tcp_write_pkt_len == pend->query->pkt_len) { /* since the current query is not written, it can also * move to a free buffer */ if(verbosity >= VERB_CLIENT && pend->query->pkt_len > 12+2+2 && LDNS_QDCOUNT(pend->query->pkt) > 0 && dname_valid(pend->query->pkt+12, pend->query->pkt_len-12)) { char buf[LDNS_MAX_DOMAINLEN+1]; dname_str(pend->query->pkt+12, buf); verbose(VERB_CLIENT, "reuse_move_writewait_away current %s %d bytes were written", buf, (int)pend->c->tcp_write_byte_count); } pend->c->tcp_write_pkt = NULL; pend->c->tcp_write_pkt_len = 0; pend->c->tcp_write_and_read = 0; pend->reuse.cp_more_read_again = 0; pend->reuse.cp_more_write_again = 0; pend->c->tcp_is_reading = 1; w = pend->query; pend->query = NULL; /* increase error count, so that if the next socket fails too * the server selection is run again with this query failed * and it can select a different server (if possible), or * fail the query */ w->error_count ++; reuse_tree_by_id_delete(&pend->reuse, w); outnet_add_tcp_waiting(outnet, w); } while((w = reuse_write_wait_pop(&pend->reuse)) != NULL) { if(verbosity >= VERB_CLIENT && w->pkt_len > 12+2+2 && LDNS_QDCOUNT(w->pkt) > 0 && dname_valid(w->pkt+12, w->pkt_len-12)) { char buf[LDNS_MAX_DOMAINLEN+1]; dname_str(w->pkt+12, buf); verbose(VERB_CLIENT, "reuse_move_writewait_away item %s", buf); } reuse_tree_by_id_delete(&pend->reuse, w); outnet_add_tcp_waiting(outnet, w); } } /** remove reused element from tree and lru list */ static void reuse_tcp_remove_tree_list(struct outside_network* outnet, struct reuse_tcp* reuse) { verbose(VERB_CLIENT, "reuse_tcp_remove_tree_list"); if(reuse->node.key) { /* delete it from reuse tree */ 
(void)rbtree_delete(&outnet->tcp_reuse, &reuse->node); reuse->node.key = NULL; } /* delete from reuse list */ if(reuse->item_on_lru_list) { if(reuse->lru_prev) { /* assert that members of the lru list are waiting * and thus have a pending pointer to the struct */ log_assert(reuse->lru_prev->pending); reuse->lru_prev->lru_next = reuse->lru_next; } else { log_assert(!reuse->lru_next || reuse->lru_next->pending); outnet->tcp_reuse_first = reuse->lru_next; } if(reuse->lru_next) { /* assert that members of the lru list are waiting * and thus have a pending pointer to the struct */ log_assert(reuse->lru_next->pending); reuse->lru_next->lru_prev = reuse->lru_prev; } else { log_assert(!reuse->lru_prev || reuse->lru_prev->pending); outnet->tcp_reuse_last = reuse->lru_prev; } reuse->item_on_lru_list = 0; } } /** helper function that deletes an element from the tree of readwait * elements in tcp reuse structure */ static void reuse_del_readwait_elem(rbnode_type* node, void* ATTR_UNUSED(arg)) { struct waiting_tcp* w = (struct waiting_tcp*)node->key; waiting_tcp_delete(w); } /** delete readwait waiting_tcp elements, deletes the elements in the list */ void reuse_del_readwait(rbtree_type* tree_by_id) { if(tree_by_id->root == NULL || tree_by_id->root == RBTREE_NULL) return; traverse_postorder(tree_by_id, &reuse_del_readwait_elem, NULL); rbtree_init(tree_by_id, reuse_id_cmp); } /** decommission a tcp buffer, closes commpoint and frees waiting_tcp entry */ static void decommission_pending_tcp(struct outside_network* outnet, struct pending_tcp* pend) { verbose(VERB_CLIENT, "decommission_pending_tcp"); pend->next_free = outnet->tcp_free; outnet->tcp_free = pend; if(pend->reuse.node.key) { /* needs unlink from the reuse tree to get deleted */ reuse_tcp_remove_tree_list(outnet, &pend->reuse); } /* free SSL structure after remove from outnet tcp reuse tree, * because the c->ssl null or not is used for sorting in the tree */ if(pend->c->ssl) { #ifdef HAVE_SSL SSL_shutdown(pend->c->ssl); 
SSL_free(pend->c->ssl); pend->c->ssl = NULL; #endif } comm_point_close(pend->c); pend->reuse.cp_more_read_again = 0; pend->reuse.cp_more_write_again = 0; /* unlink the query and writewait list, it is part of the tree * nodes and is deleted */ pend->query = NULL; pend->reuse.write_wait_first = NULL; pend->reuse.write_wait_last = NULL; reuse_del_readwait(&pend->reuse.tree_by_id); } /** perform failure callbacks for waiting queries in reuse read rbtree */ static void reuse_cb_readwait_for_failure(rbtree_type* tree_by_id, int err) { rbnode_type* node; if(tree_by_id->root == NULL || tree_by_id->root == RBTREE_NULL) return; node = rbtree_first(tree_by_id); while(node && node != RBTREE_NULL) { struct waiting_tcp* w = (struct waiting_tcp*)node->key; waiting_tcp_callback(w, NULL, err, NULL); node = rbtree_next(node); } } /** perform callbacks for failure and also decommission pending tcp. * the callbacks remove references in sq->pending to the waiting_tcp * members of the tree_by_id in the pending tcp. 
The pending_tcp is * removed before the callbacks, so that the callbacks do not modify * the pending_tcp due to its reference in the outside_network reuse tree */ static void reuse_cb_and_decommission(struct outside_network* outnet, struct pending_tcp* pend, int error) { rbtree_type store; store = pend->reuse.tree_by_id; pend->query = NULL; rbtree_init(&pend->reuse.tree_by_id, reuse_id_cmp); pend->reuse.write_wait_first = NULL; pend->reuse.write_wait_last = NULL; decommission_pending_tcp(outnet, pend); reuse_cb_readwait_for_failure(&store, error); reuse_del_readwait(&store); } /** set timeout on tcp fd and setup read event to catch incoming dns msgs */ static void reuse_tcp_setup_timeout(struct pending_tcp* pend_tcp) { log_reuse_tcp(VERB_CLIENT, "reuse_tcp_setup_timeout", &pend_tcp->reuse); comm_point_start_listening(pend_tcp->c, -1, REUSE_TIMEOUT); } /** set timeout on tcp fd and setup read event to catch incoming dns msgs */ static void reuse_tcp_setup_read_and_timeout(struct pending_tcp* pend_tcp) { log_reuse_tcp(VERB_CLIENT, "reuse_tcp_setup_readtimeout", &pend_tcp->reuse); sldns_buffer_clear(pend_tcp->c->buffer); pend_tcp->c->tcp_is_reading = 1; pend_tcp->c->tcp_byte_count = 0; comm_point_stop_listening(pend_tcp->c); comm_point_start_listening(pend_tcp->c, -1, REUSE_TIMEOUT); } int outnet_tcp_cb(struct comm_point* c, void* arg, int error, struct comm_reply *reply_info) { struct pending_tcp* pend = (struct pending_tcp*)arg; struct outside_network* outnet = pend->reuse.outnet; struct waiting_tcp* w = NULL; verbose(VERB_ALGO, "outnettcp cb"); if(error == NETEVENT_TIMEOUT) { if(pend->c->tcp_write_and_read) { verbose(VERB_QUERY, "outnettcp got tcp timeout " "for read, ignored because write underway"); /* if we are writing, ignore readtimer, wait for write timer * or write is done */ return 0; } else { verbose(VERB_QUERY, "outnettcp got tcp timeout %s", (pend->reuse.tree_by_id.count?"for reading pkt": "for keepalive for reuse")); } /* must be timeout for reading or 
keepalive reuse, * close it. */ reuse_tcp_remove_tree_list(outnet, &pend->reuse); } else if(error == NETEVENT_PKT_WRITTEN) { /* the packet we want to write has been written. */ verbose(VERB_ALGO, "outnet tcp pkt was written event"); log_assert(c == pend->c); log_assert(pend->query->pkt == pend->c->tcp_write_pkt); log_assert(pend->query->pkt_len == pend->c->tcp_write_pkt_len); pend->c->tcp_write_pkt = NULL; pend->c->tcp_write_pkt_len = 0; /* the pend.query is already in tree_by_id */ log_assert(pend->query->id_node.key); pend->query = NULL; /* setup to write next packet or setup read timeout */ if(pend->reuse.write_wait_first) { verbose(VERB_ALGO, "outnet tcp setup next pkt"); /* we can write it straight away perhaps, set flag * because this callback called after a tcp write * succeeded and likely more buffer space is available * and we can write some more. */ pend->reuse.cp_more_write_again = 1; pend->query = reuse_write_wait_pop(&pend->reuse); comm_point_stop_listening(pend->c); outnet_tcp_take_query_setup(pend->c->fd, pend, pend->query); } else { verbose(VERB_ALGO, "outnet tcp writes done, wait"); pend->c->tcp_write_and_read = 0; pend->reuse.cp_more_read_again = 0; pend->reuse.cp_more_write_again = 0; pend->c->tcp_is_reading = 1; comm_point_stop_listening(pend->c); reuse_tcp_setup_timeout(pend); } return 0; } else if(error != NETEVENT_NOERROR) { verbose(VERB_QUERY, "outnettcp got tcp error %d", error); reuse_move_writewait_away(outnet, pend); /* pass error below and exit */ } else { /* check ID */ if(sldns_buffer_limit(c->buffer) < sizeof(uint16_t)) { log_addr(VERB_QUERY, "outnettcp: bad ID in reply, too short, from:", &pend->reuse.addr, pend->reuse.addrlen); error = NETEVENT_CLOSED; } else { uint16_t id = LDNS_ID_WIRE(sldns_buffer_begin( c->buffer)); /* find the query the reply is for */ w = reuse_tcp_by_id_find(&pend->reuse, id); } } if(error == NETEVENT_NOERROR && !w) { /* no struct waiting found in tree, no reply to call */ log_addr(VERB_QUERY, "outnettcp: 
bad ID in reply, from:", &pend->reuse.addr, pend->reuse.addrlen); error = NETEVENT_CLOSED; } if(error == NETEVENT_NOERROR) { /* add to reuse tree so it can be reused, if not a failure. * This is possible if the state machine wants to make a tcp * query again to the same destination. */ if(outnet->tcp_reuse.count < outnet->tcp_reuse_max) { (void)reuse_tcp_insert(outnet, pend); } } if(w) { reuse_tree_by_id_delete(&pend->reuse, w); verbose(VERB_CLIENT, "outnet tcp callback query err %d buflen %d", error, (int)sldns_buffer_limit(c->buffer)); waiting_tcp_callback(w, c, error, reply_info); waiting_tcp_delete(w); } verbose(VERB_CLIENT, "outnet_tcp_cb reuse after cb"); if(error == NETEVENT_NOERROR && pend->reuse.node.key) { verbose(VERB_CLIENT, "outnet_tcp_cb reuse after cb: keep it"); /* it is in the reuse_tcp tree, with other queries, or * on the empty list. do not decommission it */ /* if there are more outstanding queries, we could try to * read again, to see if it is on the input, * because this callback called after a successful read * and there could be more bytes to read on the input */ if(pend->reuse.tree_by_id.count != 0) pend->reuse.cp_more_read_again = 1; reuse_tcp_setup_read_and_timeout(pend); return 0; } verbose(VERB_CLIENT, "outnet_tcp_cb reuse after cb: decommission it"); /* no queries on it, no space to keep it. or timeout or closed due * to error. Close it */ reuse_cb_and_decommission(outnet, pend, (error==NETEVENT_TIMEOUT? 
NETEVENT_TIMEOUT:NETEVENT_CLOSED)); use_free_buffer(outnet); return 0; } /** lower use count on pc, see if it can be closed */ static void portcomm_loweruse(struct outside_network* outnet, struct port_comm* pc) { struct port_if* pif; pc->num_outstanding--; if(pc->num_outstanding > 0) { return; } /* close it and replace in unused list */ verbose(VERB_ALGO, "close of port %d", pc->number); comm_point_close(pc->cp); pif = pc->pif; log_assert(pif->inuse > 0); #ifndef DISABLE_EXPLICIT_PORT_RANDOMISATION pif->avail_ports[pif->avail_total - pif->inuse] = pc->number; #endif pif->inuse--; pif->out[pc->index] = pif->out[pif->inuse]; pif->out[pc->index]->index = pc->index; pc->next = outnet->unused_fds; outnet->unused_fds = pc; } /** try to send waiting UDP queries */ static void outnet_send_wait_udp(struct outside_network* outnet) { struct pending* pend; /* process waiting queries */ while(outnet->udp_wait_first && outnet->unused_fds && !outnet->want_to_quit) { pend = outnet->udp_wait_first; outnet->udp_wait_first = pend->next_waiting; if(!pend->next_waiting) outnet->udp_wait_last = NULL; sldns_buffer_clear(outnet->udp_buff); sldns_buffer_write(outnet->udp_buff, pend->pkt, pend->pkt_len); sldns_buffer_flip(outnet->udp_buff); free(pend->pkt); /* freeing now makes get_mem correct */ pend->pkt = NULL; pend->pkt_len = 0; if(!randomize_and_send_udp(pend, outnet->udp_buff, pend->timeout)) { /* callback error on pending */ if(pend->cb) { fptr_ok(fptr_whitelist_pending_udp(pend->cb)); (void)(*pend->cb)(outnet->unused_fds->cp, pend->cb_arg, NETEVENT_CLOSED, NULL); } pending_delete(outnet, pend); } } } int outnet_udp_cb(struct comm_point* c, void* arg, int error, struct comm_reply *reply_info) { struct outside_network* outnet = (struct outside_network*)arg; struct pending key; struct pending* p; verbose(VERB_ALGO, "answer cb"); if(error != NETEVENT_NOERROR) { verbose(VERB_QUERY, "outnetudp got udp error %d", error); return 0; } if(sldns_buffer_limit(c->buffer) < LDNS_HEADER_SIZE) { 
verbose(VERB_QUERY, "outnetudp udp too short"); return 0; } log_assert(reply_info); /* setup lookup key */ key.id = (unsigned)LDNS_ID_WIRE(sldns_buffer_begin(c->buffer)); memcpy(&key.addr, &reply_info->addr, reply_info->addrlen); key.addrlen = reply_info->addrlen; verbose(VERB_ALGO, "Incoming reply id = %4.4x", key.id); log_addr(VERB_ALGO, "Incoming reply addr =", &reply_info->addr, reply_info->addrlen); /* find it, see if this thing is a valid query response */ verbose(VERB_ALGO, "lookup size is %d entries", (int)outnet->pending->count); p = (struct pending*)rbtree_search(outnet->pending, &key); if(!p) { verbose(VERB_QUERY, "received unwanted or unsolicited udp reply dropped."); log_buf(VERB_ALGO, "dropped message", c->buffer); outnet->unwanted_replies++; if(outnet->unwanted_threshold && ++outnet->unwanted_total >= outnet->unwanted_threshold) { log_warn("unwanted reply total reached threshold (%u)" " you may be under attack." " defensive action: clearing the cache", (unsigned)outnet->unwanted_threshold); fptr_ok(fptr_whitelist_alloc_cleanup( outnet->unwanted_action)); (*outnet->unwanted_action)(outnet->unwanted_param); outnet->unwanted_total = 0; } return 0; } verbose(VERB_ALGO, "received udp reply."); log_buf(VERB_ALGO, "udp message", c->buffer); if(p->pc->cp != c) { verbose(VERB_QUERY, "received reply id,addr on wrong port. " "dropped."); outnet->unwanted_replies++; if(outnet->unwanted_threshold && ++outnet->unwanted_total >= outnet->unwanted_threshold) { log_warn("unwanted reply total reached threshold (%u)" " you may be under attack." 
" defensive action: clearing the cache", (unsigned)outnet->unwanted_threshold); fptr_ok(fptr_whitelist_alloc_cleanup( outnet->unwanted_action)); (*outnet->unwanted_action)(outnet->unwanted_param); outnet->unwanted_total = 0; } return 0; } comm_timer_disable(p->timer); verbose(VERB_ALGO, "outnet handle udp reply"); /* delete from tree first in case callback creates a retry */ (void)rbtree_delete(outnet->pending, p->node.key); if(p->cb) { fptr_ok(fptr_whitelist_pending_udp(p->cb)); (void)(*p->cb)(p->pc->cp, p->cb_arg, NETEVENT_NOERROR, reply_info); } portcomm_loweruse(outnet, p->pc); pending_delete(NULL, p); outnet_send_wait_udp(outnet); return 0; } /** calculate number of ip4 and ip6 interfaces*/ static void calc_num46(char** ifs, int num_ifs, int do_ip4, int do_ip6, int* num_ip4, int* num_ip6) { int i; *num_ip4 = 0; *num_ip6 = 0; if(num_ifs <= 0) { if(do_ip4) *num_ip4 = 1; if(do_ip6) *num_ip6 = 1; return; } for(i=0; ioutnet; verbose(VERB_ALGO, "timeout udp with delay"); portcomm_loweruse(outnet, p->pc); pending_delete(outnet, p); outnet_send_wait_udp(outnet); } void pending_udp_timer_cb(void *arg) { struct pending* p = (struct pending*)arg; struct outside_network* outnet = p->outnet; /* it timed out */ verbose(VERB_ALGO, "timeout udp"); if(p->cb) { fptr_ok(fptr_whitelist_pending_udp(p->cb)); (void)(*p->cb)(p->pc->cp, p->cb_arg, NETEVENT_TIMEOUT, NULL); } /* if delayclose, keep port open for a longer time. 
* But if the udpwaitlist exists, then we are struggling to * keep up with demand for sockets, so do not wait, but service * the customer (customer service more important than portICMPs) */ if(outnet->delayclose && !outnet->udp_wait_first) { p->cb = NULL; p->timer->callback = &pending_udp_timer_delay_cb; comm_timer_set(p->timer, &outnet->delay_tv); return; } portcomm_loweruse(outnet, p->pc); pending_delete(outnet, p); outnet_send_wait_udp(outnet); } /** create pending_tcp buffers */ static int create_pending_tcp(struct outside_network* outnet, size_t bufsize) { size_t i; if(outnet->num_tcp == 0) return 1; /* no tcp needed, nothing to do */ if(!(outnet->tcp_conns = (struct pending_tcp **)calloc( outnet->num_tcp, sizeof(struct pending_tcp*)))) return 0; for(i=0; inum_tcp; i++) { if(!(outnet->tcp_conns[i] = (struct pending_tcp*)calloc(1, sizeof(struct pending_tcp)))) return 0; outnet->tcp_conns[i]->next_free = outnet->tcp_free; outnet->tcp_free = outnet->tcp_conns[i]; outnet->tcp_conns[i]->c = comm_point_create_tcp_out( outnet->base, bufsize, outnet_tcp_cb, outnet->tcp_conns[i]); if(!outnet->tcp_conns[i]->c) return 0; } return 1; } /** setup an outgoing interface, ready address */ static int setup_if(struct port_if* pif, const char* addrstr, int* avail, int numavail, size_t numfd) { #ifndef DISABLE_EXPLICIT_PORT_RANDOMISATION pif->avail_total = numavail; pif->avail_ports = (int*)memdup(avail, (size_t)numavail*sizeof(int)); if(!pif->avail_ports) return 0; #endif if(!ipstrtoaddr(addrstr, UNBOUND_DNS_PORT, &pif->addr, &pif->addrlen) && !netblockstrtoaddr(addrstr, UNBOUND_DNS_PORT, &pif->addr, &pif->addrlen, &pif->pfxlen)) return 0; pif->maxout = (int)numfd; pif->inuse = 0; pif->out = (struct port_comm**)calloc(numfd, sizeof(struct port_comm*)); if(!pif->out) return 0; return 1; } struct outside_network* outside_network_create(struct comm_base *base, size_t bufsize, size_t num_ports, char** ifs, int num_ifs, int do_ip4, int do_ip6, size_t num_tcp, int dscp, struct 
infra_cache* infra, struct ub_randstate* rnd, int use_caps_for_id, int* availports, int numavailports, size_t unwanted_threshold, int tcp_mss, void (*unwanted_action)(void*), void* unwanted_param, int do_udp, void* sslctx, int delayclose, int tls_use_sni, struct dt_env* dtenv, int udp_connect) { struct outside_network* outnet = (struct outside_network*) calloc(1, sizeof(struct outside_network)); size_t k; if(!outnet) { log_err("malloc failed"); return NULL; } comm_base_timept(base, &outnet->now_secs, &outnet->now_tv); outnet->base = base; outnet->num_tcp = num_tcp; outnet->num_tcp_outgoing = 0; outnet->infra = infra; outnet->rnd = rnd; outnet->sslctx = sslctx; outnet->tls_use_sni = tls_use_sni; #ifdef USE_DNSTAP outnet->dtenv = dtenv; #else (void)dtenv; #endif outnet->svcd_overhead = 0; outnet->want_to_quit = 0; outnet->unwanted_threshold = unwanted_threshold; outnet->unwanted_action = unwanted_action; outnet->unwanted_param = unwanted_param; outnet->use_caps_for_id = use_caps_for_id; outnet->do_udp = do_udp; outnet->tcp_mss = tcp_mss; outnet->ip_dscp = dscp; #ifndef S_SPLINT_S if(delayclose) { outnet->delayclose = 1; outnet->delay_tv.tv_sec = delayclose/1000; outnet->delay_tv.tv_usec = (delayclose%1000)*1000; } #endif if(udp_connect) { outnet->udp_connect = 1; } if(numavailports == 0 || num_ports == 0) { log_err("no outgoing ports available"); outside_network_delete(outnet); return NULL; } #ifndef INET6 do_ip6 = 0; #endif calc_num46(ifs, num_ifs, do_ip4, do_ip6, &outnet->num_ip4, &outnet->num_ip6); if(outnet->num_ip4 != 0) { if(!(outnet->ip4_ifs = (struct port_if*)calloc( (size_t)outnet->num_ip4, sizeof(struct port_if)))) { log_err("malloc failed"); outside_network_delete(outnet); return NULL; } } if(outnet->num_ip6 != 0) { if(!(outnet->ip6_ifs = (struct port_if*)calloc( (size_t)outnet->num_ip6, sizeof(struct port_if)))) { log_err("malloc failed"); outside_network_delete(outnet); return NULL; } } if( !(outnet->udp_buff = sldns_buffer_new(bufsize)) || 
!(outnet->pending = rbtree_create(pending_cmp)) || !(outnet->serviced = rbtree_create(serviced_cmp)) || !create_pending_tcp(outnet, bufsize)) { log_err("malloc failed"); outside_network_delete(outnet); return NULL; } rbtree_init(&outnet->tcp_reuse, reuse_cmp); outnet->tcp_reuse_max = num_tcp; /* allocate commpoints */ for(k=0; kcp = comm_point_create_udp(outnet->base, -1, outnet->udp_buff, outnet_udp_cb, outnet); if(!pc->cp) { log_err("malloc failed"); free(pc); outside_network_delete(outnet); return NULL; } pc->next = outnet->unused_fds; outnet->unused_fds = pc; } /* allocate interfaces */ if(num_ifs == 0) { if(do_ip4 && !setup_if(&outnet->ip4_ifs[0], "0.0.0.0", availports, numavailports, num_ports)) { log_err("malloc failed"); outside_network_delete(outnet); return NULL; } if(do_ip6 && !setup_if(&outnet->ip6_ifs[0], "::", availports, numavailports, num_ports)) { log_err("malloc failed"); outside_network_delete(outnet); return NULL; } } else { size_t done_4 = 0, done_6 = 0; int i; for(i=0; iip6_ifs[done_6], ifs[i], availports, numavailports, num_ports)){ log_err("malloc failed"); outside_network_delete(outnet); return NULL; } done_6++; } if(!str_is_ip6(ifs[i]) && do_ip4) { if(!setup_if(&outnet->ip4_ifs[done_4], ifs[i], availports, numavailports, num_ports)){ log_err("malloc failed"); outside_network_delete(outnet); return NULL; } done_4++; } } } return outnet; } /** helper pending delete */ static void pending_node_del(rbnode_type* node, void* arg) { struct pending* pend = (struct pending*)node; struct outside_network* outnet = (struct outside_network*)arg; pending_delete(outnet, pend); } /** helper serviced delete */ static void serviced_node_del(rbnode_type* node, void* ATTR_UNUSED(arg)) { struct serviced_query* sq = (struct serviced_query*)node; struct service_callback* p = sq->cblist, *np; free(sq->qbuf); free(sq->zone); free(sq->tls_auth_name); edns_opt_list_free(sq->opt_list); while(p) { np = p->next; free(p); p = np; } free(sq); } void 
outside_network_quit_prepare(struct outside_network* outnet) { if(!outnet) return; /* prevent queued items from being sent */ outnet->want_to_quit = 1; } void outside_network_delete(struct outside_network* outnet) { if(!outnet) return; outnet->want_to_quit = 1; /* check every element, since we can be called on malloc error */ if(outnet->pending) { /* free pending elements, but do no unlink from tree. */ traverse_postorder(outnet->pending, pending_node_del, NULL); free(outnet->pending); } if(outnet->serviced) { traverse_postorder(outnet->serviced, serviced_node_del, NULL); free(outnet->serviced); } if(outnet->udp_buff) sldns_buffer_free(outnet->udp_buff); if(outnet->unused_fds) { struct port_comm* p = outnet->unused_fds, *np; while(p) { np = p->next; comm_point_delete(p->cp); free(p); p = np; } outnet->unused_fds = NULL; } if(outnet->ip4_ifs) { int i, k; for(i=0; inum_ip4; i++) { for(k=0; kip4_ifs[i].inuse; k++) { struct port_comm* pc = outnet->ip4_ifs[i]. out[k]; comm_point_delete(pc->cp); free(pc); } #ifndef DISABLE_EXPLICIT_PORT_RANDOMISATION free(outnet->ip4_ifs[i].avail_ports); #endif free(outnet->ip4_ifs[i].out); } free(outnet->ip4_ifs); } if(outnet->ip6_ifs) { int i, k; for(i=0; inum_ip6; i++) { for(k=0; kip6_ifs[i].inuse; k++) { struct port_comm* pc = outnet->ip6_ifs[i]. 
out[k]; comm_point_delete(pc->cp); free(pc); } #ifndef DISABLE_EXPLICIT_PORT_RANDOMISATION free(outnet->ip6_ifs[i].avail_ports); #endif free(outnet->ip6_ifs[i].out); } free(outnet->ip6_ifs); } if(outnet->tcp_conns) { size_t i; for(i=0; inum_tcp; i++) if(outnet->tcp_conns[i]) { if(outnet->tcp_conns[i]->query && !outnet->tcp_conns[i]->query-> on_tcp_waiting_list) { /* delete waiting_tcp elements that * the tcp conn is working on */ struct pending_tcp* pend = (struct pending_tcp*)outnet-> tcp_conns[i]->query-> next_waiting; decommission_pending_tcp(outnet, pend); } comm_point_delete(outnet->tcp_conns[i]->c); waiting_tcp_delete(outnet->tcp_conns[i]->query); free(outnet->tcp_conns[i]); } free(outnet->tcp_conns); } if(outnet->tcp_wait_first) { struct waiting_tcp* p = outnet->tcp_wait_first, *np; while(p) { np = p->next_waiting; waiting_tcp_delete(p); p = np; } } /* was allocated in struct pending that was deleted above */ rbtree_init(&outnet->tcp_reuse, reuse_cmp); outnet->tcp_reuse_first = NULL; outnet->tcp_reuse_last = NULL; if(outnet->udp_wait_first) { struct pending* p = outnet->udp_wait_first, *np; while(p) { np = p->next_waiting; pending_delete(NULL, p); p = np; } } free(outnet); } void pending_delete(struct outside_network* outnet, struct pending* p) { if(!p) return; if(outnet && outnet->udp_wait_first && (p->next_waiting || p == outnet->udp_wait_last) ) { /* delete from waiting list, if it is in the waiting list */ struct pending* prev = NULL, *x = outnet->udp_wait_first; while(x && x != p) { prev = x; x = x->next_waiting; } if(x) { log_assert(x == p); if(prev) prev->next_waiting = p->next_waiting; else outnet->udp_wait_first = p->next_waiting; if(outnet->udp_wait_last == p) outnet->udp_wait_last = prev; } } if(outnet) { (void)rbtree_delete(outnet->pending, p->node.key); } if(p->timer) comm_timer_delete(p->timer); free(p->pkt); free(p); } static void sai6_putrandom(struct sockaddr_in6 *sa, int pfxlen, struct ub_randstate *rnd) { int i, last; if(!(pfxlen > 0 && 
pfxlen < 128)) return; for(i = 0; i < (128 - pfxlen) / 8; i++) { sa->sin6_addr.s6_addr[15-i] = (uint8_t)ub_random_max(rnd, 256); } last = pfxlen & 7; if(last != 0) { sa->sin6_addr.s6_addr[15-i] |= ((0xFF >> last) & ub_random_max(rnd, 256)); } } /** * Try to open a UDP socket for outgoing communication. * Sets sockets options as needed. * @param addr: socket address. * @param addrlen: length of address. * @param pfxlen: length of network prefix (for address randomisation). * @param port: port override for addr. * @param inuse: if -1 is returned, this bool means the port was in use. * @param rnd: random state (for address randomisation). * @param dscp: DSCP to use. * @return fd or -1 */ static int udp_sockport(struct sockaddr_storage* addr, socklen_t addrlen, int pfxlen, int port, int* inuse, struct ub_randstate* rnd, int dscp) { int fd, noproto; if(addr_is_ip6(addr, addrlen)) { int freebind = 0; struct sockaddr_in6 sa = *(struct sockaddr_in6*)addr; sa.sin6_port = (in_port_t)htons((uint16_t)port); sa.sin6_flowinfo = 0; sa.sin6_scope_id = 0; if(pfxlen != 0) { freebind = 1; sai6_putrandom(&sa, pfxlen, rnd); } fd = create_udp_sock(AF_INET6, SOCK_DGRAM, (struct sockaddr*)&sa, addrlen, 1, inuse, &noproto, 0, 0, 0, NULL, 0, freebind, 0, dscp); } else { struct sockaddr_in* sa = (struct sockaddr_in*)addr; sa->sin_port = (in_port_t)htons((uint16_t)port); fd = create_udp_sock(AF_INET, SOCK_DGRAM, (struct sockaddr*)addr, addrlen, 1, inuse, &noproto, 0, 0, 0, NULL, 0, 0, 0, dscp); } return fd; } /** Select random ID */ static int select_id(struct outside_network* outnet, struct pending* pend, sldns_buffer* packet) { int id_tries = 0; pend->id = ((unsigned)ub_random(outnet->rnd)>>8) & 0xffff; LDNS_ID_SET(sldns_buffer_begin(packet), pend->id); /* insert in tree */ pend->node.key = pend; while(!rbtree_insert(outnet->pending, &pend->node)) { /* change ID to avoid collision */ pend->id = ((unsigned)ub_random(outnet->rnd)>>8) & 0xffff; LDNS_ID_SET(sldns_buffer_begin(packet), 
pend->id); id_tries++; if(id_tries == MAX_ID_RETRY) { pend->id=99999; /* non existant ID */ log_err("failed to generate unique ID, drop msg"); return 0; } } verbose(VERB_ALGO, "inserted new pending reply id=%4.4x", pend->id); return 1; } /** Select random interface and port */ static int select_ifport(struct outside_network* outnet, struct pending* pend, int num_if, struct port_if* ifs) { int my_if, my_port, fd, portno, inuse, tries=0; struct port_if* pif; /* randomly select interface and port */ if(num_if == 0) { verbose(VERB_QUERY, "Need to send query but have no " "outgoing interfaces of that family"); return 0; } log_assert(outnet->unused_fds); tries = 0; while(1) { my_if = ub_random_max(outnet->rnd, num_if); pif = &ifs[my_if]; #ifndef DISABLE_EXPLICIT_PORT_RANDOMISATION if(outnet->udp_connect) { /* if we connect() we cannot reuse fds for a port */ if(pif->inuse >= pif->avail_total) { tries++; if(tries < MAX_PORT_RETRY) continue; log_err("failed to find an open port, drop msg"); return 0; } my_port = pif->inuse + ub_random_max(outnet->rnd, pif->avail_total - pif->inuse); } else { my_port = ub_random_max(outnet->rnd, pif->avail_total); if(my_port < pif->inuse) { /* port already open */ pend->pc = pif->out[my_port]; verbose(VERB_ALGO, "using UDP if=%d port=%d", my_if, pend->pc->number); break; } } /* try to open new port, if fails, loop to try again */ log_assert(pif->inuse < pif->maxout); portno = pif->avail_ports[my_port - pif->inuse]; #else my_port = portno = 0; #endif fd = udp_sockport(&pif->addr, pif->addrlen, pif->pfxlen, portno, &inuse, outnet->rnd, outnet->ip_dscp); if(fd == -1 && !inuse) { /* nonrecoverable error making socket */ return 0; } if(fd != -1) { verbose(VERB_ALGO, "opened UDP if=%d port=%d", my_if, portno); if(outnet->udp_connect) { /* connect() to the destination */ if(connect(fd, (struct sockaddr*)&pend->addr, pend->addrlen) < 0) { log_err_addr("udp connect failed", strerror(errno), &pend->addr, pend->addrlen); sock_close(fd); return 0; } } 
/* grab fd */ pend->pc = outnet->unused_fds; outnet->unused_fds = pend->pc->next; /* setup portcomm */ pend->pc->next = NULL; pend->pc->number = portno; pend->pc->pif = pif; pend->pc->index = pif->inuse; pend->pc->num_outstanding = 0; comm_point_start_listening(pend->pc->cp, fd, -1); /* grab port in interface */ pif->out[pif->inuse] = pend->pc; #ifndef DISABLE_EXPLICIT_PORT_RANDOMISATION pif->avail_ports[my_port - pif->inuse] = pif->avail_ports[pif->avail_total-pif->inuse-1]; #endif pif->inuse++; break; } /* failed, already in use */ verbose(VERB_QUERY, "port %d in use, trying another", portno); tries++; if(tries == MAX_PORT_RETRY) { log_err("failed to find an open port, drop msg"); return 0; } } log_assert(pend->pc); pend->pc->num_outstanding++; return 1; } static int randomize_and_send_udp(struct pending* pend, sldns_buffer* packet, int timeout) { struct timeval tv; struct outside_network* outnet = pend->sq->outnet; /* select id */ if(!select_id(outnet, pend, packet)) { return 0; } /* select src_if, port */ if(addr_is_ip6(&pend->addr, pend->addrlen)) { if(!select_ifport(outnet, pend, outnet->num_ip6, outnet->ip6_ifs)) return 0; } else { if(!select_ifport(outnet, pend, outnet->num_ip4, outnet->ip4_ifs)) return 0; } log_assert(pend->pc && pend->pc->cp); /* send it over the commlink */ - if(outnet->udp_connect) { - if(!comm_point_send_udp_msg(pend->pc->cp, packet, NULL, 0)) { - portcomm_loweruse(outnet, pend->pc); - return 0; - } - } else { - if(!comm_point_send_udp_msg(pend->pc->cp, packet, - (struct sockaddr*)&pend->addr, pend->addrlen)) { - portcomm_loweruse(outnet, pend->pc); - return 0; - } + if(!comm_point_send_udp_msg(pend->pc->cp, packet, + (struct sockaddr*)&pend->addr, pend->addrlen, outnet->udp_connect)) { + portcomm_loweruse(outnet, pend->pc); + return 0; } /* system calls to set timeout after sending UDP to make roundtrip smaller. 
*/ #ifndef S_SPLINT_S tv.tv_sec = timeout/1000; tv.tv_usec = (timeout%1000)*1000; #endif comm_timer_set(pend->timer, &tv); #ifdef USE_DNSTAP if(outnet->dtenv && (outnet->dtenv->log_resolver_query_messages || outnet->dtenv->log_forwarder_query_messages)) dt_msg_send_outside_query(outnet->dtenv, &pend->addr, comm_udp, pend->sq->zone, pend->sq->zonelen, packet); #endif return 1; } struct pending* pending_udp_query(struct serviced_query* sq, struct sldns_buffer* packet, int timeout, comm_point_callback_type* cb, void* cb_arg) { struct pending* pend = (struct pending*)calloc(1, sizeof(*pend)); if(!pend) return NULL; pend->outnet = sq->outnet; pend->sq = sq; pend->addrlen = sq->addrlen; memmove(&pend->addr, &sq->addr, sq->addrlen); pend->cb = cb; pend->cb_arg = cb_arg; pend->node.key = pend; pend->timer = comm_timer_create(sq->outnet->base, pending_udp_timer_cb, pend); if(!pend->timer) { free(pend); return NULL; } if(sq->outnet->unused_fds == NULL) { /* no unused fd, cannot create a new port (randomly) */ verbose(VERB_ALGO, "no fds available, udp query waiting"); pend->timeout = timeout; pend->pkt_len = sldns_buffer_limit(packet); pend->pkt = (uint8_t*)memdup(sldns_buffer_begin(packet), pend->pkt_len); if(!pend->pkt) { comm_timer_delete(pend->timer); free(pend); return NULL; } /* put at end of waiting list */ if(sq->outnet->udp_wait_last) sq->outnet->udp_wait_last->next_waiting = pend; else sq->outnet->udp_wait_first = pend; sq->outnet->udp_wait_last = pend; return pend; } if(!randomize_and_send_udp(pend, packet, timeout)) { pending_delete(sq->outnet, pend); return NULL; } return pend; } void outnet_tcptimer(void* arg) { struct waiting_tcp* w = (struct waiting_tcp*)arg; struct outside_network* outnet = w->outnet; verbose(VERB_CLIENT, "outnet_tcptimer"); if(w->on_tcp_waiting_list) { /* it is on the waiting list */ waiting_list_remove(outnet, w); waiting_tcp_callback(w, NULL, NETEVENT_TIMEOUT, NULL); waiting_tcp_delete(w); } else { /* it was in use */ struct pending_tcp* 
pend=(struct pending_tcp*)w->next_waiting; reuse_cb_and_decommission(outnet, pend, NETEVENT_TIMEOUT); } use_free_buffer(outnet); } /** close the oldest reuse_tcp connection to make a fd and struct pend * available for a new stream connection */ static void reuse_tcp_close_oldest(struct outside_network* outnet) { struct pending_tcp* pend; verbose(VERB_CLIENT, "reuse_tcp_close_oldest"); if(!outnet->tcp_reuse_last) return; pend = outnet->tcp_reuse_last->pending; /* snip off of LRU */ log_assert(pend->reuse.lru_next == NULL); if(pend->reuse.lru_prev) { outnet->tcp_reuse_last = pend->reuse.lru_prev; pend->reuse.lru_prev->lru_next = NULL; } else { outnet->tcp_reuse_last = NULL; outnet->tcp_reuse_first = NULL; } pend->reuse.item_on_lru_list = 0; /* free up */ reuse_cb_and_decommission(outnet, pend, NETEVENT_CLOSED); } /** find spare ID value for reuse tcp stream. That is random and also does * not collide with an existing query ID that is in use or waiting */ uint16_t reuse_tcp_select_id(struct reuse_tcp* reuse, struct outside_network* outnet) { uint16_t id = 0, curid, nextid; const int try_random = 2000; int i; unsigned select, count, space; rbnode_type* node; /* make really sure the tree is not empty */ if(reuse->tree_by_id.count == 0) { id = ((unsigned)ub_random(outnet->rnd)>>8) & 0xffff; return id; } /* try to find random empty spots by picking them */ for(i = 0; irnd)>>8) & 0xffff; if(!reuse_tcp_by_id_find(reuse, id)) { return id; } } /* equally pick a random unused element from the tree that is * not in use. Pick a the n-th index of an ununused number, * then loop over the empty spaces in the tree and find it */ log_assert(reuse->tree_by_id.count < 0xffff); select = ub_random_max(outnet->rnd, 0xffff - reuse->tree_by_id.count); /* select value now in 0 .. 
num free - 1 */ count = 0; /* number of free spaces passed by */ node = rbtree_first(&reuse->tree_by_id); log_assert(node && node != RBTREE_NULL); /* tree not empty */ /* see if select is before first node */ if(select < tree_by_id_get_id(node)) return select; count += tree_by_id_get_id(node); /* perhaps select is between nodes */ while(node && node != RBTREE_NULL) { rbnode_type* next = rbtree_next(node); if(next && next != RBTREE_NULL) { curid = tree_by_id_get_id(node); nextid = tree_by_id_get_id(next); log_assert(curid < nextid); if(curid != 0xffff && curid + 1 < nextid) { /* space between nodes */ space = nextid - curid - 1; log_assert(select >= count); if(select < count + space) { /* here it is */ return curid + 1 + (select - count); } count += space; } } node = next; } /* select is after the last node */ /* count is the number of free positions before the nodes in the * tree */ node = rbtree_last(&reuse->tree_by_id); log_assert(node && node != RBTREE_NULL); /* tree not empty */ curid = tree_by_id_get_id(node); log_assert(count + (0xffff-curid) + reuse->tree_by_id.count == 0xffff); return curid + 1 + (select - count); } struct waiting_tcp* pending_tcp_query(struct serviced_query* sq, sldns_buffer* packet, int timeout, comm_point_callback_type* callback, void* callback_arg) { struct pending_tcp* pend = sq->outnet->tcp_free; struct reuse_tcp* reuse = NULL; struct waiting_tcp* w; verbose(VERB_CLIENT, "pending_tcp_query"); if(sldns_buffer_limit(packet) < sizeof(uint16_t)) { verbose(VERB_ALGO, "pending tcp query with too short buffer < 2"); return NULL; } /* find out if a reused stream to the target exists */ /* if so, take it into use */ reuse = reuse_tcp_find(sq->outnet, &sq->addr, sq->addrlen, sq->ssl_upstream); if(reuse) { log_reuse_tcp(VERB_CLIENT, "pending_tcp_query: found reuse", reuse); log_assert(reuse->pending); pend = reuse->pending; reuse_tcp_lru_touch(sq->outnet, reuse); } /* if !pend but we have reuse streams, close a reuse stream * to be able to open 
a new one to this target, no use waiting * to reuse a file descriptor while another query needs to use * that buffer and file descriptor now. */ if(!pend) { reuse_tcp_close_oldest(sq->outnet); pend = sq->outnet->tcp_free; } /* allocate space to store query */ w = (struct waiting_tcp*)malloc(sizeof(struct waiting_tcp) + sldns_buffer_limit(packet)); if(!w) { return NULL; } if(!(w->timer = comm_timer_create(sq->outnet->base, outnet_tcptimer, w))) { free(w); return NULL; } w->pkt = (uint8_t*)w + sizeof(struct waiting_tcp); w->pkt_len = sldns_buffer_limit(packet); memmove(w->pkt, sldns_buffer_begin(packet), w->pkt_len); if(reuse) w->id = reuse_tcp_select_id(reuse, sq->outnet); else w->id = ((unsigned)ub_random(sq->outnet->rnd)>>8) & 0xffff; LDNS_ID_SET(w->pkt, w->id); memcpy(&w->addr, &sq->addr, sq->addrlen); w->addrlen = sq->addrlen; w->outnet = sq->outnet; w->on_tcp_waiting_list = 0; w->next_waiting = NULL; w->cb = callback; w->cb_arg = callback_arg; w->ssl_upstream = sq->ssl_upstream; w->tls_auth_name = sq->tls_auth_name; w->timeout = timeout; w->id_node.key = NULL; w->write_wait_prev = NULL; w->write_wait_next = NULL; w->write_wait_queued = 0; w->error_count = 0; if(pend) { /* we have a buffer available right now */ if(reuse) { /* reuse existing fd, write query and continue */ /* store query in tree by id */ verbose(VERB_CLIENT, "pending_tcp_query: reuse, store"); w->next_waiting = (void*)pend; reuse_tree_by_id_insert(&pend->reuse, w); /* can we write right now? 
*/ if(pend->query == NULL) { /* write straight away */ /* stop the timer on read of the fd */ comm_point_stop_listening(pend->c); pend->query = w; outnet_tcp_take_query_setup(pend->c->fd, pend, w); } else { /* put it in the waiting list for * this stream */ reuse_write_wait_push_back(&pend->reuse, w); } } else { /* create new fd and connect to addr, setup to * write query */ verbose(VERB_CLIENT, "pending_tcp_query: new fd, connect"); rbtree_init(&pend->reuse.tree_by_id, reuse_id_cmp); pend->reuse.pending = pend; memcpy(&pend->reuse.addr, &sq->addr, sq->addrlen); pend->reuse.addrlen = sq->addrlen; if(!outnet_tcp_take_into_use(w)) { waiting_tcp_delete(w); return NULL; } } } else { /* queue up */ /* waiting for a buffer on the outside network buffer wait * list */ verbose(VERB_CLIENT, "pending_tcp_query: queue to wait"); outnet_add_tcp_waiting(sq->outnet, w); } #ifdef USE_DNSTAP if(sq->outnet->dtenv && (sq->outnet->dtenv->log_resolver_query_messages || sq->outnet->dtenv->log_forwarder_query_messages)) dt_msg_send_outside_query(sq->outnet->dtenv, &sq->addr, comm_tcp, sq->zone, sq->zonelen, packet); #endif return w; } /** create query for serviced queries */ static void serviced_gen_query(sldns_buffer* buff, uint8_t* qname, size_t qnamelen, uint16_t qtype, uint16_t qclass, uint16_t flags) { sldns_buffer_clear(buff); /* skip id */ sldns_buffer_write_u16(buff, flags); sldns_buffer_write_u16(buff, 1); /* qdcount */ sldns_buffer_write_u16(buff, 0); /* ancount */ sldns_buffer_write_u16(buff, 0); /* nscount */ sldns_buffer_write_u16(buff, 0); /* arcount */ sldns_buffer_write(buff, qname, qnamelen); sldns_buffer_write_u16(buff, qtype); sldns_buffer_write_u16(buff, qclass); sldns_buffer_flip(buff); } /** lookup serviced query in serviced query rbtree */ static struct serviced_query* lookup_serviced(struct outside_network* outnet, sldns_buffer* buff, int dnssec, struct sockaddr_storage* addr, socklen_t addrlen, struct edns_option* opt_list) { struct serviced_query key; 
key.node.key = &key; key.qbuf = sldns_buffer_begin(buff); key.qbuflen = sldns_buffer_limit(buff); key.dnssec = dnssec; memcpy(&key.addr, addr, addrlen); key.addrlen = addrlen; key.outnet = outnet; key.opt_list = opt_list; return (struct serviced_query*)rbtree_search(outnet->serviced, &key); } /** Create new serviced entry */ static struct serviced_query* serviced_create(struct outside_network* outnet, sldns_buffer* buff, int dnssec, int want_dnssec, int nocaps, int tcp_upstream, int ssl_upstream, char* tls_auth_name, struct sockaddr_storage* addr, socklen_t addrlen, uint8_t* zone, size_t zonelen, int qtype, struct edns_option* opt_list) { struct serviced_query* sq = (struct serviced_query*)malloc(sizeof(*sq)); #ifdef UNBOUND_DEBUG rbnode_type* ins; #endif if(!sq) return NULL; sq->node.key = sq; sq->qbuf = memdup(sldns_buffer_begin(buff), sldns_buffer_limit(buff)); if(!sq->qbuf) { free(sq); return NULL; } sq->qbuflen = sldns_buffer_limit(buff); sq->zone = memdup(zone, zonelen); if(!sq->zone) { free(sq->qbuf); free(sq); return NULL; } sq->zonelen = zonelen; sq->qtype = qtype; sq->dnssec = dnssec; sq->want_dnssec = want_dnssec; sq->nocaps = nocaps; sq->tcp_upstream = tcp_upstream; sq->ssl_upstream = ssl_upstream; if(tls_auth_name) { sq->tls_auth_name = strdup(tls_auth_name); if(!sq->tls_auth_name) { free(sq->zone); free(sq->qbuf); free(sq); return NULL; } } else { sq->tls_auth_name = NULL; } memcpy(&sq->addr, addr, addrlen); sq->addrlen = addrlen; sq->opt_list = NULL; if(opt_list) { sq->opt_list = edns_opt_copy_alloc(opt_list); if(!sq->opt_list) { free(sq->tls_auth_name); free(sq->zone); free(sq->qbuf); free(sq); return NULL; } } sq->outnet = outnet; sq->cblist = NULL; sq->pending = NULL; sq->status = serviced_initial; sq->retry = 0; sq->to_be_deleted = 0; #ifdef UNBOUND_DEBUG ins = #else (void) #endif rbtree_insert(outnet->serviced, &sq->node); log_assert(ins != NULL); /* must not be already present */ return sq; } /** remove waiting tcp from the outnet waiting list 
*/ static void waiting_list_remove(struct outside_network* outnet, struct waiting_tcp* w) { struct waiting_tcp* p = outnet->tcp_wait_first, *prev = NULL; w->on_tcp_waiting_list = 0; while(p) { if(p == w) { /* remove w */ if(prev) prev->next_waiting = w->next_waiting; else outnet->tcp_wait_first = w->next_waiting; if(outnet->tcp_wait_last == w) outnet->tcp_wait_last = prev; return; } prev = p; p = p->next_waiting; } } /** reuse tcp stream, remove serviced query from stream, * return true if the stream is kept, false if it is to be closed */ static int reuse_tcp_remove_serviced_keep(struct waiting_tcp* w, struct serviced_query* sq) { struct pending_tcp* pend_tcp = (struct pending_tcp*)w->next_waiting; verbose(VERB_CLIENT, "reuse_tcp_remove_serviced_keep"); /* remove the callback. let query continue to write to not cancel * the stream itself. also keep it as an entry in the tree_by_id, * in case the answer returns (that we no longer want), but we cannot * pick the same ID number meanwhile */ w->cb = NULL; /* see if can be entered in reuse tree * for that the FD has to be non-1 */ if(pend_tcp->c->fd == -1) { verbose(VERB_CLIENT, "reuse_tcp_remove_serviced_keep: -1 fd"); return 0; } /* if in tree and used by other queries */ if(pend_tcp->reuse.node.key) { verbose(VERB_CLIENT, "reuse_tcp_remove_serviced_keep: in use by other queries"); /* do not reset the keepalive timer, for that * we'd need traffic, and this is where the serviced is * removed due to state machine internal reasons, * eg. 
iterator no longer interested in this query */ return 1; } /* if still open and want to keep it open */ if(pend_tcp->c->fd != -1 && sq->outnet->tcp_reuse.count < sq->outnet->tcp_reuse_max) { verbose(VERB_CLIENT, "reuse_tcp_remove_serviced_keep: keep open"); /* set a keepalive timer on it */ if(!reuse_tcp_insert(sq->outnet, pend_tcp)) { return 0; } reuse_tcp_setup_timeout(pend_tcp); return 1; } return 0; } /** cleanup serviced query entry */ static void serviced_delete(struct serviced_query* sq) { verbose(VERB_CLIENT, "serviced_delete"); if(sq->pending) { /* clear up the pending query */ if(sq->status == serviced_query_UDP_EDNS || sq->status == serviced_query_UDP || sq->status == serviced_query_UDP_EDNS_FRAG || sq->status == serviced_query_UDP_EDNS_fallback) { struct pending* p = (struct pending*)sq->pending; verbose(VERB_CLIENT, "serviced_delete: UDP"); if(p->pc) portcomm_loweruse(sq->outnet, p->pc); pending_delete(sq->outnet, p); /* this call can cause reentrant calls back into the * mesh */ outnet_send_wait_udp(sq->outnet); } else { struct waiting_tcp* w = (struct waiting_tcp*) sq->pending; verbose(VERB_CLIENT, "serviced_delete: TCP"); /* if on stream-write-waiting list then * remove from waiting list and waiting_tcp_delete */ if(w->write_wait_queued) { struct pending_tcp* pend = (struct pending_tcp*)w->next_waiting; verbose(VERB_CLIENT, "serviced_delete: writewait"); reuse_tree_by_id_delete(&pend->reuse, w); reuse_write_wait_remove(&pend->reuse, w); waiting_tcp_delete(w); } else if(!w->on_tcp_waiting_list) { struct pending_tcp* pend = (struct pending_tcp*)w->next_waiting; verbose(VERB_CLIENT, "serviced_delete: tcpreusekeep"); if(!reuse_tcp_remove_serviced_keep(w, sq)) { reuse_cb_and_decommission(sq->outnet, pend, NETEVENT_CLOSED); use_free_buffer(sq->outnet); } sq->pending = NULL; } else { verbose(VERB_CLIENT, "serviced_delete: tcpwait"); waiting_list_remove(sq->outnet, w); waiting_tcp_delete(w); } } } /* does not delete from tree, caller has to do that */ 
serviced_node_del(&sq->node, NULL);
}

/**
 * perturb a dname capitalization randomly
 * @param rnd: random state that ub_random() draws from.
 * @param qbuf: query buffer, modified in place; the qname is read
 *	starting at offset 10.
 * @param len: length of qbuf, only used for the assertion.
 */
static void
serviced_perturb_qname(struct ub_randstate* rnd, uint8_t* qbuf, size_t len)
{
	uint8_t lablen;
	/* qname starts 10 bytes into qbuf */
	uint8_t* d = qbuf + 10;
	long int random = 0;
	int bits = 0;
	log_assert(len >= 10 + 5 /* offset qname, root, qtype, qclass */);
	/* len is otherwise unused when assertions are compiled out */
	(void)len;
	lablen = *d++;
	/* walk label by label until the zero length root label */
	while(lablen) {
		while(lablen--) {
			/* only perturb A-Z, a-z */
			if(isalpha((unsigned char)*d)) {
				/* get a random bit */
				if(bits == 0) {
					/* refill: one ub_random() result is
					 * consumed as 30 single bits */
					random = ub_random(rnd);
					bits = 30;
				}
				if(random & 0x1) {
					*d = (uint8_t)toupper((unsigned char)*d);
				} else {
					*d = (uint8_t)tolower((unsigned char)*d);
				}
				/* one bit used up per alphabetic octet */
				random >>= 1;
				bits--;
			}
			d++;
		}
		lablen = *d++;
	}
	if(verbosity >= VERB_ALGO) {
		char buf[LDNS_MAX_DOMAINLEN+1];
		dname_str(qbuf+10, buf);
		verbose(VERB_ALGO, "qname perturbed to %s", buf);
	}
}

/** put serviced query into a buffer */
static void
serviced_encode(struct serviced_query* sq, sldns_buffer* buff, int with_edns)
{
	/* if we are using 0x20 bits for ID randomness, perturb them */
	if(sq->outnet->use_caps_for_id && !sq->nocaps) {
		serviced_perturb_qname(sq->outnet->rnd, sq->qbuf, sq->qbuflen);
	}
	/* generate query */
	sldns_buffer_clear(buff);
	sldns_buffer_write_u16(buff, 0); /* id placeholder */
	sldns_buffer_write(buff, sq->qbuf, sq->qbuflen);
	sldns_buffer_flip(buff);
	if(with_edns) {
		/* add edns section */
		struct edns_data edns;
		edns.edns_present = 1;
		edns.ext_rcode = 0;
		edns.edns_version = EDNS_ADVERTISED_VERSION;
		edns.opt_list = sq->opt_list;
		if(sq->status == serviced_query_UDP_EDNS_FRAG) {
			/* in the FRAG state advertise the smaller of the
			 * per-family fragment size and the normal size */
			if(addr_is_ip6(&sq->addr, sq->addrlen)) {
				if(EDNS_FRAG_SIZE_IP6 < EDNS_ADVERTISED_SIZE)
					edns.udp_size = EDNS_FRAG_SIZE_IP6;
				else edns.udp_size = EDNS_ADVERTISED_SIZE;
			} else {
				if(EDNS_FRAG_SIZE_IP4 < EDNS_ADVERTISED_SIZE)
					edns.udp_size = EDNS_FRAG_SIZE_IP4;
				else edns.udp_size = EDNS_ADVERTISED_SIZE;
			}
		} else {
			edns.udp_size = EDNS_ADVERTISED_SIZE;
		}
		edns.bits = 0;
		if(sq->dnssec & EDNS_DO)
			edns.bits = EDNS_DO;
		if(sq->dnssec & BIT_CD)
LDNS_CD_SET(sldns_buffer_begin(buff));
		attach_edns_record(buff, &edns);
	}
}

/**
 * Perform serviced query UDP sending operation.
 * Sends UDP with EDNS, unless infra host marked non EDNS.
 * @param sq: query to send.
 * @param buff: buffer scratch space.
 * @return 0 on error.
 */
static int
serviced_udp_send(struct serviced_query* sq, sldns_buffer* buff)
{
	int rtt, vs;
	uint8_t edns_lame_known;
	time_t now = *sq->outnet->now_secs;

	/* fetch rtt, edns version (vs) and the edns-lame-known flag for this
	 * address and zone from the infra store; failure aborts the send */
	if(!infra_host(sq->outnet->infra, &sq->addr, sq->addrlen, sq->zone,
		sq->zonelen, now, &vs, &edns_lame_known, &rtt))
		return 0;
	/* remembered in sq->last_rtt */
	sq->last_rtt = rtt;
	verbose(VERB_ALGO, "EDNS lookup known=%d vs=%d", edns_lame_known, vs);
	/* only pick the starting state once; later calls keep the state
	 * already stored in sq->status */
	if(sq->status == serviced_initial) {
		if(vs != -1) {
			sq->status = serviced_query_UDP_EDNS;
		} else {
			sq->status = serviced_query_UDP;
		}
	}
	/* EDNS is attached in the UDP_EDNS and UDP_EDNS_FRAG states only */
	serviced_encode(sq, buff, (sq->status == serviced_query_UDP_EDNS) ||
		(sq->status == serviced_query_UDP_EDNS_FRAG));
	sq->last_sent_time = *sq->outnet->now_tv;
	sq->edns_lame_known = (int)edns_lame_known;
	/* the rtt value from infra is passed on as the query timeout */
	verbose(VERB_ALGO, "serviced query UDP timeout=%d msec", rtt);
	sq->pending = pending_udp_query(sq, buff, rtt,
		serviced_udp_callback, sq);
	if(!sq->pending)
		return 0;
	return 1;
}

/** check that perturbed qname is identical */
static int
serviced_check_qname(sldns_buffer* pkt, uint8_t* qbuf, size_t qbuflen)
{
	/* d1 walks the qname in the reply (after the 12 byte header),
	 * d2 walks the qname that was sent (offset 10 in qbuf) */
	uint8_t* d1 = sldns_buffer_begin(pkt)+12;
	uint8_t* d2 = qbuf+10;
	uint8_t len1, len2;
	int count = 0;
	if(sldns_buffer_limit(pkt) < 12+1+4) /* packet too small for qname */
		return 0;
	log_assert(qbuflen >= 15 /* 10 header, root, type, class */);
	len1 = *d1++;
	len2 = *d2++;
	while(len1 != 0 || len2 != 0) {
		if(LABEL_IS_PTR(len1)) {
			/* check if we can read *d1 with compression ptr rest */
			if(d1 >= sldns_buffer_at(pkt, sldns_buffer_limit(pkt)))
				return 0;
			d1 = sldns_buffer_begin(pkt)+PTR_OFFSET(len1, *d1);
			/* check if we can read the destination *d1 */
			if(d1 >= sldns_buffer_at(pkt, sldns_buffer_limit(pkt)))
				return 0;
			len1 = *d1++;
			/* bound the number of pointers that are followed */
			if(count++ > MAX_COMPRESS_PTRS)
				return 0;
			continue;
		}
		if(d2 > qbuf+qbuflen)
return 0; if(len1 != len2) return 0; if(len1 > LDNS_MAX_LABELLEN) return 0; /* check len1 + 1(next length) are okay to read */ if(d1+len1 >= sldns_buffer_at(pkt, sldns_buffer_limit(pkt))) return 0; log_assert(len1 <= LDNS_MAX_LABELLEN); log_assert(len2 <= LDNS_MAX_LABELLEN); log_assert(len1 == len2 && len1 != 0); /* compare the labels - bitwise identical */ if(memcmp(d1, d2, len1) != 0) return 0; d1 += len1; d2 += len2; len1 = *d1++; len2 = *d2++; } return 1; } /** call the callbacks for a serviced query */ static void serviced_callbacks(struct serviced_query* sq, int error, struct comm_point* c, struct comm_reply* rep) { struct service_callback* p; int dobackup = (sq->cblist && sq->cblist->next); /* >1 cb*/ uint8_t *backup_p = NULL; size_t backlen = 0; #ifdef UNBOUND_DEBUG rbnode_type* rem = #else (void) #endif /* remove from tree, and schedule for deletion, so that callbacks * can safely deregister themselves and even create new serviced * queries that are identical to this one. */ rbtree_delete(sq->outnet->serviced, sq); log_assert(rem); /* should have been present */ sq->to_be_deleted = 1; verbose(VERB_ALGO, "svcd callbacks start"); if(sq->outnet->use_caps_for_id && error == NETEVENT_NOERROR && c && !sq->nocaps && sq->qtype != LDNS_RR_TYPE_PTR) { /* for type PTR do not check perturbed name in answer, * compatibility with cisco dns guard boxes that mess up * reverse queries 0x20 contents */ /* noerror and nxdomain must have a qname in reply */ if(sldns_buffer_read_u16_at(c->buffer, 4) == 0 && (LDNS_RCODE_WIRE(sldns_buffer_begin(c->buffer)) == LDNS_RCODE_NOERROR || LDNS_RCODE_WIRE(sldns_buffer_begin(c->buffer)) == LDNS_RCODE_NXDOMAIN)) { verbose(VERB_DETAIL, "no qname in reply to check 0x20ID"); log_addr(VERB_DETAIL, "from server", &sq->addr, sq->addrlen); log_buf(VERB_DETAIL, "for packet", c->buffer); error = NETEVENT_CLOSED; c = NULL; } else if(sldns_buffer_read_u16_at(c->buffer, 4) > 0 && !serviced_check_qname(c->buffer, sq->qbuf, sq->qbuflen)) { 
verbose(VERB_DETAIL, "wrong 0x20-ID in reply qname"); log_addr(VERB_DETAIL, "from server", &sq->addr, sq->addrlen); log_buf(VERB_DETAIL, "for packet", c->buffer); error = NETEVENT_CAPSFAIL; /* and cleanup too */ pkt_dname_tolower(c->buffer, sldns_buffer_at(c->buffer, 12)); } else { verbose(VERB_ALGO, "good 0x20-ID in reply qname"); /* cleanup caps, prettier cache contents. */ pkt_dname_tolower(c->buffer, sldns_buffer_at(c->buffer, 12)); } } if(dobackup && c) { /* make a backup of the query, since the querystate processing * may send outgoing queries that overwrite the buffer. * use secondary buffer to store the query. * This is a data copy, but faster than packet to server */ backlen = sldns_buffer_limit(c->buffer); backup_p = memdup(sldns_buffer_begin(c->buffer), backlen); if(!backup_p) { log_err("malloc failure in serviced query callbacks"); error = NETEVENT_CLOSED; c = NULL; } sq->outnet->svcd_overhead = backlen; } /* test the actual sq->cblist, because the next elem could be deleted*/ while((p=sq->cblist) != NULL) { sq->cblist = p->next; /* remove this element */ if(dobackup && c) { sldns_buffer_clear(c->buffer); sldns_buffer_write(c->buffer, backup_p, backlen); sldns_buffer_flip(c->buffer); } fptr_ok(fptr_whitelist_serviced_query(p->cb)); (void)(*p->cb)(c, p->cb_arg, error, rep); free(p); } if(backup_p) { free(backup_p); sq->outnet->svcd_overhead = 0; } verbose(VERB_ALGO, "svcd callbacks end"); log_assert(sq->cblist == NULL); serviced_delete(sq); } int serviced_tcp_callback(struct comm_point* c, void* arg, int error, struct comm_reply* rep) { struct serviced_query* sq = (struct serviced_query*)arg; struct comm_reply r2; sq->pending = NULL; /* removed after this callback */ if(error != NETEVENT_NOERROR) log_addr(VERB_QUERY, "tcp error for address", &sq->addr, sq->addrlen); if(error==NETEVENT_NOERROR) infra_update_tcp_works(sq->outnet->infra, &sq->addr, sq->addrlen, sq->zone, sq->zonelen); #ifdef USE_DNSTAP if(error==NETEVENT_NOERROR && sq->outnet->dtenv && 
(sq->outnet->dtenv->log_resolver_response_messages || sq->outnet->dtenv->log_forwarder_response_messages)) dt_msg_send_outside_response(sq->outnet->dtenv, &sq->addr, c->type, sq->zone, sq->zonelen, sq->qbuf, sq->qbuflen, &sq->last_sent_time, sq->outnet->now_tv, c->buffer); #endif if(error==NETEVENT_NOERROR && sq->status == serviced_query_TCP_EDNS && (LDNS_RCODE_WIRE(sldns_buffer_begin(c->buffer)) == LDNS_RCODE_FORMERR || LDNS_RCODE_WIRE(sldns_buffer_begin( c->buffer)) == LDNS_RCODE_NOTIMPL) ) { /* attempt to fallback to nonEDNS */ sq->status = serviced_query_TCP_EDNS_fallback; serviced_tcp_initiate(sq, c->buffer); return 0; } else if(error==NETEVENT_NOERROR && sq->status == serviced_query_TCP_EDNS_fallback && (LDNS_RCODE_WIRE(sldns_buffer_begin(c->buffer)) == LDNS_RCODE_NOERROR || LDNS_RCODE_WIRE( sldns_buffer_begin(c->buffer)) == LDNS_RCODE_NXDOMAIN || LDNS_RCODE_WIRE(sldns_buffer_begin(c->buffer)) == LDNS_RCODE_YXDOMAIN)) { /* the fallback produced a result that looks promising, note * that this server should be approached without EDNS */ /* only store noEDNS in cache if domain is noDNSSEC */ if(!sq->want_dnssec) if(!infra_edns_update(sq->outnet->infra, &sq->addr, sq->addrlen, sq->zone, sq->zonelen, -1, *sq->outnet->now_secs)) log_err("Out of memory caching no edns for host"); sq->status = serviced_query_TCP; } if(sq->tcp_upstream || sq->ssl_upstream) { struct timeval now = *sq->outnet->now_tv; if(error!=NETEVENT_NOERROR) { if(!infra_rtt_update(sq->outnet->infra, &sq->addr, sq->addrlen, sq->zone, sq->zonelen, sq->qtype, -1, sq->last_rtt, (time_t)now.tv_sec)) log_err("out of memory in TCP exponential backoff."); } else if(now.tv_sec > sq->last_sent_time.tv_sec || (now.tv_sec == sq->last_sent_time.tv_sec && now.tv_usec > sq->last_sent_time.tv_usec)) { /* convert from microseconds to milliseconds */ int roundtime = ((int)(now.tv_sec - sq->last_sent_time.tv_sec))*1000 + ((int)now.tv_usec - (int)sq->last_sent_time.tv_usec)/1000; verbose(VERB_ALGO, "measured TCP-time 
at %d msec", roundtime); log_assert(roundtime >= 0); /* only store if less then AUTH_TIMEOUT seconds, it could be * huge due to system-hibernated and we woke up */ if(roundtime < 60000) { if(!infra_rtt_update(sq->outnet->infra, &sq->addr, sq->addrlen, sq->zone, sq->zonelen, sq->qtype, roundtime, sq->last_rtt, (time_t)now.tv_sec)) log_err("out of memory noting rtt."); } } } /* insert address into reply info */ if(!rep) { /* create one if there isn't (on errors) */ rep = &r2; r2.c = c; } memcpy(&rep->addr, &sq->addr, sq->addrlen); rep->addrlen = sq->addrlen; serviced_callbacks(sq, error, c, rep); return 0; } static void serviced_tcp_initiate(struct serviced_query* sq, sldns_buffer* buff) { verbose(VERB_ALGO, "initiate TCP query %s", sq->status==serviced_query_TCP_EDNS?"EDNS":""); serviced_encode(sq, buff, sq->status == serviced_query_TCP_EDNS); sq->last_sent_time = *sq->outnet->now_tv; sq->pending = pending_tcp_query(sq, buff, TCP_AUTH_QUERY_TIMEOUT, serviced_tcp_callback, sq); if(!sq->pending) { /* delete from tree so that a retry by above layer does not * clash with this entry */ verbose(VERB_ALGO, "serviced_tcp_initiate: failed to send tcp query"); serviced_callbacks(sq, NETEVENT_CLOSED, NULL, NULL); } } /** Send serviced query over TCP return false on initial failure */ static int serviced_tcp_send(struct serviced_query* sq, sldns_buffer* buff) { int vs, rtt, timeout; uint8_t edns_lame_known; if(!infra_host(sq->outnet->infra, &sq->addr, sq->addrlen, sq->zone, sq->zonelen, *sq->outnet->now_secs, &vs, &edns_lame_known, &rtt)) return 0; sq->last_rtt = rtt; if(vs != -1) sq->status = serviced_query_TCP_EDNS; else sq->status = serviced_query_TCP; serviced_encode(sq, buff, sq->status == serviced_query_TCP_EDNS); sq->last_sent_time = *sq->outnet->now_tv; if(sq->tcp_upstream || sq->ssl_upstream) { timeout = rtt; if(rtt >= UNKNOWN_SERVER_NICENESS && rtt < TCP_AUTH_QUERY_TIMEOUT) timeout = TCP_AUTH_QUERY_TIMEOUT; } else { timeout = TCP_AUTH_QUERY_TIMEOUT; } sq->pending = 
pending_tcp_query(sq, buff, timeout, serviced_tcp_callback, sq); return sq->pending != NULL; } /* see if packet is edns malformed; got zeroes at start. * This is from servers that return malformed packets to EDNS0 queries, * but they return good packets for nonEDNS0 queries. * We try to detect their output; without resorting to a full parse or * check for too many bytes after the end of the packet. */ static int packet_edns_malformed(struct sldns_buffer* buf, int qtype) { size_t len; if(sldns_buffer_limit(buf) < LDNS_HEADER_SIZE) return 1; /* malformed */ /* they have NOERROR rcode, 1 answer. */ if(LDNS_RCODE_WIRE(sldns_buffer_begin(buf)) != LDNS_RCODE_NOERROR) return 0; /* one query (to skip) and answer records */ if(LDNS_QDCOUNT(sldns_buffer_begin(buf)) != 1 || LDNS_ANCOUNT(sldns_buffer_begin(buf)) == 0) return 0; /* skip qname */ len = dname_valid(sldns_buffer_at(buf, LDNS_HEADER_SIZE), sldns_buffer_limit(buf)-LDNS_HEADER_SIZE); if(len == 0) return 0; if(len == 1 && qtype == 0) return 0; /* we asked for '.' 
and type 0 */ /* and then 4 bytes (type and class of query) */ if(sldns_buffer_limit(buf) < LDNS_HEADER_SIZE + len + 4 + 3) return 0; /* and start with 11 zeroes as the answer RR */ /* so check the qtype of the answer record, qname=0, type=0 */ if(sldns_buffer_at(buf, LDNS_HEADER_SIZE+len+4)[0] == 0 && sldns_buffer_at(buf, LDNS_HEADER_SIZE+len+4)[1] == 0 && sldns_buffer_at(buf, LDNS_HEADER_SIZE+len+4)[2] == 0) return 1; return 0; } int serviced_udp_callback(struct comm_point* c, void* arg, int error, struct comm_reply* rep) { struct serviced_query* sq = (struct serviced_query*)arg; struct outside_network* outnet = sq->outnet; struct timeval now = *sq->outnet->now_tv; sq->pending = NULL; /* removed after callback */ if(error == NETEVENT_TIMEOUT) { if(sq->status == serviced_query_UDP_EDNS && sq->last_rtt < 5000) { /* fallback to 1480/1280 */ sq->status = serviced_query_UDP_EDNS_FRAG; log_name_addr(VERB_ALGO, "try edns1xx0", sq->qbuf+10, &sq->addr, sq->addrlen); if(!serviced_udp_send(sq, c->buffer)) { serviced_callbacks(sq, NETEVENT_CLOSED, c, rep); } return 0; } if(sq->status == serviced_query_UDP_EDNS_FRAG) { /* fragmentation size did not fix it */ sq->status = serviced_query_UDP_EDNS; } sq->retry++; if(!infra_rtt_update(outnet->infra, &sq->addr, sq->addrlen, sq->zone, sq->zonelen, sq->qtype, -1, sq->last_rtt, (time_t)now.tv_sec)) log_err("out of memory in UDP exponential backoff"); if(sq->retry < OUTBOUND_UDP_RETRY) { log_name_addr(VERB_ALGO, "retry query", sq->qbuf+10, &sq->addr, sq->addrlen); if(!serviced_udp_send(sq, c->buffer)) { serviced_callbacks(sq, NETEVENT_CLOSED, c, rep); } return 0; } } if(error != NETEVENT_NOERROR) { /* udp returns error (due to no ID or interface available) */ serviced_callbacks(sq, error, c, rep); return 0; } #ifdef USE_DNSTAP if(error == NETEVENT_NOERROR && outnet->dtenv && (outnet->dtenv->log_resolver_response_messages || outnet->dtenv->log_forwarder_response_messages)) dt_msg_send_outside_response(outnet->dtenv, &sq->addr, c->type, 
sq->zone, sq->zonelen, sq->qbuf, sq->qbuflen, &sq->last_sent_time, sq->outnet->now_tv, c->buffer); #endif if( (sq->status == serviced_query_UDP_EDNS ||sq->status == serviced_query_UDP_EDNS_FRAG) && (LDNS_RCODE_WIRE(sldns_buffer_begin(c->buffer)) == LDNS_RCODE_FORMERR || LDNS_RCODE_WIRE( sldns_buffer_begin(c->buffer)) == LDNS_RCODE_NOTIMPL || packet_edns_malformed(c->buffer, sq->qtype) )) { /* try to get an answer by falling back without EDNS */ verbose(VERB_ALGO, "serviced query: attempt without EDNS"); sq->status = serviced_query_UDP_EDNS_fallback; sq->retry = 0; if(!serviced_udp_send(sq, c->buffer)) { serviced_callbacks(sq, NETEVENT_CLOSED, c, rep); } return 0; } else if(sq->status == serviced_query_UDP_EDNS && !sq->edns_lame_known) { /* now we know that edns queries received answers store that */ log_addr(VERB_ALGO, "serviced query: EDNS works for", &sq->addr, sq->addrlen); if(!infra_edns_update(outnet->infra, &sq->addr, sq->addrlen, sq->zone, sq->zonelen, 0, (time_t)now.tv_sec)) { log_err("Out of memory caching edns works"); } sq->edns_lame_known = 1; } else if(sq->status == serviced_query_UDP_EDNS_fallback && !sq->edns_lame_known && (LDNS_RCODE_WIRE( sldns_buffer_begin(c->buffer)) == LDNS_RCODE_NOERROR || LDNS_RCODE_WIRE(sldns_buffer_begin(c->buffer)) == LDNS_RCODE_NXDOMAIN || LDNS_RCODE_WIRE(sldns_buffer_begin( c->buffer)) == LDNS_RCODE_YXDOMAIN)) { /* the fallback produced a result that looks promising, note * that this server should be approached without EDNS */ /* only store noEDNS in cache if domain is noDNSSEC */ if(!sq->want_dnssec) { log_addr(VERB_ALGO, "serviced query: EDNS fails for", &sq->addr, sq->addrlen); if(!infra_edns_update(outnet->infra, &sq->addr, sq->addrlen, sq->zone, sq->zonelen, -1, (time_t)now.tv_sec)) { log_err("Out of memory caching no edns for host"); } } else { log_addr(VERB_ALGO, "serviced query: EDNS fails, but " "not stored because need DNSSEC for", &sq->addr, sq->addrlen); } sq->status = serviced_query_UDP; } if(now.tv_sec > 
sq->last_sent_time.tv_sec || (now.tv_sec == sq->last_sent_time.tv_sec && now.tv_usec > sq->last_sent_time.tv_usec)) { /* convert from microseconds to milliseconds */ int roundtime = ((int)(now.tv_sec - sq->last_sent_time.tv_sec))*1000 + ((int)now.tv_usec - (int)sq->last_sent_time.tv_usec)/1000; verbose(VERB_ALGO, "measured roundtrip at %d msec", roundtime); log_assert(roundtime >= 0); /* in case the system hibernated, do not enter a huge value, * above this value gives trouble with server selection */ if(roundtime < 60000) { if(!infra_rtt_update(outnet->infra, &sq->addr, sq->addrlen, sq->zone, sq->zonelen, sq->qtype, roundtime, sq->last_rtt, (time_t)now.tv_sec)) log_err("out of memory noting rtt."); } } /* perform TC flag check and TCP fallback after updating our * cache entries for EDNS status and RTT times */ if(LDNS_TC_WIRE(sldns_buffer_begin(c->buffer))) { /* fallback to TCP */ /* this discards partial UDP contents */ if(sq->status == serviced_query_UDP_EDNS || sq->status == serviced_query_UDP_EDNS_FRAG || sq->status == serviced_query_UDP_EDNS_fallback) /* if we have unfinished EDNS_fallback, start again */ sq->status = serviced_query_TCP_EDNS; else sq->status = serviced_query_TCP; serviced_tcp_initiate(sq, c->buffer); return 0; } /* yay! 
an answer */ serviced_callbacks(sq, error, c, rep); return 0; } struct serviced_query* outnet_serviced_query(struct outside_network* outnet, struct query_info* qinfo, uint16_t flags, int dnssec, int want_dnssec, int nocaps, int tcp_upstream, int ssl_upstream, char* tls_auth_name, struct sockaddr_storage* addr, socklen_t addrlen, uint8_t* zone, size_t zonelen, struct module_qstate* qstate, comm_point_callback_type* callback, void* callback_arg, sldns_buffer* buff, struct module_env* env) { struct serviced_query* sq; struct service_callback* cb; struct edns_string_addr* client_string_addr; if(!inplace_cb_query_call(env, qinfo, flags, addr, addrlen, zone, zonelen, qstate, qstate->region)) return NULL; if((client_string_addr = edns_string_addr_lookup( &env->edns_strings->client_strings, addr, addrlen))) { edns_opt_list_append(&qstate->edns_opts_back_out, env->edns_strings->client_string_opcode, client_string_addr->string_len, client_string_addr->string, qstate->region); } serviced_gen_query(buff, qinfo->qname, qinfo->qname_len, qinfo->qtype, qinfo->qclass, flags); sq = lookup_serviced(outnet, buff, dnssec, addr, addrlen, qstate->edns_opts_back_out); /* duplicate entries are included in the callback list, because * there is a counterpart registration by our caller that needs to * be doubly-removed (with callbacks perhaps). 
*/ if(!(cb = (struct service_callback*)malloc(sizeof(*cb)))) return NULL; if(!sq) { /* make new serviced query entry */ sq = serviced_create(outnet, buff, dnssec, want_dnssec, nocaps, tcp_upstream, ssl_upstream, tls_auth_name, addr, addrlen, zone, zonelen, (int)qinfo->qtype, qstate->edns_opts_back_out); if(!sq) { free(cb); return NULL; } /* perform first network action */ if(outnet->do_udp && !(tcp_upstream || ssl_upstream)) { if(!serviced_udp_send(sq, buff)) { (void)rbtree_delete(outnet->serviced, sq); serviced_node_del(&sq->node, NULL); free(cb); return NULL; } } else { if(!serviced_tcp_send(sq, buff)) { (void)rbtree_delete(outnet->serviced, sq); serviced_node_del(&sq->node, NULL); free(cb); return NULL; } } } /* add callback to list of callbacks */ cb->cb = callback; cb->cb_arg = callback_arg; cb->next = sq->cblist; sq->cblist = cb; return sq; } /** remove callback from list */ static void callback_list_remove(struct serviced_query* sq, void* cb_arg) { struct service_callback** pp = &sq->cblist; while(*pp) { if((*pp)->cb_arg == cb_arg) { struct service_callback* del = *pp; *pp = del->next; free(del); return; } pp = &(*pp)->next; } } void outnet_serviced_query_stop(struct serviced_query* sq, void* cb_arg) { if(!sq) return; callback_list_remove(sq, cb_arg); /* if callbacks() routine scheduled deletion, let it do that */ if(!sq->cblist && !sq->to_be_deleted) { (void)rbtree_delete(sq->outnet->serviced, sq); serviced_delete(sq); } } /** create fd to send to this destination */ static int fd_for_dest(struct outside_network* outnet, struct sockaddr_storage* to_addr, socklen_t to_addrlen) { struct sockaddr_storage* addr; socklen_t addrlen; int i, try, pnum, dscp; struct port_if* pif; /* create fd */ dscp = outnet->ip_dscp; for(try = 0; try<1000; try++) { int port = 0; int freebind = 0; int noproto = 0; int inuse = 0; int fd = -1; /* select interface */ if(addr_is_ip6(to_addr, to_addrlen)) { if(outnet->num_ip6 == 0) { char to[64]; addr_to_str(to_addr, to_addrlen, to, 
sizeof(to)); verbose(VERB_QUERY, "need ipv6 to send, but no ipv6 outgoing interfaces, for %s", to); return -1; } i = ub_random_max(outnet->rnd, outnet->num_ip6); pif = &outnet->ip6_ifs[i]; } else { if(outnet->num_ip4 == 0) { char to[64]; addr_to_str(to_addr, to_addrlen, to, sizeof(to)); verbose(VERB_QUERY, "need ipv4 to send, but no ipv4 outgoing interfaces, for %s", to); return -1; } i = ub_random_max(outnet->rnd, outnet->num_ip4); pif = &outnet->ip4_ifs[i]; } addr = &pif->addr; addrlen = pif->addrlen; #ifndef DISABLE_EXPLICIT_PORT_RANDOMISATION pnum = ub_random_max(outnet->rnd, pif->avail_total); if(pnum < pif->inuse) { /* port already open */ port = pif->out[pnum]->number; } else { /* unused ports in start part of array */ port = pif->avail_ports[pnum - pif->inuse]; } #else pnum = port = 0; #endif if(addr_is_ip6(to_addr, to_addrlen)) { struct sockaddr_in6 sa = *(struct sockaddr_in6*)addr; sa.sin6_port = (in_port_t)htons((uint16_t)port); fd = create_udp_sock(AF_INET6, SOCK_DGRAM, (struct sockaddr*)&sa, addrlen, 1, &inuse, &noproto, 0, 0, 0, NULL, 0, freebind, 0, dscp); } else { struct sockaddr_in* sa = (struct sockaddr_in*)addr; sa->sin_port = (in_port_t)htons((uint16_t)port); fd = create_udp_sock(AF_INET, SOCK_DGRAM, (struct sockaddr*)addr, addrlen, 1, &inuse, &noproto, 0, 0, 0, NULL, 0, freebind, 0, dscp); } if(fd != -1) { return fd; } if(!inuse) { return -1; } } /* too many tries */ log_err("cannot send probe, ports are in use"); return -1; } struct comm_point* outnet_comm_point_for_udp(struct outside_network* outnet, comm_point_callback_type* cb, void* cb_arg, struct sockaddr_storage* to_addr, socklen_t to_addrlen) { struct comm_point* cp; int fd = fd_for_dest(outnet, to_addr, to_addrlen); if(fd == -1) { return NULL; } cp = comm_point_create_udp(outnet->base, fd, outnet->udp_buff, cb, cb_arg); if(!cp) { log_err("malloc failure"); close(fd); return NULL; } return cp; } /** setup SSL for comm point */ static int setup_comm_ssl(struct comm_point* cp, struct 
outside_network* outnet, int fd, char* host) { cp->ssl = outgoing_ssl_fd(outnet->sslctx, fd); if(!cp->ssl) { log_err("cannot create SSL object"); return 0; } #ifdef USE_WINSOCK comm_point_tcp_win_bio_cb(cp, cp->ssl); #endif cp->ssl_shake_state = comm_ssl_shake_write; /* https verification */ #ifdef HAVE_SSL if(outnet->tls_use_sni) { (void)SSL_set_tlsext_host_name(cp->ssl, host); } #endif #ifdef HAVE_SSL_SET1_HOST if((SSL_CTX_get_verify_mode(outnet->sslctx)&SSL_VERIFY_PEER)) { /* because we set SSL_VERIFY_PEER, in netevent in * ssl_handshake, it'll check if the certificate * verification has succeeded */ /* SSL_VERIFY_PEER is set on the sslctx */ /* and the certificates to verify with are loaded into * it with SSL_load_verify_locations or * SSL_CTX_set_default_verify_paths */ /* setting the hostname makes openssl verify the * host name in the x509 certificate in the * SSL connection*/ if(!SSL_set1_host(cp->ssl, host)) { log_err("SSL_set1_host failed"); return 0; } } #elif defined(HAVE_X509_VERIFY_PARAM_SET1_HOST) /* openssl 1.0.2 has this function that can be used for * set1_host like verification */ if((SSL_CTX_get_verify_mode(outnet->sslctx)&SSL_VERIFY_PEER)) { X509_VERIFY_PARAM* param = SSL_get0_param(cp->ssl); # ifdef X509_CHECK_FLAG_NO_PARTIAL_WILDCARDS X509_VERIFY_PARAM_set_hostflags(param, X509_CHECK_FLAG_NO_PARTIAL_WILDCARDS); # endif if(!X509_VERIFY_PARAM_set1_host(param, host, strlen(host))) { log_err("X509_VERIFY_PARAM_set1_host failed"); return 0; } } #else (void)host; #endif /* HAVE_SSL_SET1_HOST */ return 1; } struct comm_point* outnet_comm_point_for_tcp(struct outside_network* outnet, comm_point_callback_type* cb, void* cb_arg, struct sockaddr_storage* to_addr, socklen_t to_addrlen, sldns_buffer* query, int timeout, int ssl, char* host) { struct comm_point* cp; int fd = outnet_get_tcp_fd(to_addr, to_addrlen, outnet->tcp_mss, outnet->ip_dscp); if(fd == -1) { return 0; } fd_set_nonblock(fd); if(!outnet_tcp_connect(fd, to_addr, to_addrlen)) { /* 
outnet_tcp_connect has closed fd on error for us */ return 0; } cp = comm_point_create_tcp_out(outnet->base, 65552, cb, cb_arg); if(!cp) { log_err("malloc failure"); close(fd); return 0; } cp->repinfo.addrlen = to_addrlen; memcpy(&cp->repinfo.addr, to_addr, to_addrlen); /* setup for SSL (if needed) */ if(ssl) { if(!setup_comm_ssl(cp, outnet, fd, host)) { log_err("cannot setup XoT"); comm_point_delete(cp); return NULL; } } /* set timeout on TCP connection */ comm_point_start_listening(cp, fd, timeout); /* copy scratch buffer to cp->buffer */ sldns_buffer_copy(cp->buffer, query); return cp; } /** setup http request headers in buffer for sending query to destination */ static int setup_http_request(sldns_buffer* buf, char* host, char* path) { sldns_buffer_clear(buf); sldns_buffer_printf(buf, "GET /%s HTTP/1.1\r\n", path); sldns_buffer_printf(buf, "Host: %s\r\n", host); sldns_buffer_printf(buf, "User-Agent: unbound/%s\r\n", PACKAGE_VERSION); /* We do not really do multiple queries per connection, * but this header setting is also not needed. * sldns_buffer_printf(buf, "Connection: close\r\n") */ sldns_buffer_printf(buf, "\r\n"); if(sldns_buffer_position(buf)+10 > sldns_buffer_capacity(buf)) return 0; /* somehow buffer too short, but it is about 60K and the request is only a couple bytes long. 
*/ sldns_buffer_flip(buf); return 1; } struct comm_point* outnet_comm_point_for_http(struct outside_network* outnet, comm_point_callback_type* cb, void* cb_arg, struct sockaddr_storage* to_addr, socklen_t to_addrlen, int timeout, int ssl, char* host, char* path) { /* cp calls cb with err=NETEVENT_DONE when transfer is done */ struct comm_point* cp; int fd = outnet_get_tcp_fd(to_addr, to_addrlen, outnet->tcp_mss, outnet->ip_dscp); if(fd == -1) { return 0; } fd_set_nonblock(fd); if(!outnet_tcp_connect(fd, to_addr, to_addrlen)) { /* outnet_tcp_connect has closed fd on error for us */ return 0; } cp = comm_point_create_http_out(outnet->base, 65552, cb, cb_arg, outnet->udp_buff); if(!cp) { log_err("malloc failure"); close(fd); return 0; } cp->repinfo.addrlen = to_addrlen; memcpy(&cp->repinfo.addr, to_addr, to_addrlen); /* setup for SSL (if needed) */ if(ssl) { if(!setup_comm_ssl(cp, outnet, fd, host)) { log_err("cannot setup https"); comm_point_delete(cp); return NULL; } } /* set timeout on TCP connection */ comm_point_start_listening(cp, fd, timeout); /* setup http request in cp->buffer */ if(!setup_http_request(cp->buffer, host, path)) { log_err("error setting up http request"); comm_point_delete(cp); return NULL; } return cp; } /** get memory used by waiting tcp entry (in use or not) */ static size_t waiting_tcp_get_mem(struct waiting_tcp* w) { size_t s; if(!w) return 0; s = sizeof(*w) + w->pkt_len; if(w->timer) s += comm_timer_get_mem(w->timer); return s; } /** get memory used by port if */ static size_t if_get_mem(struct port_if* pif) { size_t s; int i; s = sizeof(*pif) + #ifndef DISABLE_EXPLICIT_PORT_RANDOMISATION sizeof(int)*pif->avail_total + #endif sizeof(struct port_comm*)*pif->maxout; for(i=0; iinuse; i++) s += sizeof(*pif->out[i]) + comm_point_get_mem(pif->out[i]->cp); return s; } /** get memory used by waiting udp */ static size_t waiting_udp_get_mem(struct pending* w) { size_t s; s = sizeof(*w) + comm_timer_get_mem(w->timer) + w->pkt_len; return s; } 
size_t outnet_get_mem(struct outside_network* outnet) { size_t i; int k; struct waiting_tcp* w; struct pending* u; struct serviced_query* sq; struct service_callback* sb; struct port_comm* pc; size_t s = sizeof(*outnet) + sizeof(*outnet->base) + sizeof(*outnet->udp_buff) + sldns_buffer_capacity(outnet->udp_buff); /* second buffer is not ours */ for(pc = outnet->unused_fds; pc; pc = pc->next) { s += sizeof(*pc) + comm_point_get_mem(pc->cp); } for(k=0; knum_ip4; k++) s += if_get_mem(&outnet->ip4_ifs[k]); for(k=0; knum_ip6; k++) s += if_get_mem(&outnet->ip6_ifs[k]); for(u=outnet->udp_wait_first; u; u=u->next_waiting) s += waiting_udp_get_mem(u); s += sizeof(struct pending_tcp*)*outnet->num_tcp; for(i=0; inum_tcp; i++) { s += sizeof(struct pending_tcp); s += comm_point_get_mem(outnet->tcp_conns[i]->c); if(outnet->tcp_conns[i]->query) s += waiting_tcp_get_mem(outnet->tcp_conns[i]->query); } for(w=outnet->tcp_wait_first; w; w = w->next_waiting) s += waiting_tcp_get_mem(w); s += sizeof(*outnet->pending); s += (sizeof(struct pending) + comm_timer_get_mem(NULL)) * outnet->pending->count; s += sizeof(*outnet->serviced); s += outnet->svcd_overhead; RBTREE_FOR(sq, struct serviced_query*, outnet->serviced) { s += sizeof(*sq) + sq->qbuflen; for(sb = sq->cblist; sb; sb = sb->next) s += sizeof(*sb); } return s; } size_t serviced_get_mem(struct serviced_query* sq) { struct service_callback* sb; size_t s; s = sizeof(*sq) + sq->qbuflen; for(sb = sq->cblist; sb; sb = sb->next) s += sizeof(*sb); if(sq->status == serviced_query_UDP_EDNS || sq->status == serviced_query_UDP || sq->status == serviced_query_UDP_EDNS_FRAG || sq->status == serviced_query_UDP_EDNS_fallback) { s += sizeof(struct pending); s += comm_timer_get_mem(NULL); } else { /* does not have size of the pkt pointer */ /* always has a timer except on malloc failures */ /* these sizes are part of the main outside network mem */ /* s += sizeof(struct waiting_tcp); s += comm_timer_get_mem(NULL); */ } return s; } Index: 
head/contrib/unbound/util/netevent.c =================================================================== --- head/contrib/unbound/util/netevent.c (revision 368750) +++ head/contrib/unbound/util/netevent.c (revision 368751) @@ -1,4217 +1,4226 @@ /* * util/netevent.c - event notification * * Copyright (c) 2007, NLnet Labs. All rights reserved. * * This software is open source. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of the NLNET LABS nor the names of its contributors may * be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /** * \file * * This file contains event notification functions. 
*/ #include "config.h" #include "util/netevent.h" #include "util/ub_event.h" #include "util/log.h" #include "util/net_help.h" #include "util/tcp_conn_limit.h" #include "util/fptr_wlist.h" #include "sldns/pkthdr.h" #include "sldns/sbuffer.h" #include "sldns/str2wire.h" #include "dnstap/dnstap.h" #include "dnscrypt/dnscrypt.h" #include "services/listen_dnsport.h" #ifdef HAVE_OPENSSL_SSL_H #include #endif #ifdef HAVE_OPENSSL_ERR_H #include #endif /* -------- Start of local definitions -------- */ /** if CMSG_ALIGN is not defined on this platform, a workaround */ #ifndef CMSG_ALIGN # ifdef __CMSG_ALIGN # define CMSG_ALIGN(n) __CMSG_ALIGN(n) # elif defined(CMSG_DATA_ALIGN) # define CMSG_ALIGN _CMSG_DATA_ALIGN # else # define CMSG_ALIGN(len) (((len)+sizeof(long)-1) & ~(sizeof(long)-1)) # endif #endif /** if CMSG_LEN is not defined on this platform, a workaround */ #ifndef CMSG_LEN # define CMSG_LEN(len) (CMSG_ALIGN(sizeof(struct cmsghdr))+(len)) #endif /** if CMSG_SPACE is not defined on this platform, a workaround */ #ifndef CMSG_SPACE # ifdef _CMSG_HDR_ALIGN # define CMSG_SPACE(l) (CMSG_ALIGN(l)+_CMSG_HDR_ALIGN(sizeof(struct cmsghdr))) # else # define CMSG_SPACE(l) (CMSG_ALIGN(l)+CMSG_ALIGN(sizeof(struct cmsghdr))) # endif #endif /** The TCP writing query timeout in milliseconds */ #define TCP_QUERY_TIMEOUT 120000 /** The minimum actual TCP timeout to use, regardless of what we advertise, * in msec */ #define TCP_QUERY_TIMEOUT_MINIMUM 200 #ifndef NONBLOCKING_IS_BROKEN /** number of UDP reads to perform per read indication from select */ #define NUM_UDP_PER_SELECT 100 #else #define NUM_UDP_PER_SELECT 1 #endif /** * The internal event structure for keeping ub_event info for the event. * Possibly other structures (list, tree) this is part of. */ struct internal_event { /** the comm base */ struct comm_base* base; /** ub_event event type */ struct ub_event* ev; }; /** * Internal base structure, so that every thread has its own events. 
*/ struct internal_base { /** ub_event event_base type. */ struct ub_event_base* base; /** seconds time pointer points here */ time_t secs; /** timeval with current time */ struct timeval now; /** the event used for slow_accept timeouts */ struct ub_event* slow_accept; /** true if slow_accept is enabled */ int slow_accept_enabled; }; /** * Internal timer structure, to store timer event in. */ struct internal_timer { /** the super struct from which derived */ struct comm_timer super; /** the comm base */ struct comm_base* base; /** ub_event event type */ struct ub_event* ev; /** is timer enabled */ uint8_t enabled; }; /** * Internal signal structure, to store signal event in. */ struct internal_signal { /** ub_event event type */ struct ub_event* ev; /** next in signal list */ struct internal_signal* next; }; /** create a tcp handler with a parent */ static struct comm_point* comm_point_create_tcp_handler( struct comm_base *base, struct comm_point* parent, size_t bufsize, struct sldns_buffer* spoolbuf, comm_point_callback_type* callback, void* callback_arg); /* -------- End of local definitions -------- */ struct comm_base* comm_base_create(int sigs) { struct comm_base* b = (struct comm_base*)calloc(1, sizeof(struct comm_base)); const char *evnm="event", *evsys="", *evmethod=""; if(!b) return NULL; b->eb = (struct internal_base*)calloc(1, sizeof(struct internal_base)); if(!b->eb) { free(b); return NULL; } b->eb->base = ub_default_event_base(sigs, &b->eb->secs, &b->eb->now); if(!b->eb->base) { free(b->eb); free(b); return NULL; } ub_comm_base_now(b); ub_get_event_sys(b->eb->base, &evnm, &evsys, &evmethod); verbose(VERB_ALGO, "%s %s uses %s method.", evnm, evsys, evmethod); return b; } struct comm_base* comm_base_create_event(struct ub_event_base* base) { struct comm_base* b = (struct comm_base*)calloc(1, sizeof(struct comm_base)); if(!b) return NULL; b->eb = (struct internal_base*)calloc(1, sizeof(struct internal_base)); if(!b->eb) { free(b); return NULL; } 
b->eb->base = base; ub_comm_base_now(b); return b; } void comm_base_delete(struct comm_base* b) { if(!b) return; if(b->eb->slow_accept_enabled) { if(ub_event_del(b->eb->slow_accept) != 0) { log_err("could not event_del slow_accept"); } ub_event_free(b->eb->slow_accept); } ub_event_base_free(b->eb->base); b->eb->base = NULL; free(b->eb); free(b); } void comm_base_delete_no_base(struct comm_base* b) { if(!b) return; if(b->eb->slow_accept_enabled) { if(ub_event_del(b->eb->slow_accept) != 0) { log_err("could not event_del slow_accept"); } ub_event_free(b->eb->slow_accept); } b->eb->base = NULL; free(b->eb); free(b); } void comm_base_timept(struct comm_base* b, time_t** tt, struct timeval** tv) { *tt = &b->eb->secs; *tv = &b->eb->now; } void comm_base_dispatch(struct comm_base* b) { int retval; retval = ub_event_base_dispatch(b->eb->base); if(retval < 0) { fatal_exit("event_dispatch returned error %d, " "errno is %s", retval, strerror(errno)); } } void comm_base_exit(struct comm_base* b) { if(ub_event_base_loopexit(b->eb->base) != 0) { log_err("Could not loopexit"); } } void comm_base_set_slow_accept_handlers(struct comm_base* b, void (*stop_acc)(void*), void (*start_acc)(void*), void* arg) { b->stop_accept = stop_acc; b->start_accept = start_acc; b->cb_arg = arg; } struct ub_event_base* comm_base_internal(struct comm_base* b) { return b->eb->base; } /** see if errno for udp has to be logged or not uses globals */ static int udp_send_errno_needs_log(struct sockaddr* addr, socklen_t addrlen) { /* do not log transient errors (unless high verbosity) */ #if defined(ENETUNREACH) || defined(EHOSTDOWN) || defined(EHOSTUNREACH) || defined(ENETDOWN) switch(errno) { # ifdef ENETUNREACH case ENETUNREACH: # endif # ifdef EHOSTDOWN case EHOSTDOWN: # endif # ifdef EHOSTUNREACH case EHOSTUNREACH: # endif # ifdef ENETDOWN case ENETDOWN: # endif if(verbosity < VERB_ALGO) return 0; default: break; } #endif /* permission denied is gotten for every send if the * network is disconnected (on 
some OS), squelch it */ if( ((errno == EPERM) # ifdef EADDRNOTAVAIL /* 'Cannot assign requested address' also when disconnected */ || (errno == EADDRNOTAVAIL) # endif ) && verbosity < VERB_DETAIL) return 0; # ifdef EADDRINUSE /* If SO_REUSEADDR is set, we could try to connect to the same server * from the same source port twice. */ if(errno == EADDRINUSE && verbosity < VERB_DETAIL) return 0; # endif /* squelch errors where people deploy AAAA ::ffff:bla for * authority servers, which we try for intranets. */ if(errno == EINVAL && addr_is_ip4mapped( (struct sockaddr_storage*)addr, addrlen) && verbosity < VERB_DETAIL) return 0; /* SO_BROADCAST sockopt can give access to 255.255.255.255, * but a dns cache does not need it. */ if(errno == EACCES && addr_is_broadcast( (struct sockaddr_storage*)addr, addrlen) && verbosity < VERB_DETAIL) return 0; return 1; } int tcp_connect_errno_needs_log(struct sockaddr* addr, socklen_t addrlen) { return udp_send_errno_needs_log(addr, addrlen); } /* send a UDP reply */ int comm_point_send_udp_msg(struct comm_point *c, sldns_buffer* packet, - struct sockaddr* addr, socklen_t addrlen) + struct sockaddr* addr, socklen_t addrlen, int is_connected) { ssize_t sent; log_assert(c->fd != -1); #ifdef UNBOUND_DEBUG if(sldns_buffer_remaining(packet) == 0) log_err("error: send empty UDP packet"); #endif - if(addr) { - log_assert(addr && addrlen > 0); + log_assert(addr && addrlen > 0); + if(!is_connected) { sent = sendto(c->fd, (void*)sldns_buffer_begin(packet), sldns_buffer_remaining(packet), 0, addr, addrlen); } else { sent = send(c->fd, (void*)sldns_buffer_begin(packet), sldns_buffer_remaining(packet), 0); } if(sent == -1) { /* try again and block, waiting for IO to complete, * we want to send the answer, and we will wait for * the ethernet interface buffer to have space. 
*/ #ifndef USE_WINSOCK if(errno == EAGAIN || # ifdef EWOULDBLOCK errno == EWOULDBLOCK || # endif errno == ENOBUFS) { #else if(WSAGetLastError() == WSAEINPROGRESS || WSAGetLastError() == WSAENOBUFS || WSAGetLastError() == WSAEWOULDBLOCK) { #endif int e; fd_set_block(c->fd); - sent = sendto(c->fd, (void*)sldns_buffer_begin(packet), - sldns_buffer_remaining(packet), 0, - addr, addrlen); + if (!is_connected) { + sent = sendto(c->fd, (void*)sldns_buffer_begin(packet), + sldns_buffer_remaining(packet), 0, + addr, addrlen); + } else { + sent = send(c->fd, (void*)sldns_buffer_begin(packet), + sldns_buffer_remaining(packet), 0); + } e = errno; fd_set_nonblock(c->fd); errno = e; } } if(sent == -1) { if(!udp_send_errno_needs_log(addr, addrlen)) return 0; - verbose(VERB_OPS, "sendto failed: %s", sock_strerror(errno)); - log_addr(VERB_OPS, "remote address is", + if (!is_connected) { + verbose(VERB_OPS, "sendto failed: %s", sock_strerror(errno)); + } else { + verbose(VERB_OPS, "send failed: %s", sock_strerror(errno)); + } + log_addr(VERB_OPS, "remote address is", (struct sockaddr_storage*)addr, addrlen); return 0; } else if((size_t)sent != sldns_buffer_remaining(packet)) { log_err("sent %d in place of %d bytes", (int)sent, (int)sldns_buffer_remaining(packet)); return 0; } return 1; } #if defined(AF_INET6) && defined(IPV6_PKTINFO) && (defined(HAVE_RECVMSG) || defined(HAVE_SENDMSG)) /** print debug ancillary info */ static void p_ancil(const char* str, struct comm_reply* r) { if(r->srctype != 4 && r->srctype != 6) { log_info("%s: unknown srctype %d", str, r->srctype); return; } if(r->srctype == 6) { char buf[1024]; if(inet_ntop(AF_INET6, &r->pktinfo.v6info.ipi6_addr, buf, (socklen_t)sizeof(buf)) == 0) { (void)strlcpy(buf, "(inet_ntop error)", sizeof(buf)); } buf[sizeof(buf)-1]=0; log_info("%s: %s %d", str, buf, r->pktinfo.v6info.ipi6_ifindex); } else if(r->srctype == 4) { #ifdef IP_PKTINFO char buf1[1024], buf2[1024]; if(inet_ntop(AF_INET, &r->pktinfo.v4info.ipi_addr, buf1, 
(socklen_t)sizeof(buf1)) == 0) { (void)strlcpy(buf1, "(inet_ntop error)", sizeof(buf1)); } buf1[sizeof(buf1)-1]=0; #ifdef HAVE_STRUCT_IN_PKTINFO_IPI_SPEC_DST if(inet_ntop(AF_INET, &r->pktinfo.v4info.ipi_spec_dst, buf2, (socklen_t)sizeof(buf2)) == 0) { (void)strlcpy(buf2, "(inet_ntop error)", sizeof(buf2)); } buf2[sizeof(buf2)-1]=0; #else buf2[0]=0; #endif log_info("%s: %d %s %s", str, r->pktinfo.v4info.ipi_ifindex, buf1, buf2); #elif defined(IP_RECVDSTADDR) char buf1[1024]; if(inet_ntop(AF_INET, &r->pktinfo.v4addr, buf1, (socklen_t)sizeof(buf1)) == 0) { (void)strlcpy(buf1, "(inet_ntop error)", sizeof(buf1)); } buf1[sizeof(buf1)-1]=0; log_info("%s: %s", str, buf1); #endif /* IP_PKTINFO or PI_RECVDSTDADDR */ } } #endif /* AF_INET6 && IPV6_PKTINFO && HAVE_RECVMSG||HAVE_SENDMSG */ /** send a UDP reply over specified interface*/ static int comm_point_send_udp_msg_if(struct comm_point *c, sldns_buffer* packet, struct sockaddr* addr, socklen_t addrlen, struct comm_reply* r) { #if defined(AF_INET6) && defined(IPV6_PKTINFO) && defined(HAVE_SENDMSG) ssize_t sent; struct msghdr msg; struct iovec iov[1]; union { struct cmsghdr hdr; char buf[256]; } control; #ifndef S_SPLINT_S struct cmsghdr *cmsg; #endif /* S_SPLINT_S */ log_assert(c->fd != -1); #ifdef UNBOUND_DEBUG if(sldns_buffer_remaining(packet) == 0) log_err("error: send empty UDP packet"); #endif log_assert(addr && addrlen > 0); msg.msg_name = addr; msg.msg_namelen = addrlen; iov[0].iov_base = sldns_buffer_begin(packet); iov[0].iov_len = sldns_buffer_remaining(packet); msg.msg_iov = iov; msg.msg_iovlen = 1; msg.msg_control = control.buf; #ifndef S_SPLINT_S msg.msg_controllen = sizeof(control.buf); #endif /* S_SPLINT_S */ msg.msg_flags = 0; #ifndef S_SPLINT_S cmsg = CMSG_FIRSTHDR(&msg); if(r->srctype == 4) { #ifdef IP_PKTINFO void* cmsg_data; msg.msg_controllen = CMSG_SPACE(sizeof(struct in_pktinfo)); log_assert(msg.msg_controllen <= sizeof(control.buf)); cmsg->cmsg_level = IPPROTO_IP; cmsg->cmsg_type = IP_PKTINFO; 
memmove(CMSG_DATA(cmsg), &r->pktinfo.v4info, sizeof(struct in_pktinfo)); /* unset the ifindex to not bypass the routing tables */ cmsg_data = CMSG_DATA(cmsg); ((struct in_pktinfo *) cmsg_data)->ipi_ifindex = 0; cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo)); #elif defined(IP_SENDSRCADDR) msg.msg_controllen = CMSG_SPACE(sizeof(struct in_addr)); log_assert(msg.msg_controllen <= sizeof(control.buf)); cmsg->cmsg_level = IPPROTO_IP; cmsg->cmsg_type = IP_SENDSRCADDR; memmove(CMSG_DATA(cmsg), &r->pktinfo.v4addr, sizeof(struct in_addr)); cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_addr)); #else verbose(VERB_ALGO, "no IP_PKTINFO or IP_SENDSRCADDR"); msg.msg_control = NULL; #endif /* IP_PKTINFO or IP_SENDSRCADDR */ } else if(r->srctype == 6) { void* cmsg_data; msg.msg_controllen = CMSG_SPACE(sizeof(struct in6_pktinfo)); log_assert(msg.msg_controllen <= sizeof(control.buf)); cmsg->cmsg_level = IPPROTO_IPV6; cmsg->cmsg_type = IPV6_PKTINFO; memmove(CMSG_DATA(cmsg), &r->pktinfo.v6info, sizeof(struct in6_pktinfo)); /* unset the ifindex to not bypass the routing tables */ cmsg_data = CMSG_DATA(cmsg); ((struct in6_pktinfo *) cmsg_data)->ipi6_ifindex = 0; cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo)); } else { /* try to pass all 0 to use default route */ msg.msg_controllen = CMSG_SPACE(sizeof(struct in6_pktinfo)); log_assert(msg.msg_controllen <= sizeof(control.buf)); cmsg->cmsg_level = IPPROTO_IPV6; cmsg->cmsg_type = IPV6_PKTINFO; memset(CMSG_DATA(cmsg), 0, sizeof(struct in6_pktinfo)); cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo)); } #endif /* S_SPLINT_S */ if(verbosity >= VERB_ALGO) p_ancil("send_udp over interface", r); sent = sendmsg(c->fd, &msg, 0); if(sent == -1) { /* try again and block, waiting for IO to complete, * we want to send the answer, and we will wait for * the ethernet interface buffer to have space. 
*/ #ifndef USE_WINSOCK if(errno == EAGAIN || # ifdef EWOULDBLOCK errno == EWOULDBLOCK || # endif errno == ENOBUFS) { #else if(WSAGetLastError() == WSAEINPROGRESS || WSAGetLastError() == WSAENOBUFS || WSAGetLastError() == WSAEWOULDBLOCK) { #endif int e; fd_set_block(c->fd); sent = sendmsg(c->fd, &msg, 0); e = errno; fd_set_nonblock(c->fd); errno = e; } } if(sent == -1) { if(!udp_send_errno_needs_log(addr, addrlen)) return 0; verbose(VERB_OPS, "sendmsg failed: %s", strerror(errno)); log_addr(VERB_OPS, "remote address is", (struct sockaddr_storage*)addr, addrlen); #ifdef __NetBSD__ /* netbsd 7 has IP_PKTINFO for recv but not send */ if(errno == EINVAL && r->srctype == 4) log_err("sendmsg: No support for sendmsg(IP_PKTINFO). " "Please disable interface-automatic"); #endif return 0; } else if((size_t)sent != sldns_buffer_remaining(packet)) { log_err("sent %d in place of %d bytes", (int)sent, (int)sldns_buffer_remaining(packet)); return 0; } return 1; #else (void)c; (void)packet; (void)addr; (void)addrlen; (void)r; log_err("sendmsg: IPV6_PKTINFO not supported"); return 0; #endif /* AF_INET6 && IPV6_PKTINFO && HAVE_SENDMSG */ } /** return true is UDP receive error needs to be logged */ static int udp_recv_needs_log(int err) { switch(err) { case ECONNREFUSED: # ifdef ENETUNREACH case ENETUNREACH: # endif # ifdef EHOSTDOWN case EHOSTDOWN: # endif # ifdef EHOSTUNREACH case EHOSTUNREACH: # endif # ifdef ENETDOWN case ENETDOWN: # endif if(verbosity >= VERB_ALGO) return 1; return 0; default: break; } return 1; } void comm_point_udp_ancil_callback(int fd, short event, void* arg) { #if defined(AF_INET6) && defined(IPV6_PKTINFO) && defined(HAVE_RECVMSG) struct comm_reply rep; struct msghdr msg; struct iovec iov[1]; ssize_t rcv; union { struct cmsghdr hdr; char buf[256]; } ancil; int i; #ifndef S_SPLINT_S struct cmsghdr* cmsg; #endif /* S_SPLINT_S */ rep.c = (struct comm_point*)arg; log_assert(rep.c->type == comm_udp); if(!(event&UB_EV_READ)) return; log_assert(rep.c && 
rep.c->buffer && rep.c->fd == fd); ub_comm_base_now(rep.c->ev->base); for(i=0; ibuffer); rep.addrlen = (socklen_t)sizeof(rep.addr); log_assert(fd != -1); log_assert(sldns_buffer_remaining(rep.c->buffer) > 0); msg.msg_name = &rep.addr; msg.msg_namelen = (socklen_t)sizeof(rep.addr); iov[0].iov_base = sldns_buffer_begin(rep.c->buffer); iov[0].iov_len = sldns_buffer_remaining(rep.c->buffer); msg.msg_iov = iov; msg.msg_iovlen = 1; msg.msg_control = ancil.buf; #ifndef S_SPLINT_S msg.msg_controllen = sizeof(ancil.buf); #endif /* S_SPLINT_S */ msg.msg_flags = 0; rcv = recvmsg(fd, &msg, 0); if(rcv == -1) { if(errno != EAGAIN && errno != EINTR && udp_recv_needs_log(errno)) { log_err("recvmsg failed: %s", strerror(errno)); } return; } rep.addrlen = msg.msg_namelen; sldns_buffer_skip(rep.c->buffer, rcv); sldns_buffer_flip(rep.c->buffer); rep.srctype = 0; #ifndef S_SPLINT_S for(cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL; cmsg = CMSG_NXTHDR(&msg, cmsg)) { if( cmsg->cmsg_level == IPPROTO_IPV6 && cmsg->cmsg_type == IPV6_PKTINFO) { rep.srctype = 6; memmove(&rep.pktinfo.v6info, CMSG_DATA(cmsg), sizeof(struct in6_pktinfo)); break; #ifdef IP_PKTINFO } else if( cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_PKTINFO) { rep.srctype = 4; memmove(&rep.pktinfo.v4info, CMSG_DATA(cmsg), sizeof(struct in_pktinfo)); break; #elif defined(IP_RECVDSTADDR) } else if( cmsg->cmsg_level == IPPROTO_IP && cmsg->cmsg_type == IP_RECVDSTADDR) { rep.srctype = 4; memmove(&rep.pktinfo.v4addr, CMSG_DATA(cmsg), sizeof(struct in_addr)); break; #endif /* IP_PKTINFO or IP_RECVDSTADDR */ } } if(verbosity >= VERB_ALGO) p_ancil("receive_udp on interface", &rep); #endif /* S_SPLINT_S */ fptr_ok(fptr_whitelist_comm_point(rep.c->callback)); if((*rep.c->callback)(rep.c, rep.c->cb_arg, NETEVENT_NOERROR, &rep)) { /* send back immediate reply */ (void)comm_point_send_udp_msg_if(rep.c, rep.c->buffer, (struct sockaddr*)&rep.addr, rep.addrlen, &rep); } if(!rep.c || rep.c->fd == -1) /* commpoint closed */ break; } #else 
(void)fd; (void)event; (void)arg; fatal_exit("recvmsg: No support for IPV6_PKTINFO; IP_PKTINFO or IP_RECVDSTADDR. " "Please disable interface-automatic"); #endif /* AF_INET6 && IPV6_PKTINFO && HAVE_RECVMSG */ } void comm_point_udp_callback(int fd, short event, void* arg) { struct comm_reply rep; ssize_t rcv; int i; struct sldns_buffer *buffer; rep.c = (struct comm_point*)arg; log_assert(rep.c->type == comm_udp); if(!(event&UB_EV_READ)) return; log_assert(rep.c && rep.c->buffer && rep.c->fd == fd); ub_comm_base_now(rep.c->ev->base); for(i=0; ibuffer); rep.addrlen = (socklen_t)sizeof(rep.addr); log_assert(fd != -1); log_assert(sldns_buffer_remaining(rep.c->buffer) > 0); rcv = recvfrom(fd, (void*)sldns_buffer_begin(rep.c->buffer), sldns_buffer_remaining(rep.c->buffer), 0, (struct sockaddr*)&rep.addr, &rep.addrlen); if(rcv == -1) { #ifndef USE_WINSOCK if(errno != EAGAIN && errno != EINTR && udp_recv_needs_log(errno)) log_err("recvfrom %d failed: %s", fd, strerror(errno)); #else if(WSAGetLastError() != WSAEINPROGRESS && WSAGetLastError() != WSAECONNRESET && WSAGetLastError()!= WSAEWOULDBLOCK) log_err("recvfrom failed: %s", wsa_strerror(WSAGetLastError())); #endif return; } sldns_buffer_skip(rep.c->buffer, rcv); sldns_buffer_flip(rep.c->buffer); rep.srctype = 0; fptr_ok(fptr_whitelist_comm_point(rep.c->callback)); if((*rep.c->callback)(rep.c, rep.c->cb_arg, NETEVENT_NOERROR, &rep)) { /* send back immediate reply */ #ifdef USE_DNSCRYPT buffer = rep.c->dnscrypt_buffer; #else buffer = rep.c->buffer; #endif (void)comm_point_send_udp_msg(rep.c, buffer, - (struct sockaddr*)&rep.addr, rep.addrlen); + (struct sockaddr*)&rep.addr, rep.addrlen, 0); } if(!rep.c || rep.c->fd != fd) /* commpoint closed to -1 or reused for another UDP port. Note rep.c cannot be reused with TCP fd. 
*/ break; } } /** Use a new tcp handler for new query fd, set to read query */ static void setup_tcp_handler(struct comm_point* c, int fd, int cur, int max) { int handler_usage; log_assert(c->type == comm_tcp || c->type == comm_http); log_assert(c->fd == -1); sldns_buffer_clear(c->buffer); #ifdef USE_DNSCRYPT if (c->dnscrypt) sldns_buffer_clear(c->dnscrypt_buffer); #endif c->tcp_is_reading = 1; c->tcp_byte_count = 0; /* if more than half the tcp handlers are in use, use a shorter * timeout for this TCP connection, we need to make space for * other connections to be able to get attention */ /* If > 50% TCP handler structures in use, set timeout to 1/100th * configured value. * If > 65%TCP handler structures in use, set to 1/500th configured * value. * If > 80% TCP handler structures in use, set to 0. * * If the timeout to use falls below 200 milliseconds, an actual * timeout of 200ms is used. */ handler_usage = (cur * 100) / max; if(handler_usage > 50 && handler_usage <= 65) c->tcp_timeout_msec /= 100; else if (handler_usage > 65 && handler_usage <= 80) c->tcp_timeout_msec /= 500; else if (handler_usage > 80) c->tcp_timeout_msec = 0; comm_point_start_listening(c, fd, c->tcp_timeout_msec < TCP_QUERY_TIMEOUT_MINIMUM ? 
TCP_QUERY_TIMEOUT_MINIMUM : c->tcp_timeout_msec); } void comm_base_handle_slow_accept(int ATTR_UNUSED(fd), short ATTR_UNUSED(event), void* arg) { struct comm_base* b = (struct comm_base*)arg; /* timeout for the slow accept, re-enable accepts again */ if(b->start_accept) { verbose(VERB_ALGO, "wait is over, slow accept disabled"); fptr_ok(fptr_whitelist_start_accept(b->start_accept)); (*b->start_accept)(b->cb_arg); b->eb->slow_accept_enabled = 0; } } int comm_point_perform_accept(struct comm_point* c, struct sockaddr_storage* addr, socklen_t* addrlen) { int new_fd; *addrlen = (socklen_t)sizeof(*addr); #ifndef HAVE_ACCEPT4 new_fd = accept(c->fd, (struct sockaddr*)addr, addrlen); #else /* SOCK_NONBLOCK saves extra calls to fcntl for the same result */ new_fd = accept4(c->fd, (struct sockaddr*)addr, addrlen, SOCK_NONBLOCK); #endif if(new_fd == -1) { #ifndef USE_WINSOCK /* EINTR is signal interrupt. others are closed connection. */ if( errno == EINTR || errno == EAGAIN #ifdef EWOULDBLOCK || errno == EWOULDBLOCK #endif #ifdef ECONNABORTED || errno == ECONNABORTED #endif #ifdef EPROTO || errno == EPROTO #endif /* EPROTO */ ) return -1; #if defined(ENFILE) && defined(EMFILE) if(errno == ENFILE || errno == EMFILE) { /* out of file descriptors, likely outside of our * control. stop accept() calls for some time */ if(c->ev->base->stop_accept) { struct comm_base* b = c->ev->base; struct timeval tv; verbose(VERB_ALGO, "out of file descriptors: " "slow accept"); b->eb->slow_accept_enabled = 1; fptr_ok(fptr_whitelist_stop_accept( b->stop_accept)); (*b->stop_accept)(b->cb_arg); /* set timeout, no mallocs */ tv.tv_sec = NETEVENT_SLOW_ACCEPT_TIME/1000; tv.tv_usec = (NETEVENT_SLOW_ACCEPT_TIME%1000)*1000; b->eb->slow_accept = ub_event_new(b->eb->base, -1, UB_EV_TIMEOUT, comm_base_handle_slow_accept, b); if(b->eb->slow_accept == NULL) { /* we do not want to log here, because * that would spam the logfiles. * error: "event_base_set failed." 
*/ } else if(ub_event_add(b->eb->slow_accept, &tv) != 0) { /* we do not want to log here, * error: "event_add failed." */ } } return -1; } #endif #else /* USE_WINSOCK */ if(WSAGetLastError() == WSAEINPROGRESS || WSAGetLastError() == WSAECONNRESET) return -1; if(WSAGetLastError() == WSAEWOULDBLOCK) { ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ); return -1; } #endif log_err_addr("accept failed", sock_strerror(errno), addr, *addrlen); return -1; } if(c->tcp_conn_limit && c->type == comm_tcp_accept) { c->tcl_addr = tcl_addr_lookup(c->tcp_conn_limit, addr, *addrlen); if(!tcl_new_connection(c->tcl_addr)) { if(verbosity >= 3) log_err_addr("accept rejected", "connection limit exceeded", addr, *addrlen); close(new_fd); return -1; } } #ifndef HAVE_ACCEPT4 fd_set_nonblock(new_fd); #endif return new_fd; } #ifdef USE_WINSOCK static long win_bio_cb(BIO *b, int oper, const char* ATTR_UNUSED(argp), int ATTR_UNUSED(argi), long argl, long retvalue) { int wsa_err = WSAGetLastError(); /* store errcode before it is gone */ verbose(VERB_ALGO, "bio_cb %d, %s %s %s", oper, (oper&BIO_CB_RETURN)?"return":"before", (oper&BIO_CB_READ)?"read":((oper&BIO_CB_WRITE)?"write":"other"), wsa_err==WSAEWOULDBLOCK?"wsawb":""); /* on windows, check if previous operation caused EWOULDBLOCK */ if( (oper == (BIO_CB_READ|BIO_CB_RETURN) && argl == 0) || (oper == (BIO_CB_GETS|BIO_CB_RETURN) && argl == 0)) { if(wsa_err == WSAEWOULDBLOCK) ub_winsock_tcp_wouldblock((struct ub_event*) BIO_get_callback_arg(b), UB_EV_READ); } if( (oper == (BIO_CB_WRITE|BIO_CB_RETURN) && argl == 0) || (oper == (BIO_CB_PUTS|BIO_CB_RETURN) && argl == 0)) { if(wsa_err == WSAEWOULDBLOCK) ub_winsock_tcp_wouldblock((struct ub_event*) BIO_get_callback_arg(b), UB_EV_WRITE); } /* return original return value */ return retvalue; } /** set win bio callbacks for nonblocking operations */ void comm_point_tcp_win_bio_cb(struct comm_point* c, void* thessl) { SSL* ssl = (SSL*)thessl; /* set them both just in case, but usually they are the same 
BIO */ BIO_set_callback(SSL_get_rbio(ssl), &win_bio_cb); BIO_set_callback_arg(SSL_get_rbio(ssl), (char*)c->ev->ev); BIO_set_callback(SSL_get_wbio(ssl), &win_bio_cb); BIO_set_callback_arg(SSL_get_wbio(ssl), (char*)c->ev->ev); } #endif #ifdef HAVE_NGHTTP2 /** Create http2 session server. Per connection, after TCP accepted.*/ static int http2_session_server_create(struct http2_session* h2_session) { log_assert(h2_session->callbacks); h2_session->is_drop = 0; if(nghttp2_session_server_new(&h2_session->session, h2_session->callbacks, h2_session) == NGHTTP2_ERR_NOMEM) { log_err("failed to create nghttp2 session server"); return 0; } return 1; } /** Submit http2 setting to session. Once per session. */ static int http2_submit_settings(struct http2_session* h2_session) { int ret; nghttp2_settings_entry settings[1] = { {NGHTTP2_SETTINGS_MAX_CONCURRENT_STREAMS, h2_session->c->http2_max_streams}}; ret = nghttp2_submit_settings(h2_session->session, NGHTTP2_FLAG_NONE, settings, 1); if(ret) { verbose(VERB_QUERY, "http2: submit_settings failed, " "error: %s", nghttp2_strerror(ret)); return 0; } return 1; } #endif /* HAVE_NGHTTP2 */ void comm_point_tcp_accept_callback(int fd, short event, void* arg) { struct comm_point* c = (struct comm_point*)arg, *c_hdl; int new_fd; log_assert(c->type == comm_tcp_accept); if(!(event & UB_EV_READ)) { log_info("ignoring tcp accept event %d", (int)event); return; } ub_comm_base_now(c->ev->base); /* find free tcp handler. */ if(!c->tcp_free) { log_warn("accepted too many tcp, connections full"); return; } /* accept incoming connection. 
*/ c_hdl = c->tcp_free; /* clear leftover flags from previous use, and then set the * correct event base for the event structure for libevent */ ub_event_free(c_hdl->ev->ev); if((c_hdl->type == comm_tcp && c_hdl->tcp_req_info) || c_hdl->type == comm_local || c_hdl->type == comm_raw) c_hdl->tcp_do_toggle_rw = 0; else c_hdl->tcp_do_toggle_rw = 1; if(c_hdl->type == comm_http) { #ifdef HAVE_NGHTTP2 if(!c_hdl->h2_session || !http2_session_server_create(c_hdl->h2_session)) { log_warn("failed to create nghttp2"); return; } if(!c_hdl->h2_session || !http2_submit_settings(c_hdl->h2_session)) { log_warn("failed to submit http2 settings"); return; } if(!c->ssl) { c_hdl->tcp_do_toggle_rw = 0; c_hdl->use_h2 = 1; } #endif c_hdl->ev->ev = ub_event_new(c_hdl->ev->base->eb->base, -1, UB_EV_PERSIST | UB_EV_READ | UB_EV_TIMEOUT, comm_point_http_handle_callback, c_hdl); } else { c_hdl->ev->ev = ub_event_new(c_hdl->ev->base->eb->base, -1, UB_EV_PERSIST | UB_EV_READ | UB_EV_TIMEOUT, comm_point_tcp_handle_callback, c_hdl); } if(!c_hdl->ev->ev) { log_warn("could not ub_event_new, dropped tcp"); return; } log_assert(fd != -1); (void)fd; new_fd = comm_point_perform_accept(c, &c_hdl->repinfo.addr, &c_hdl->repinfo.addrlen); if(new_fd == -1) return; if(c->ssl) { c_hdl->ssl = incoming_ssl_fd(c->ssl, new_fd); if(!c_hdl->ssl) { c_hdl->fd = new_fd; comm_point_close(c_hdl); return; } c_hdl->ssl_shake_state = comm_ssl_shake_read; #ifdef USE_WINSOCK comm_point_tcp_win_bio_cb(c_hdl, c_hdl->ssl); #endif } /* grab the tcp handler buffers */ c->cur_tcp_count++; c->tcp_free = c_hdl->tcp_free; if(!c->tcp_free) { /* stop accepting incoming queries for now. 
*/ comm_point_stop_listening(c); } setup_tcp_handler(c_hdl, new_fd, c->cur_tcp_count, c->max_tcp_count); } /** Make tcp handler free for next assignment */ static void reclaim_tcp_handler(struct comm_point* c) { log_assert(c->type == comm_tcp); if(c->ssl) { #ifdef HAVE_SSL SSL_shutdown(c->ssl); SSL_free(c->ssl); c->ssl = NULL; #endif } comm_point_close(c); if(c->tcp_parent) { c->tcp_parent->cur_tcp_count--; c->tcp_free = c->tcp_parent->tcp_free; c->tcp_parent->tcp_free = c; if(!c->tcp_free) { /* re-enable listening on accept socket */ comm_point_start_listening(c->tcp_parent, -1, -1); } } c->tcp_more_read_again = NULL; c->tcp_more_write_again = NULL; } /** do the callback when writing is done */ static void tcp_callback_writer(struct comm_point* c) { log_assert(c->type == comm_tcp); if(!c->tcp_write_and_read) { sldns_buffer_clear(c->buffer); c->tcp_byte_count = 0; } if(c->tcp_do_toggle_rw) c->tcp_is_reading = 1; /* switch from listening(write) to listening(read) */ if(c->tcp_req_info) { tcp_req_info_handle_writedone(c->tcp_req_info); } else { comm_point_stop_listening(c); if(c->tcp_write_and_read) { fptr_ok(fptr_whitelist_comm_point(c->callback)); if( (*c->callback)(c, c->cb_arg, NETEVENT_PKT_WRITTEN, &c->repinfo) ) { comm_point_start_listening(c, -1, c->tcp_timeout_msec); } } else { comm_point_start_listening(c, -1, c->tcp_timeout_msec); } } } /** do the callback when reading is done */ static void tcp_callback_reader(struct comm_point* c) { log_assert(c->type == comm_tcp || c->type == comm_local); sldns_buffer_flip(c->buffer); if(c->tcp_do_toggle_rw) c->tcp_is_reading = 0; c->tcp_byte_count = 0; if(c->tcp_req_info) { tcp_req_info_handle_readdone(c->tcp_req_info); } else { if(c->type == comm_tcp) comm_point_stop_listening(c); fptr_ok(fptr_whitelist_comm_point(c->callback)); if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo) ) { comm_point_start_listening(c, -1, c->tcp_timeout_msec); } } } #ifdef HAVE_SSL /** true if the ssl handshake error has to be 
squelched from the logs */
int
squelch_err_ssl_handshake(unsigned long err)
{
	if(verbosity >= VERB_QUERY)
		return 0; /* only squelch on low verbosity */
	/* this is very specific, we could filter on ERR_GET_REASON()
	 * (the third element in ERR_PACK) */
	/* returns 1 only for the exact library/function/reason codes listed
	 * here; some entries are compiled in only when the OpenSSL headers
	 * define the corresponding symbol */
	if(err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTPS_PROXY_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_HTTP_REQUEST) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_GET_RECORD, SSL_R_WRONG_VERSION_NUMBER) ||
		err == ERR_PACK(ERR_LIB_SSL, SSL_F_SSL3_READ_BYTES, SSL_R_SSLV3_ALERT_BAD_CERTIFICATE)
#ifdef SSL_F_TLS_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_POST_PROCESS_CLIENT_HELLO, SSL_R_NO_SHARED_CIPHER)
#endif
#ifdef SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNKNOWN_PROTOCOL)
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_UNSUPPORTED_PROTOCOL)
#  ifdef SSL_R_VERSION_TOO_LOW
		|| err == ERR_PACK(ERR_LIB_SSL, SSL_F_TLS_EARLY_POST_PROCESS_CLIENT_HELLO, SSL_R_VERSION_TOO_LOW)
#  endif
#endif
		)
		return 1;
	return 0;
}
#endif /* HAVE_SSL */

/** continue ssl handshake
 * Drives SSL_do_handshake() for the comm point and updates
 * c->ssl_shake_state and the read/write event registration to match what
 * the SSL library asks for.
 * @param c: comm point with c->ssl set.
 * @return 0 when the connection is closed or the handshake failed,
 *	1 otherwise (handshake finished, or it has to wait for an event;
 *	callers check c->ssl_shake_state to tell these apart).
 */
#ifdef HAVE_SSL
static int
ssl_handshake(struct comm_point* c)
{
	int r;
	/* the hs_read/hs_write states are set by the read and write routines
	 * when SSL wants the opposite direction in the middle of an
	 * operation; here the event registration is restored */
	if(c->ssl_shake_state == comm_ssl_shake_hs_read) {
		/* read condition satisfied back to writing */
		comm_point_listen_for_rw(c, 1, 1);
		c->ssl_shake_state = comm_ssl_shake_none;
		return 1;
	}
	if(c->ssl_shake_state == comm_ssl_shake_hs_write) {
		/* write condition satisfied, back to reading */
		comm_point_listen_for_rw(c, 1, 0);
		c->ssl_shake_state = comm_ssl_shake_none;
		return 1;
	}

	/* clear the error queue so SSL_get_error() reports on this call */
	ERR_clear_error();
	r = SSL_do_handshake(c->ssl);
	if(r != 1) {
		int want = SSL_get_error(c->ssl, r);
		if(want == SSL_ERROR_WANT_READ) {
			if(c->ssl_shake_state == comm_ssl_shake_read)
				return 1;
			c->ssl_shake_state = comm_ssl_shake_read;
			comm_point_listen_for_rw(c, 1, 0);
			return 1;
		} else if(want == SSL_ERROR_WANT_WRITE) {
			if(c->ssl_shake_state == comm_ssl_shake_write)
				return 1;
			c->ssl_shake_state = comm_ssl_shake_write;
			comm_point_listen_for_rw(c, 0, 1);
			return 1;
		} else if(r == 0) {
			return 0; /* closed */
		} else if(want == SSL_ERROR_SYSCALL) {
			/* SYSCALL and errno==0 means closed uncleanly */
#ifdef EPIPE
			if(errno == EPIPE && verbosity < 2)
				return 0; /* silence 'broken pipe' */
#endif
#ifdef ECONNRESET
			if(errno == ECONNRESET && verbosity < 2)
				return 0; /* silence reset by peer */
#endif
			if(errno != 0)
				log_err("SSL_handshake syscall: %s",
					strerror(errno));
			return 0;
		} else {
			/* other failure: log it unless it is one of the
			 * errors that squelch_err_ssl_handshake hides */
			unsigned long err = ERR_get_error();
			if(!squelch_err_ssl_handshake(err)) {
				log_crypto_err_code("ssl handshake failed", err);
				log_addr(VERB_OPS, "ssl handshake failed", &c->repinfo.addr,
					c->repinfo.addrlen);
			}
			return 0;
		}
	}
	/* this is where peer verification could take place */
	if((SSL_get_verify_mode(c->ssl)&SSL_VERIFY_PEER)) {
		/* verification */
		if(SSL_get_verify_result(c->ssl) == X509_V_OK) {
			X509* x = SSL_get_peer_certificate(c->ssl);
			if(!x) {
				log_addr(VERB_ALGO, "SSL connection failed: "
					"no certificate",
					&c->repinfo.addr, c->repinfo.addrlen);
				return 0;
			}
			log_cert(VERB_ALGO, "peer certificate", x);
#ifdef HAVE_SSL_GET0_PEERNAME
			if(SSL_get0_peername(c->ssl)) {
				char buf[255];
				snprintf(buf, sizeof(buf), "SSL connection "
					"to %s authenticated",
					SSL_get0_peername(c->ssl));
				log_addr(VERB_ALGO, buf, &c->repinfo.addr,
					c->repinfo.addrlen);
			} else {
#endif
				log_addr(VERB_ALGO, "SSL connection "
					"authenticated", &c->repinfo.addr,
					c->repinfo.addrlen);
#ifdef HAVE_SSL_GET0_PEERNAME
			}
#endif
			X509_free(x);
		} else {
			/* verify failed: log the certificate (if any) and
			 * refuse the connection */
			X509* x = SSL_get_peer_certificate(c->ssl);
			if(x) {
				log_cert(VERB_ALGO, "peer certificate", x);
				X509_free(x);
			}
			log_addr(VERB_ALGO, "SSL connection failed: "
				"failed to authenticate",
				&c->repinfo.addr, c->repinfo.addrlen);
			return 0;
		}
	} else {
		/* unauthenticated, the verify peer flag was not set
		 * in c->ssl when the ssl object was created from ssl_ctx */
		log_addr(VERB_ALGO, "SSL connection", &c->repinfo.addr,
			c->repinfo.addrlen);
	}

	/* check if http2 use is negotiated */
	if(c->type == comm_http && c->h2_session) {
		const unsigned char *alpn;
		unsigned int alpnlen = 0;
		SSL_get0_alpn_selected(c->ssl, &alpn, &alpnlen);
		if(alpnlen == 2 && memcmp("h2", alpn, 2) == 0) {
			/* connection upgraded to HTTP2 */
			c->tcp_do_toggle_rw = 0;
			c->use_h2 = 1;
		}
	}

	/* setup listen rw correctly */
	if(c->tcp_is_reading) {
		if(c->ssl_shake_state != comm_ssl_shake_read)
			comm_point_listen_for_rw(c, 1, 0);
	} else {
		comm_point_listen_for_rw(c, 1, 1);
	}
	c->ssl_shake_state = comm_ssl_shake_none;
	return 1;
}
#endif /* HAVE_SSL */

/** ssl read callback on TCP
 * Reads a length-prefixed (2 byte) message over SSL into c->buffer and
 * calls tcp_callback_reader() once the whole message is in.
 * @param c: comm point.
 * @return 0 on error or close, 1 to keep the connection (done or more later).
 *	Without HAVE_SSL it always returns 0.
 */
static int
ssl_handle_read(struct comm_point* c)
{
#ifdef HAVE_SSL
	int r;
	/* finish a pending handshake first; if it is still pending after
	 * this step, wait for the next event */
	if(c->ssl_shake_state != comm_ssl_shake_none) {
		if(!ssl_handshake(c))
			return 0;
		if(c->ssl_shake_state != comm_ssl_shake_none)
			return 1;
	}
	if(c->tcp_byte_count < sizeof(uint16_t)) {
		/* read length bytes */
		ERR_clear_error();
		if((r=SSL_read(c->ssl, (void*)sldns_buffer_at(c->buffer,
			c->tcp_byte_count), (int)(sizeof(uint16_t) -
			c->tcp_byte_count))) <= 0) {
			int want = SSL_get_error(c->ssl, r);
			if(want == SSL_ERROR_ZERO_RETURN) {
				if(c->tcp_req_info)
					return tcp_req_info_handle_read_close(c->tcp_req_info);
				return 0; /* shutdown, closed */
			} else if(want == SSL_ERROR_WANT_READ) {
				ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ);
				return 1; /* read more later */
			} else if(want == SSL_ERROR_WANT_WRITE) {
				/* SSL wants to write in the middle of a read;
				 * wait for writability and come back */
				c->ssl_shake_state = comm_ssl_shake_hs_write;
				comm_point_listen_for_rw(c, 0, 1);
				return 1;
			} else if(want == SSL_ERROR_SYSCALL) {
#ifdef ECONNRESET
				if(errno == ECONNRESET && verbosity < 2)
					return 0; /* silence reset by peer */
#endif
				if(errno != 0)
					log_err("SSL_read syscall: %s",
						strerror(errno));
				return 0;
			}
			log_crypto_err("could not SSL_read");
			return 0;
		}
		c->tcp_byte_count += r;
		if(c->tcp_byte_count < sizeof(uint16_t))
			return 1;
		/* the 2 byte length prefix is complete: validate it and use
		 * it as the buffer limit */
		if(sldns_buffer_read_u16_at(c->buffer, 0) >
			sldns_buffer_capacity(c->buffer)) {
			verbose(VERB_QUERY, "ssl: dropped larger than buffer");
			return 0;
		}
		sldns_buffer_set_limit(c->buffer,
			sldns_buffer_read_u16_at(c->buffer, 0));
		if(sldns_buffer_limit(c->buffer) < LDNS_HEADER_SIZE) {
			verbose(VERB_QUERY, "ssl: dropped bogus too short.");
			return 0;
		}
		sldns_buffer_skip(c->buffer, (ssize_t)(c->tcp_byte_count-sizeof(uint16_t)));
		verbose(VERB_ALGO, "Reading ssl tcp query of length %d",
			(int)sldns_buffer_limit(c->buffer));
	}
	if(sldns_buffer_remaining(c->buffer) > 0) {
		/* read (more of) the message body */
		ERR_clear_error();
		r = SSL_read(c->ssl, (void*)sldns_buffer_current(c->buffer),
			(int)sldns_buffer_remaining(c->buffer));
		if(r <= 0) {
			int want = SSL_get_error(c->ssl, r);
			if(want == SSL_ERROR_ZERO_RETURN) {
				if(c->tcp_req_info)
					return tcp_req_info_handle_read_close(c->tcp_req_info);
				return 0; /* shutdown, closed */
			} else if(want == SSL_ERROR_WANT_READ) {
				ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ);
				return 1; /* read more later */
			} else if(want == SSL_ERROR_WANT_WRITE) {
				c->ssl_shake_state = comm_ssl_shake_hs_write;
				comm_point_listen_for_rw(c, 0, 1);
				return 1;
			} else if(want == SSL_ERROR_SYSCALL) {
#ifdef ECONNRESET
				if(errno == ECONNRESET && verbosity < 2)
					return 0; /* silence reset by peer */
#endif
				if(errno != 0)
					log_err("SSL_read syscall: %s",
						strerror(errno));
				return 0;
			}
			log_crypto_err("could not SSL_read");
			return 0;
		}
		sldns_buffer_skip(c->buffer, (ssize_t)r);
	}
	/* whole message received: hand it to the reader callback */
	if(sldns_buffer_remaining(c->buffer) <= 0) {
		tcp_callback_reader(c);
	}
	return 1;
#else
	(void)c;
	return 0;
#endif /* HAVE_SSL */
}

/** ssl write callback on TCP
 * Writes the 2 byte length prefix followed by the message over SSL. The
 * message is c->tcp_write_pkt when c->tcp_write_and_read is set, otherwise
 * the contents of c->buffer. Calls tcp_callback_writer() when everything
 * has been written.
 * @param c: comm point.
 * @return 0 on error or close, 1 to keep the connection (done or more later).
 *	Without HAVE_SSL it always returns 0.
 */
static int
ssl_handle_write(struct comm_point* c)
{
#ifdef HAVE_SSL
	int r;
	if(c->ssl_shake_state != comm_ssl_shake_none) {
		if(!ssl_handshake(c))
			return 0;
		if(c->ssl_shake_state != comm_ssl_shake_none)
			return 1;
	}
	/* ignore return, if fails we may simply block */
	(void)SSL_set_mode(c->ssl, (long)SSL_MODE_ENABLE_PARTIAL_WRITE);
	/* the byte counter in use depends on the mode: tcp_write_byte_count
	 * for write-and-read, tcp_byte_count otherwise. While it is below 2
	 * the length prefix has not been (fully) sent yet. */
	if((c->tcp_write_and_read?c->tcp_write_byte_count:c->tcp_byte_count) < sizeof(uint16_t)) {
		uint16_t len = htons(c->tcp_write_and_read?c->tcp_write_pkt_len:sldns_buffer_limit(c->buffer));
		ERR_clear_error();
		if(c->tcp_write_and_read) {
			if(c->tcp_write_pkt_len + 2 < LDNS_RR_BUF_SIZE) {
				/* combine the tcp length and the query for
				 * write, this emulates writev */
				uint8_t buf[LDNS_RR_BUF_SIZE];
				memmove(buf, &len, sizeof(uint16_t));
				memmove(buf+sizeof(uint16_t),
					c->tcp_write_pkt,
					c->tcp_write_pkt_len);
				r = SSL_write(c->ssl,
					(void*)(buf+c->tcp_write_byte_count),
					c->tcp_write_pkt_len + 2 -
					c->tcp_write_byte_count);
			} else {
				/* too large to combine: send only the rest
				 * of the length prefix */
				r = SSL_write(c->ssl,
					(void*)(((uint8_t*)&len)+c->tcp_write_byte_count),
					(int)(sizeof(uint16_t)-c->tcp_write_byte_count));
			}
		} else if(sizeof(uint16_t)+sldns_buffer_remaining(c->buffer) <
			LDNS_RR_BUF_SIZE) {
			/* combine the tcp length and the query for write,
			 * this emulates writev */
			uint8_t buf[LDNS_RR_BUF_SIZE];
			memmove(buf, &len, sizeof(uint16_t));
			memmove(buf+sizeof(uint16_t),
				sldns_buffer_current(c->buffer),
				sldns_buffer_remaining(c->buffer));
			r = SSL_write(c->ssl, (void*)(buf+c->tcp_byte_count),
				(int)(sizeof(uint16_t)+
				sldns_buffer_remaining(c->buffer)
				- c->tcp_byte_count));
		} else {
			r = SSL_write(c->ssl,
				(void*)(((uint8_t*)&len)+c->tcp_byte_count),
				(int)(sizeof(uint16_t)-c->tcp_byte_count));
		}
		if(r <= 0) {
			int want = SSL_get_error(c->ssl, r);
			if(want == SSL_ERROR_ZERO_RETURN) {
				return 0; /* closed */
			} else if(want == SSL_ERROR_WANT_READ) {
				/* SSL wants to read in the middle of a write */
				c->ssl_shake_state = comm_ssl_shake_hs_read;
				comm_point_listen_for_rw(c, 1, 0);
				return 1; /* wait for read condition */
			} else if(want == SSL_ERROR_WANT_WRITE) {
				ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE);
				return 1; /* write more later */
			} else if(want == SSL_ERROR_SYSCALL) {
#ifdef EPIPE
				if(errno == EPIPE && verbosity < 2)
					return 0; /* silence 'broken pipe' */
#endif
				if(errno != 0)
					log_err("SSL_write syscall: %s",
						strerror(errno));
				return 0;
			}
			log_crypto_err("could not SSL_write");
			return 0;
		}
		/* account for what was written; the counters include the
		 * two length bytes */
		if(c->tcp_write_and_read) {
			c->tcp_write_byte_count += r;
			if(c->tcp_write_byte_count < sizeof(uint16_t))
				return 1;
		} else {
			c->tcp_byte_count += r;
			if(c->tcp_byte_count < sizeof(uint16_t))
				return 1;
			sldns_buffer_set_position(c->buffer, c->tcp_byte_count -
				sizeof(uint16_t));
		}
		if((!c->tcp_write_and_read && sldns_buffer_remaining(c->buffer) == 0) || (c->tcp_write_and_read && c->tcp_write_byte_count == c->tcp_write_pkt_len + 2)) {
			tcp_callback_writer(c);
			return 1;
		}
	}
	/* length prefix is out, something of the message remains */
	log_assert(c->tcp_write_and_read || sldns_buffer_remaining(c->buffer) > 0);
	log_assert(!c->tcp_write_and_read || c->tcp_write_byte_count < c->tcp_write_pkt_len + 2);
	ERR_clear_error();
	if(c->tcp_write_and_read) {
		/* tcp_write_byte_count counts the 2 length bytes as well,
		 * hence the -2 for the offset into the packet */
		r = SSL_write(c->ssl, (void*)(c->tcp_write_pkt + c->tcp_write_byte_count - 2),
			(int)(c->tcp_write_pkt_len + 2 - c->tcp_write_byte_count));
	} else {
		r = SSL_write(c->ssl, (void*)sldns_buffer_current(c->buffer),
			(int)sldns_buffer_remaining(c->buffer));
	}
	if(r <= 0) {
		int want = SSL_get_error(c->ssl, r);
		if(want == SSL_ERROR_ZERO_RETURN) {
			return 0; /* closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			c->ssl_shake_state = comm_ssl_shake_hs_read;
			comm_point_listen_for_rw(c, 1, 0);
			return 1; /* wait for read condition */
		} else if(want == SSL_ERROR_WANT_WRITE) {
			ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE);
			return 1; /* write more later */
		} else if(want == SSL_ERROR_SYSCALL) {
#ifdef EPIPE
			if(errno == EPIPE && verbosity < 2)
				return 0; /* silence 'broken pipe' */
#endif
			if(errno != 0)
				log_err("SSL_write syscall: %s",
					strerror(errno));
			return 0;
		}
		log_crypto_err("could not SSL_write");
		return 0;
	}
	if(c->tcp_write_and_read) {
		c->tcp_write_byte_count += r;
	} else {
		sldns_buffer_skip(c->buffer, (ssize_t)r);
	}

	if((!c->tcp_write_and_read && sldns_buffer_remaining(c->buffer) == 0) || (c->tcp_write_and_read && c->tcp_write_byte_count == c->tcp_write_pkt_len + 2)) {
		tcp_callback_writer(c);
	}
	return 1;
#else
	(void)c;
	return 0;
#endif /* HAVE_SSL */
}

/** handle ssl tcp connection with dns contents
 * Picks the ssl read or ssl write routine for an event.
 * @param c: comm point.
 * @param is_write: nonzero if the event is a write event.
 * @return the result of the routine that was called (0 on error).
 */
static int
ssl_handle_it(struct comm_point* c, int is_write)
{
	/* handle case where renegotiation wants read during write call
	 * or write during read calls */
	if(is_write && c->ssl_shake_state == comm_ssl_shake_hs_write)
		return ssl_handle_read(c);
	else if(!is_write && c->ssl_shake_state == comm_ssl_shake_hs_read)
		return ssl_handle_write(c);
	/* handle read events for read operation and write events for a
	 * write operation */
	else if(!is_write)
		return ssl_handle_read(c);
	return ssl_handle_write(c);
}

/** Handle tcp reading callback.
 * Reads the 2 byte length prefix and then the message into c->buffer;
 * calls tcp_callback_reader() when the message is complete. Connections
 * with c->ssl set are passed to ssl_handle_it().
 * @param fd: file descriptor of socket.
 * @param c: comm point to read from into buffer.
 * @param short_ok: if true, very short packets are OK (for comm_local).
 * @return: 0 on error
 */
static int
comm_point_tcp_handle_read(int fd, struct comm_point* c, int short_ok)
{
	ssize_t r;
	log_assert(c->type == comm_tcp || c->type == comm_local);
	if(c->ssl)
		return ssl_handle_it(c, 0);
	/* a read event is only valid if the comm point is reading, or is
	 * in combined write-and-read mode */
	if(!c->tcp_is_reading && !c->tcp_write_and_read)
		return 0;

	log_assert(fd != -1);
	if(c->tcp_byte_count < sizeof(uint16_t)) {
		/* read length bytes */
		r = recv(fd,(void*)sldns_buffer_at(c->buffer,c->tcp_byte_count),
			sizeof(uint16_t)-c->tcp_byte_count, 0);
		if(r == 0) {
			/* peer closed the connection */
			if(c->tcp_req_info)
				return tcp_req_info_handle_read_close(c->tcp_req_info);
			return 0;
		} else if(r == -1) {
#ifndef USE_WINSOCK
			if(errno == EINTR || errno == EAGAIN)
				return 1;
#ifdef ECONNRESET
			if(errno == ECONNRESET && verbosity < 2)
				return 0; /* silence reset by peer */
#endif
#else /* USE_WINSOCK */
			if(WSAGetLastError() == WSAECONNRESET)
				return 0;
			if(WSAGetLastError() == WSAEINPROGRESS)
				return 1;
			if(WSAGetLastError() == WSAEWOULDBLOCK) {
				ub_winsock_tcp_wouldblock(c->ev->ev,
					UB_EV_READ);
				return 1;
			}
#endif
			log_err_addr("read (in tcp s)", sock_strerror(errno),
				&c->repinfo.addr, c->repinfo.addrlen);
			return 0;
		}
		c->tcp_byte_count += r;
		if(c->tcp_byte_count != sizeof(uint16_t))
			return 1;
		/* length prefix complete: validate and use as buffer limit */
		if(sldns_buffer_read_u16_at(c->buffer, 0) >
			sldns_buffer_capacity(c->buffer)) {
			verbose(VERB_QUERY, "tcp: dropped larger than buffer");
			return 0;
		}
		sldns_buffer_set_limit(c->buffer,
			sldns_buffer_read_u16_at(c->buffer, 0));
		if(!short_ok &&
			sldns_buffer_limit(c->buffer) < LDNS_HEADER_SIZE) {
			verbose(VERB_QUERY, "tcp: dropped bogus too short.");
			return 0;
		}
		verbose(VERB_ALGO, "Reading tcp query of length %d",
			(int)sldns_buffer_limit(c->buffer));
	}

	log_assert(sldns_buffer_remaining(c->buffer) > 0);
	r = recv(fd, (void*)sldns_buffer_current(c->buffer),
		sldns_buffer_remaining(c->buffer), 0);
	if(r == 0) {
		if(c->tcp_req_info)
			return tcp_req_info_handle_read_close(c->tcp_req_info);
		return 0;
	} else if(r == -1) {
#ifndef USE_WINSOCK
		if(errno == EINTR || errno == EAGAIN)
			return 1;
#else /* USE_WINSOCK */
		if(WSAGetLastError() == WSAECONNRESET)
			return 0;
		if(WSAGetLastError() == WSAEINPROGRESS)
			return 1;
		if(WSAGetLastError() == WSAEWOULDBLOCK) {
			ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ);
			return 1;
		}
#endif
		log_err_addr("read (in tcp r)", sock_strerror(errno),
			&c->repinfo.addr, c->repinfo.addrlen);
		return 0;
	}
	sldns_buffer_skip(c->buffer, r);
	/* whole message received: hand it to the reader callback */
	if(sldns_buffer_remaining(c->buffer) <= 0) {
		tcp_callback_reader(c);
	}
	return 1;
}

/**
 * Handle tcp writing callback.
 * @param fd: file descriptor of socket.
 * @param c: comm point to write buffer out of.
* @return: 0 on error */ static int comm_point_tcp_handle_write(int fd, struct comm_point* c) { ssize_t r; struct sldns_buffer *buffer; log_assert(c->type == comm_tcp); #ifdef USE_DNSCRYPT buffer = c->dnscrypt_buffer; #else buffer = c->buffer; #endif if(c->tcp_is_reading && !c->ssl && !c->tcp_write_and_read) return 0; log_assert(fd != -1); if(((!c->tcp_write_and_read && c->tcp_byte_count == 0) || (c->tcp_write_and_read && c->tcp_write_byte_count == 0)) && c->tcp_check_nb_connect) { /* check for pending error from nonblocking connect */ /* from Stevens, unix network programming, vol1, 3rd ed, p450*/ int error = 0; socklen_t len = (socklen_t)sizeof(error); if(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&error, &len) < 0){ #ifndef USE_WINSOCK error = errno; /* on solaris errno is error */ #else /* USE_WINSOCK */ error = WSAGetLastError(); #endif } #ifndef USE_WINSOCK #if defined(EINPROGRESS) && defined(EWOULDBLOCK) if(error == EINPROGRESS || error == EWOULDBLOCK) return 1; /* try again later */ else #endif if(error != 0 && verbosity < 2) return 0; /* silence lots of chatter in the logs */ else if(error != 0) { log_err_addr("tcp connect", strerror(error), &c->repinfo.addr, c->repinfo.addrlen); #else /* USE_WINSOCK */ /* examine error */ if(error == WSAEINPROGRESS) return 1; else if(error == WSAEWOULDBLOCK) { ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE); return 1; } else if(error != 0 && verbosity < 2) return 0; else if(error != 0) { log_err_addr("tcp connect", wsa_strerror(error), &c->repinfo.addr, c->repinfo.addrlen); #endif /* USE_WINSOCK */ return 0; } } if(c->ssl) return ssl_handle_it(c, 1); #ifdef USE_MSG_FASTOPEN /* Only try this on first use of a connection that uses tfo, otherwise fall through to normal write */ /* Also, TFO support on WINDOWS not implemented at the moment */ if(c->tcp_do_fastopen == 1) { /* this form of sendmsg() does both a connect() and send() so need to look for various flavours of error*/ uint16_t len = 
htons(c->tcp_write_and_read?c->tcp_write_pkt_len:sldns_buffer_limit(buffer)); struct msghdr msg; struct iovec iov[2]; c->tcp_do_fastopen = 0; memset(&msg, 0, sizeof(msg)); if(c->tcp_write_and_read) { iov[0].iov_base = (uint8_t*)&len + c->tcp_write_byte_count; iov[0].iov_len = sizeof(uint16_t) - c->tcp_write_byte_count; iov[1].iov_base = c->tcp_write_pkt; iov[1].iov_len = c->tcp_write_pkt_len; } else { iov[0].iov_base = (uint8_t*)&len + c->tcp_byte_count; iov[0].iov_len = sizeof(uint16_t) - c->tcp_byte_count; iov[1].iov_base = sldns_buffer_begin(buffer); iov[1].iov_len = sldns_buffer_limit(buffer); } log_assert(iov[0].iov_len > 0); msg.msg_name = &c->repinfo.addr; msg.msg_namelen = c->repinfo.addrlen; msg.msg_iov = iov; msg.msg_iovlen = 2; r = sendmsg(fd, &msg, MSG_FASTOPEN); if (r == -1) { #if defined(EINPROGRESS) && defined(EWOULDBLOCK) /* Handshake is underway, maybe because no TFO cookie available. Come back to write the message*/ if(errno == EINPROGRESS || errno == EWOULDBLOCK) return 1; #endif if(errno == EINTR || errno == EAGAIN) return 1; /* Not handling EISCONN here as shouldn't ever hit that case.*/ if(errno != EPIPE && errno != 0 && verbosity < 2) return 0; /* silence lots of chatter in the logs */ if(errno != EPIPE && errno != 0) { log_err_addr("tcp sendmsg", strerror(errno), &c->repinfo.addr, c->repinfo.addrlen); return 0; } /* fallthrough to nonFASTOPEN * (MSG_FASTOPEN on Linux 3 produces EPIPE) * we need to perform connect() */ if(connect(fd, (struct sockaddr *)&c->repinfo.addr, c->repinfo.addrlen) == -1) { #ifdef EINPROGRESS if(errno == EINPROGRESS) return 1; /* wait until connect done*/ #endif #ifdef USE_WINSOCK if(WSAGetLastError() == WSAEINPROGRESS || WSAGetLastError() == WSAEWOULDBLOCK) return 1; /* wait until connect done*/ #endif if(tcp_connect_errno_needs_log( (struct sockaddr *)&c->repinfo.addr, c->repinfo.addrlen)) { log_err_addr("outgoing tcp: connect after EPIPE for fastopen", strerror(errno), &c->repinfo.addr, c->repinfo.addrlen); } 
return 0; } } else { if(c->tcp_write_and_read) { c->tcp_write_byte_count += r; if(c->tcp_write_byte_count < sizeof(uint16_t)) return 1; } else { c->tcp_byte_count += r; if(c->tcp_byte_count < sizeof(uint16_t)) return 1; sldns_buffer_set_position(buffer, c->tcp_byte_count - sizeof(uint16_t)); } if((!c->tcp_write_and_read && sldns_buffer_remaining(buffer) == 0) || (c->tcp_write_and_read && c->tcp_write_byte_count == c->tcp_write_pkt_len + 2)) { tcp_callback_writer(c); return 1; } } } #endif /* USE_MSG_FASTOPEN */ if((c->tcp_write_and_read?c->tcp_write_byte_count:c->tcp_byte_count) < sizeof(uint16_t)) { uint16_t len = htons(c->tcp_write_and_read?c->tcp_write_pkt_len:sldns_buffer_limit(buffer)); #ifdef HAVE_WRITEV struct iovec iov[2]; if(c->tcp_write_and_read) { iov[0].iov_base = (uint8_t*)&len + c->tcp_write_byte_count; iov[0].iov_len = sizeof(uint16_t) - c->tcp_write_byte_count; iov[1].iov_base = c->tcp_write_pkt; iov[1].iov_len = c->tcp_write_pkt_len; } else { iov[0].iov_base = (uint8_t*)&len + c->tcp_byte_count; iov[0].iov_len = sizeof(uint16_t) - c->tcp_byte_count; iov[1].iov_base = sldns_buffer_begin(buffer); iov[1].iov_len = sldns_buffer_limit(buffer); } log_assert(iov[0].iov_len > 0); r = writev(fd, iov, 2); #else /* HAVE_WRITEV */ if(c->tcp_write_and_read) { r = send(fd, (void*)(((uint8_t*)&len)+c->tcp_write_byte_count), sizeof(uint16_t)-c->tcp_write_byte_count, 0); } else { r = send(fd, (void*)(((uint8_t*)&len)+c->tcp_byte_count), sizeof(uint16_t)-c->tcp_byte_count, 0); } #endif /* HAVE_WRITEV */ if(r == -1) { #ifndef USE_WINSOCK # ifdef EPIPE if(errno == EPIPE && verbosity < 2) return 0; /* silence 'broken pipe' */ #endif if(errno == EINTR || errno == EAGAIN) return 1; #ifdef ECONNRESET if(errno == ECONNRESET && verbosity < 2) return 0; /* silence reset by peer */ #endif # ifdef HAVE_WRITEV log_err_addr("tcp writev", strerror(errno), &c->repinfo.addr, c->repinfo.addrlen); # else /* HAVE_WRITEV */ log_err_addr("tcp send s", strerror(errno), &c->repinfo.addr, 
c->repinfo.addrlen); # endif /* HAVE_WRITEV */ #else if(WSAGetLastError() == WSAENOTCONN) return 1; if(WSAGetLastError() == WSAEINPROGRESS) return 1; if(WSAGetLastError() == WSAEWOULDBLOCK) { ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE); return 1; } if(WSAGetLastError() == WSAECONNRESET && verbosity < 2) return 0; /* silence reset by peer */ log_err_addr("tcp send s", wsa_strerror(WSAGetLastError()), &c->repinfo.addr, c->repinfo.addrlen); #endif return 0; } if(c->tcp_write_and_read) { c->tcp_write_byte_count += r; if(c->tcp_write_byte_count < sizeof(uint16_t)) return 1; } else { c->tcp_byte_count += r; if(c->tcp_byte_count < sizeof(uint16_t)) return 1; sldns_buffer_set_position(buffer, c->tcp_byte_count - sizeof(uint16_t)); } if((!c->tcp_write_and_read && sldns_buffer_remaining(buffer) == 0) || (c->tcp_write_and_read && c->tcp_write_byte_count == c->tcp_write_pkt_len + 2)) { tcp_callback_writer(c); return 1; } } log_assert(c->tcp_write_and_read || sldns_buffer_remaining(buffer) > 0); log_assert(!c->tcp_write_and_read || c->tcp_write_byte_count < c->tcp_write_pkt_len + 2); if(c->tcp_write_and_read) { r = send(fd, (void*)c->tcp_write_pkt + c->tcp_write_byte_count - 2, c->tcp_write_pkt_len + 2 - c->tcp_write_byte_count, 0); } else { r = send(fd, (void*)sldns_buffer_current(buffer), sldns_buffer_remaining(buffer), 0); } if(r == -1) { #ifndef USE_WINSOCK if(errno == EINTR || errno == EAGAIN) return 1; #ifdef ECONNRESET if(errno == ECONNRESET && verbosity < 2) return 0; /* silence reset by peer */ #endif #else if(WSAGetLastError() == WSAEINPROGRESS) return 1; if(WSAGetLastError() == WSAEWOULDBLOCK) { ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE); return 1; } if(WSAGetLastError() == WSAECONNRESET && verbosity < 2) return 0; /* silence reset by peer */ #endif log_err_addr("tcp send r", sock_strerror(errno), &c->repinfo.addr, c->repinfo.addrlen); return 0; } if(c->tcp_write_and_read) { c->tcp_write_byte_count += r; } else { sldns_buffer_skip(buffer, r); } 
if((!c->tcp_write_and_read && sldns_buffer_remaining(buffer) == 0) || (c->tcp_write_and_read && c->tcp_write_byte_count == c->tcp_write_pkt_len + 2)) { tcp_callback_writer(c); } return 1; } /** read again to drain buffers when there could be more to read */ static void tcp_req_info_read_again(int fd, struct comm_point* c) { while(c->tcp_req_info->read_again) { int r; c->tcp_req_info->read_again = 0; if(c->tcp_is_reading) r = comm_point_tcp_handle_read(fd, c, 0); else r = comm_point_tcp_handle_write(fd, c); if(!r) { reclaim_tcp_handler(c); if(!c->tcp_do_close) { fptr_ok(fptr_whitelist_comm_point( c->callback)); (void)(*c->callback)(c, c->cb_arg, NETEVENT_CLOSED, NULL); } return; } } } /** read again to drain buffers when there could be more to read */ static void tcp_more_read_again(int fd, struct comm_point* c) { /* if the packet is done, but another one could be waiting on * the connection, the callback signals this, and we try again */ /* this continues until the read routines get EAGAIN or so, * and thus does not call the callback, and the bool is 0 */ int* moreread = c->tcp_more_read_again; while(moreread && *moreread) { *moreread = 0; if(!comm_point_tcp_handle_read(fd, c, 0)) { reclaim_tcp_handler(c); if(!c->tcp_do_close) { fptr_ok(fptr_whitelist_comm_point( c->callback)); (void)(*c->callback)(c, c->cb_arg, NETEVENT_CLOSED, NULL); } return; } } } /** write again to fill up when there could be more to write */ static void tcp_more_write_again(int fd, struct comm_point* c) { /* if the packet is done, but another is waiting to be written, * the callback signals it and we try again. 
*/ /* this continues until the write routines get EAGAIN or so, * and thus does not call the callback, and the bool is 0 */ int* morewrite = c->tcp_more_write_again; while(morewrite && *morewrite) { *morewrite = 0; if(!comm_point_tcp_handle_write(fd, c)) { reclaim_tcp_handler(c); if(!c->tcp_do_close) { fptr_ok(fptr_whitelist_comm_point( c->callback)); (void)(*c->callback)(c, c->cb_arg, NETEVENT_CLOSED, NULL); } return; } } } void comm_point_tcp_handle_callback(int fd, short event, void* arg) { struct comm_point* c = (struct comm_point*)arg; log_assert(c->type == comm_tcp); ub_comm_base_now(c->ev->base); #ifdef USE_DNSCRYPT /* Initialize if this is a dnscrypt socket */ if(c->tcp_parent) { c->dnscrypt = c->tcp_parent->dnscrypt; } if(c->dnscrypt && c->dnscrypt_buffer == c->buffer) { c->dnscrypt_buffer = sldns_buffer_new(sldns_buffer_capacity(c->buffer)); if(!c->dnscrypt_buffer) { log_err("Could not allocate dnscrypt buffer"); reclaim_tcp_handler(c); if(!c->tcp_do_close) { fptr_ok(fptr_whitelist_comm_point( c->callback)); (void)(*c->callback)(c, c->cb_arg, NETEVENT_CLOSED, NULL); } return; } } #endif if(event&UB_EV_TIMEOUT) { verbose(VERB_QUERY, "tcp took too long, dropped"); reclaim_tcp_handler(c); if(!c->tcp_do_close) { fptr_ok(fptr_whitelist_comm_point(c->callback)); (void)(*c->callback)(c, c->cb_arg, NETEVENT_TIMEOUT, NULL); } return; } if(event&UB_EV_READ #ifdef USE_MSG_FASTOPEN && !(c->tcp_do_fastopen && (event&UB_EV_WRITE)) #endif ) { int has_tcpq = (c->tcp_req_info != NULL); int* moreread = c->tcp_more_read_again; if(!comm_point_tcp_handle_read(fd, c, 0)) { reclaim_tcp_handler(c); if(!c->tcp_do_close) { fptr_ok(fptr_whitelist_comm_point( c->callback)); (void)(*c->callback)(c, c->cb_arg, NETEVENT_CLOSED, NULL); } return; } if(has_tcpq && c->tcp_req_info && c->tcp_req_info->read_again) tcp_req_info_read_again(fd, c); if(moreread && *moreread) tcp_more_read_again(fd, c); return; } if(event&UB_EV_WRITE) { int has_tcpq = (c->tcp_req_info != NULL); int* morewrite = 
c->tcp_more_write_again; if(!comm_point_tcp_handle_write(fd, c)) { reclaim_tcp_handler(c); if(!c->tcp_do_close) { fptr_ok(fptr_whitelist_comm_point( c->callback)); (void)(*c->callback)(c, c->cb_arg, NETEVENT_CLOSED, NULL); } return; } if(has_tcpq && c->tcp_req_info && c->tcp_req_info->read_again) tcp_req_info_read_again(fd, c); if(morewrite && *morewrite) tcp_more_write_again(fd, c); return; } log_err("Ignored event %d for tcphdl.", event); } /** Make http handler free for next assignment */ static void reclaim_http_handler(struct comm_point* c) { log_assert(c->type == comm_http); if(c->ssl) { #ifdef HAVE_SSL SSL_shutdown(c->ssl); SSL_free(c->ssl); c->ssl = NULL; #endif } comm_point_close(c); if(c->tcp_parent) { c->tcp_parent->cur_tcp_count--; c->tcp_free = c->tcp_parent->tcp_free; c->tcp_parent->tcp_free = c; if(!c->tcp_free) { /* re-enable listening on accept socket */ comm_point_start_listening(c->tcp_parent, -1, -1); } } } /** read more data for http (with ssl) */ static int ssl_http_read_more(struct comm_point* c) { #ifdef HAVE_SSL int r; log_assert(sldns_buffer_remaining(c->buffer) > 0); ERR_clear_error(); r = SSL_read(c->ssl, (void*)sldns_buffer_current(c->buffer), (int)sldns_buffer_remaining(c->buffer)); if(r <= 0) { int want = SSL_get_error(c->ssl, r); if(want == SSL_ERROR_ZERO_RETURN) { return 0; /* shutdown, closed */ } else if(want == SSL_ERROR_WANT_READ) { return 1; /* read more later */ } else if(want == SSL_ERROR_WANT_WRITE) { c->ssl_shake_state = comm_ssl_shake_hs_write; comm_point_listen_for_rw(c, 0, 1); return 1; } else if(want == SSL_ERROR_SYSCALL) { #ifdef ECONNRESET if(errno == ECONNRESET && verbosity < 2) return 0; /* silence reset by peer */ #endif if(errno != 0) log_err("SSL_read syscall: %s", strerror(errno)); return 0; } log_crypto_err("could not SSL_read"); return 0; } sldns_buffer_skip(c->buffer, (ssize_t)r); return 1; #else (void)c; return 0; #endif /* HAVE_SSL */ } /** read more data for http */ static int http_read_more(int fd, struct 
comm_point* c) { ssize_t r; log_assert(sldns_buffer_remaining(c->buffer) > 0); r = recv(fd, (void*)sldns_buffer_current(c->buffer), sldns_buffer_remaining(c->buffer), 0); if(r == 0) { return 0; } else if(r == -1) { #ifndef USE_WINSOCK if(errno == EINTR || errno == EAGAIN) return 1; #else /* USE_WINSOCK */ if(WSAGetLastError() == WSAECONNRESET) return 0; if(WSAGetLastError() == WSAEINPROGRESS) return 1; if(WSAGetLastError() == WSAEWOULDBLOCK) { ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ); return 1; } #endif log_err_addr("read (in http r)", sock_strerror(errno), &c->repinfo.addr, c->repinfo.addrlen); return 0; } sldns_buffer_skip(c->buffer, r); return 1; } /** return true if http header has been read (one line complete) */ static int http_header_done(sldns_buffer* buf) { size_t i; for(i=sldns_buffer_position(buf); ibuffer); if(!line) return 1; verbose(VERB_ALGO, "http header: %s", line); if(strncasecmp(line, "HTTP/1.1 ", 9) == 0) { /* check returncode */ if(line[9] != '2') { verbose(VERB_ALGO, "http bad status %s", line+9); return 0; } } else if(strncasecmp(line, "Content-Length: ", 16) == 0) { if(!c->http_is_chunked) c->tcp_byte_count = (size_t)atoi(line+16); } else if(strncasecmp(line, "Transfer-Encoding: chunked", 19+7) == 0) { c->tcp_byte_count = 0; c->http_is_chunked = 1; } else if(line[0] == 0) { /* end of initial headers */ c->http_in_headers = 0; if(c->http_is_chunked) c->http_in_chunk_headers = 1; /* remove header text from front of buffer * the buffer is going to be used to return the data segment * itself and we don't want the header to get returned * prepended with it */ http_moveover_buffer(c->buffer); sldns_buffer_flip(c->buffer); return 1; } /* ignore other headers */ return 1; } /** a chunk header is complete, process it, return 0=fail, 1=continue next * header line, 2=done with chunked transfer*/ static int http_process_chunk_header(struct comm_point* c) { char* line = http_header_line(c->buffer); if(!line) return 1; if(c->http_in_chunk_headers 
== 3) { verbose(VERB_ALGO, "http chunk trailer: %s", line); /* are we done ? */ if(line[0] == 0 && c->tcp_byte_count == 0) { /* callback of http reader when NETEVENT_DONE, * end of data, with no data in buffer */ sldns_buffer_set_position(c->buffer, 0); sldns_buffer_set_limit(c->buffer, 0); fptr_ok(fptr_whitelist_comm_point(c->callback)); (void)(*c->callback)(c, c->cb_arg, NETEVENT_DONE, NULL); /* return that we are done */ return 2; } if(line[0] == 0) { /* continue with header of the next chunk */ c->http_in_chunk_headers = 1; /* remove header text from front of buffer */ http_moveover_buffer(c->buffer); sldns_buffer_flip(c->buffer); return 1; } /* ignore further trail headers */ return 1; } verbose(VERB_ALGO, "http chunk header: %s", line); if(c->http_in_chunk_headers == 1) { /* read chunked start line */ char* end = NULL; c->tcp_byte_count = (size_t)strtol(line, &end, 16); if(end == line) return 0; c->http_in_chunk_headers = 0; /* remove header text from front of buffer */ http_moveover_buffer(c->buffer); sldns_buffer_flip(c->buffer); if(c->tcp_byte_count == 0) { /* done with chunks, process chunk_trailer lines */ c->http_in_chunk_headers = 3; } return 1; } /* ignore other headers */ return 1; } /** handle nonchunked data segment */ static int http_nonchunk_segment(struct comm_point* c) { /* c->buffer at position..limit has new data we read in. * the buffer itself is full of nonchunked data. * we are looking to read tcp_byte_count more data * and then the transfer is done. 
*/
	size_t remainbufferlen;
	/* bytes in the buffer beyond what was already counted (http_stored) */
	size_t got_now = sldns_buffer_limit(c->buffer) - c->http_stored;
	if(c->tcp_byte_count <= got_now) {
		/* done, this is the last data fragment */
		c->http_stored = 0;
		sldns_buffer_set_position(c->buffer, 0);
		fptr_ok(fptr_whitelist_comm_point(c->callback));
		(void)(*c->callback)(c, c->cb_arg, NETEVENT_DONE, NULL);
		return 1;
	}
	c->tcp_byte_count -= got_now;
	/* if we have the buffer space,
	 * read more data collected into the buffer */
	remainbufferlen = sldns_buffer_capacity(c->buffer) -
		sldns_buffer_limit(c->buffer);
	if(remainbufferlen >= c->tcp_byte_count ||
		remainbufferlen >= 2048) {
		/* keep what is stored and continue reading after it */
		size_t total = sldns_buffer_limit(c->buffer);
		sldns_buffer_clear(c->buffer);
		sldns_buffer_set_position(c->buffer, total);
		c->http_stored = total;
		/* return and wait to read more */
		return 1;
	}
	/* call callback with this data amount, then
	 * wait for more */
	c->http_stored = 0;
	sldns_buffer_set_position(c->buffer, 0);
	fptr_ok(fptr_whitelist_comm_point(c->callback));
	(void)(*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, NULL);
	/* c->callback has to buffer_clear(c->buffer). */
	/* return and wait to read more */
	return 1;
}

/** handle chunked data segment, return 0=fail, 1=wait, 2=process more */
static int
http_chunked_segment(struct comm_point* c)
{
	/* the c->buffer has from position..limit new data we read. */
	/* the current chunk has length tcp_byte_count.
	 * once we read that read more chunk headers.
	 */
	size_t remainbufferlen;
	size_t got_now = sldns_buffer_limit(c->buffer) - c->http_stored;
	if(c->tcp_byte_count <= got_now) {
		/* the chunk has completed (with perhaps some extra data
		 * from next chunk header and next chunk) */
		/* save too much info into temp buffer */
		size_t fraglen;
		struct comm_reply repinfo;
		c->http_stored = 0;
		sldns_buffer_skip(c->buffer, (ssize_t)c->tcp_byte_count);
		sldns_buffer_clear(c->http_temp);
		sldns_buffer_write(c->http_temp,
			sldns_buffer_current(c->buffer),
			sldns_buffer_remaining(c->buffer));
		sldns_buffer_flip(c->http_temp);

		/* callback with this fragment */
		fraglen = sldns_buffer_position(c->buffer);
		sldns_buffer_set_position(c->buffer, 0);
		sldns_buffer_set_limit(c->buffer, fraglen);
		/* the callback is given a copy of repinfo; its c member is
		 * inspected afterwards to see if the comm point still exists */
		repinfo = c->repinfo;
		fptr_ok(fptr_whitelist_comm_point(c->callback));
		(void)(*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &repinfo);
		/* c->callback has to buffer_clear(). */

		/* is commpoint deleted? */
		if(!repinfo.c) {
			return 1;
		}
		/* copy waiting info */
		sldns_buffer_clear(c->buffer);
		sldns_buffer_write(c->buffer,
			sldns_buffer_begin(c->http_temp),
			sldns_buffer_remaining(c->http_temp));
		sldns_buffer_flip(c->buffer);
		/* process end of chunk trailer header lines, until
		 * an empty line */
		c->http_in_chunk_headers = 3;
		/* process more data in buffer (if any) */
		return 2;
	}
	c->tcp_byte_count -= got_now;

	/* if we have the buffer space,
	 * read more data collected into the buffer */
	remainbufferlen = sldns_buffer_capacity(c->buffer) -
		sldns_buffer_limit(c->buffer);
	if(remainbufferlen >= c->tcp_byte_count ||
		remainbufferlen >= 2048) {
		size_t total = sldns_buffer_limit(c->buffer);
		sldns_buffer_clear(c->buffer);
		sldns_buffer_set_position(c->buffer, total);
		c->http_stored = total;
		/* return and wait to read more */
		return 1;
	}

	/* callback of http reader for a new part of the data */
	c->http_stored = 0;
	sldns_buffer_set_position(c->buffer, 0);
	fptr_ok(fptr_whitelist_comm_point(c->callback));
	(void)(*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, NULL);
	/* c->callback has to buffer_clear(c->buffer). */
	/* return and wait to read more */
	return 1;
}

#ifdef HAVE_NGHTTP2
/** Create new http2 session. Called when creating handling comm point.
 * @param c: comm point stored in the session.
 * @return zero-initialised session, or NULL on allocation failure
 *	(logged). */
struct http2_session* http2_session_create(struct comm_point* c)
{
	struct http2_session* session = calloc(1, sizeof(*session));
	if(!session) {
		log_err("malloc failure while creating http2 session");
		return NULL;
	}
	session->c = c;

	return session;
}
#endif

/** Delete http2 session. After closing connection or on error
 * Frees the nghttp2 callbacks (if set) and the session structure itself;
 * without HAVE_NGHTTP2 this does nothing. */
void http2_session_delete(struct http2_session* h2_session)
{
#ifdef HAVE_NGHTTP2
	if(h2_session->callbacks)
		nghttp2_session_callbacks_del(h2_session->callbacks);
	free(h2_session);
#else
	(void)h2_session;
#endif
}

#ifdef HAVE_NGHTTP2
/** Create a zero-initialised http2 stream with the given stream id.
 * @return the stream, or NULL on allocation failure (logged). */
struct http2_stream* http2_stream_create(int32_t stream_id)
{
	struct http2_stream* h2_stream = calloc(1, sizeof(*h2_stream));
	if(!h2_stream) {
		log_err("malloc failure while creating http2 stream");
		return NULL;
	}
	h2_stream->stream_id = stream_id;
	return h2_stream;
}

/** Delete http2 stream. After session delete or stream close callback */
static void http2_stream_delete(struct http2_session* h2_session,
	struct http2_stream* h2_stream)
{
	/* if a mesh state was attached to this stream, remove the reply
	 * for this comm point from it first */
	if(h2_stream->mesh_state) {
		mesh_state_remove_reply(h2_stream->mesh, h2_stream->mesh_state,
			h2_session->c);
		h2_stream->mesh_state = NULL;
	}
	http2_req_stream_clear(h2_stream);
	free(h2_stream);
}
#endif

/** Record the mesh area and mesh state on the stream; they are used by
 * http2_stream_delete when the stream goes away. */
void http2_stream_add_meshstate(struct http2_stream* h2_stream,
	struct mesh_area* mesh, struct mesh_state* m)
{
	h2_stream->mesh = mesh;
	h2_stream->mesh_state = m;
}

/** delete http2 session server. After closing connection.
*/
/* Deletes the nghttp2 session object and every stream on the session's
 * stream list, then clears the drop flags and the comm point's current
 * stream pointer. Without HAVE_NGHTTP2 this does nothing. */
static void http2_session_server_delete(struct http2_session* h2_session)
{
#ifdef HAVE_NGHTTP2
	struct http2_stream* h2_stream, *next;
	nghttp2_session_del(h2_session->session); /* NULL input is fine */
	h2_session->session = NULL;
	for(h2_stream = h2_session->first_stream; h2_stream;) {
		/* read the link first, http2_stream_delete frees h2_stream */
		next = h2_stream->next;
		http2_stream_delete(h2_session, h2_stream);
		h2_stream = next;
	}
	h2_session->first_stream = NULL;
	h2_session->is_drop = 0;
	h2_session->postpone_drop = 0;
	h2_session->c->h2_stream = NULL;
#endif
	(void)h2_session;
}

#ifdef HAVE_NGHTTP2
/**
 * Insert a stream at the front of the session's doubly linked stream list.
 * Only the old head's prev pointer and the new stream's next pointer are
 * written; the new stream's own prev pointer is left as the caller set it.
 * @param h2_session: session that owns the list.
 * @param h2_stream: stream to insert.
 */
void http2_session_add_stream(struct http2_session* h2_session,
	struct http2_stream* h2_stream)
{
	if(h2_session->first_stream)
		h2_session->first_stream->prev = h2_stream;
	h2_stream->next = h2_session->first_stream;
	h2_session->first_stream = h2_stream;
}

/** remove stream from session linked list. After stream close callback or
 * closing connection */
void http2_session_remove_stream(struct http2_session* h2_session,
	struct http2_stream* h2_stream)
{
	/* unlink from the predecessor, or move the list head when the
	 * stream is the first element */
	if(h2_stream->prev)
		h2_stream->prev->next = h2_stream->next;
	else
		h2_session->first_stream = h2_stream->next;
	if(h2_stream->next)
		h2_stream->next->prev = h2_stream->prev;

}

/**
 * nghttp2 stream close callback. Looks up the stream stored as user data
 * for stream_id, unlinks it from the session list and deletes it.
 * @param stream_id: id of the stream that is closed.
 * @param cb_arg: the struct http2_session.
 * @return always 0, also when no stream user data is found.
 */
int http2_stream_close_cb(nghttp2_session* ATTR_UNUSED(session),
	int32_t stream_id, uint32_t ATTR_UNUSED(error_code), void* cb_arg)
{
	struct http2_stream* h2_stream;
	struct http2_session* h2_session = (struct http2_session*)cb_arg;
	if(!(h2_stream = nghttp2_session_get_stream_user_data(
		h2_session->session, stream_id))) {
		return 0;
	}
	http2_session_remove_stream(h2_session, h2_stream);
	http2_stream_delete(h2_session, h2_stream);
	return 0;
}

/**
 * nghttp2 receive callback: read up to len bytes from the connection into
 * buf, through SSL_read when the comm point has an ssl object and through
 * recv otherwise.
 * @param buf: destination for the bytes read.
 * @param len: capacity of buf.
 * @param cb_arg: the struct http2_session.
 * @return number of bytes read, or one of NGHTTP2_ERR_EOF,
 *	NGHTTP2_ERR_WOULDBLOCK, NGHTTP2_ERR_CALLBACK_FAILURE.
 */
ssize_t http2_recv_cb(nghttp2_session* ATTR_UNUSED(session), uint8_t* buf,
	size_t len, int ATTR_UNUSED(flags), void* cb_arg)
{
	struct http2_session* h2_session = (struct http2_session*)cb_arg;
	ssize_t ret;

	log_assert(h2_session->c->type == comm_http);
	log_assert(h2_session->c->h2_session);

#ifdef HAVE_SSL
	if(h2_session->c->ssl) {
		int r;
		ERR_clear_error();
		r =
SSL_read(h2_session->c->ssl, buf, len);
		if(r <= 0) {
			/* map the SSL error onto the nghttp2 return codes */
			int want = SSL_get_error(h2_session->c->ssl, r);
			if(want == SSL_ERROR_ZERO_RETURN) {
				return NGHTTP2_ERR_EOF;
			} else if(want == SSL_ERROR_WANT_READ) {
				return NGHTTP2_ERR_WOULDBLOCK;
			} else if(want == SSL_ERROR_WANT_WRITE) {
				/* the read needs a write first: record that
				 * in the handshake state and wait for the
				 * socket to become writable */
				h2_session->c->ssl_shake_state = comm_ssl_shake_hs_write;
				comm_point_listen_for_rw(h2_session->c, 0, 1);
				return NGHTTP2_ERR_WOULDBLOCK;
			} else if(want == SSL_ERROR_SYSCALL) {
#ifdef ECONNRESET
				/* connection resets are only logged at
				 * verbosity 2 and up */
				if(errno == ECONNRESET && verbosity < 2)
					return NGHTTP2_ERR_CALLBACK_FAILURE;
#endif
				if(errno != 0)
					log_err("SSL_read syscall: %s",
						strerror(errno));
				return NGHTTP2_ERR_CALLBACK_FAILURE;
			}
			log_crypto_err("could not SSL_read");
			return NGHTTP2_ERR_CALLBACK_FAILURE;
		}
		return r;
	}
#endif /* HAVE_SSL */

	/* no ssl object on the comm point: read from the socket directly */
	ret = recv(h2_session->c->fd, buf, len, 0);
	if(ret == 0) {
		return NGHTTP2_ERR_EOF;
	} else if(ret < 0) {
#ifndef USE_WINSOCK
		if(errno == EINTR || errno == EAGAIN)
			return NGHTTP2_ERR_WOULDBLOCK;
#ifdef ECONNRESET
		if(errno == ECONNRESET && verbosity < 2)
			return NGHTTP2_ERR_CALLBACK_FAILURE;
#endif
		log_err_addr("could not http2 recv: %s", strerror(errno),
			&h2_session->c->repinfo.addr,
			h2_session->c->repinfo.addrlen);
#else /* USE_WINSOCK */
		if(WSAGetLastError() == WSAECONNRESET)
			return NGHTTP2_ERR_CALLBACK_FAILURE;
		if(WSAGetLastError() == WSAEINPROGRESS)
			return NGHTTP2_ERR_WOULDBLOCK;
		if(WSAGetLastError() == WSAEWOULDBLOCK) {
			ub_winsock_tcp_wouldblock(h2_session->c->ev->ev,
				UB_EV_READ);
			return NGHTTP2_ERR_WOULDBLOCK;
		}
		log_err_addr("could not http2 recv: %s",
			wsa_strerror(WSAGetLastError()),
			&h2_session->c->repinfo.addr,
			h2_session->c->repinfo.addrlen);
#endif
		return NGHTTP2_ERR_CALLBACK_FAILURE;
	}
	return ret;
}
#endif /* HAVE_NGHTTP2 */

/** Handle http2 read */
/* Returns 0 when the caller has to treat the connection as finished or
 * failed, 1 to keep it open. Without HAVE_NGHTTP2 it always returns 0. */
static int
comm_point_http2_handle_read(int ATTR_UNUSED(fd), struct comm_point* c)
{
#ifdef HAVE_NGHTTP2
	int ret;
	log_assert(c->h2_session);

	/* reading until recv cb returns NGHTTP2_ERR_WOULDBLOCK */
	ret = nghttp2_session_recv(c->h2_session->session);
if(ret) {
		/* EOF and callback failure are not reported here; the
		 * callback failure paths in the recv callback do their own
		 * logging */
		if(ret != NGHTTP2_ERR_EOF &&
			ret != NGHTTP2_ERR_CALLBACK_FAILURE) {
			char a[256];
			addr_to_str(&c->repinfo.addr, c->repinfo.addrlen,
				a, sizeof(a));
			verbose(VERB_QUERY, "http2: session_recv from %s failed, "
				"error: %s", a, nghttp2_strerror(ret));
		}
		return 0;
	}
	if(nghttp2_session_want_write(c->h2_session->session)) {
		/* nghttp2 has data to send: switch the comm point to
		 * writing and re-register the event with the tcp timeout */
		c->tcp_is_reading = 0;
		comm_point_stop_listening(c);
		comm_point_start_listening(c, -1, c->tcp_timeout_msec);
	} else if(!nghttp2_session_want_read(c->h2_session->session))
		return 0; /* connection can be closed */
	return 1;
#else
	(void)c;
	return 0;
#endif
}

/**
 * Handle http reading callback.
 * Finishes a pending SSL handshake first, hands HTTP/2 connections to
 * comm_point_http2_handle_read, and otherwise processes HTTP/1.x data.
 * @param fd: file descriptor of socket.
 * @param c: comm point to read from into buffer.
 * @return: 0 on error
 */
static int
comm_point_http_handle_read(int fd, struct comm_point* c)
{
	log_assert(c->type == comm_http);
	log_assert(fd != -1);

	/* if we are in ssl handshake, handle SSL handshake */
#ifdef HAVE_SSL
	if(c->ssl && c->ssl_shake_state != comm_ssl_shake_none) {
		if(!ssl_handshake(c))
			return 0;
		/* handshake still in progress, wait for the next event */
		if(c->ssl_shake_state != comm_ssl_shake_none)
			return 1;
	}
#endif /* HAVE_SSL */

	/* a read event while the comm point is in write mode is ignored */
	if(!c->tcp_is_reading)
		return 1;

	if(c->use_h2) {
		return comm_point_http2_handle_read(fd, c);
	}

	/* http version is <= http/1.1 */

	if(c->http_min_version >= http_version_2) {
		/* HTTP/2 failed, not allowed to use lower version.
*/
		return 0;
	}

	/* read more data */
	if(c->ssl) {
		if(!ssl_http_read_more(c))
			return 0;
	} else {
		if(!http_read_more(fd, c))
			return 0;
	}

	/* flip so that the data just read can be consumed from the start */
	sldns_buffer_flip(c->buffer);

	while(sldns_buffer_remaining(c->buffer) > 0) {
		/* Handle HTTP/1.x data */
		/* if we are reading headers, read more headers */
		if(c->http_in_headers || c->http_in_chunk_headers) {
			/* if header is done, process the header */
			if(!http_header_done(c->buffer)) {
				/* copy remaining data to front of buffer
				 * and set rest for writing into it */
				http_moveover_buffer(c->buffer);
				/* return and wait to read more */
				return 1;
			}
			if(!c->http_in_chunk_headers) {
				/* process initial headers */
				if(!http_process_initial_header(c))
					return 0;
			} else {
				/* process chunk headers */
				int r = http_process_chunk_header(c);
				if(r == 0) return 0;
				if(r == 2) return 1; /* done */
				/* r == 1, continue */
			}
			/* see if we have more to process */
			continue;
		}

		if(!c->http_is_chunked) {
			/* if we are reading nonchunks, process that*/
			return http_nonchunk_segment(c);
		} else {
			/* if we are reading chunks, read the chunk */
			/* 0 is an error, 1 means wait for more data; any
			 * other value loops to process the rest of the
			 * buffer */
			int r = http_chunked_segment(c);
			if(r == 0) return 0;
			if(r == 1) return 1;
			continue;
		}
	}
	/* broke out of the loop; could not process header instead need
	 * to read more */
	/* moveover any remaining data and read more data */
	http_moveover_buffer(c->buffer);
	/* return and wait to read more */
	return 1;
}

/** check pending connect for http */
/* Returns 0 when the connect failed, 1 when it is still in progress and
 * has to be retried later, 2 when the socket can be processed further. */
static int
http_check_connect(int fd, struct comm_point* c)
{
	/* check for pending error from nonblocking connect */
	/* from Stevens, unix network programming, vol1, 3rd ed, p450*/
	int error = 0;
	socklen_t len = (socklen_t)sizeof(error);
	if(getsockopt(fd, SOL_SOCKET, SO_ERROR, (void*)&error,
		&len) < 0){
#ifndef USE_WINSOCK
		error = errno; /* on solaris errno is error */
#else /* USE_WINSOCK */
		error = WSAGetLastError();
#endif
	}
#ifndef USE_WINSOCK
#if defined(EINPROGRESS) && defined(EWOULDBLOCK)
	if(error == EINPROGRESS || error == EWOULDBLOCK)
		return 1; /* try again later */
	else
#endif
	if(error
!= 0 && verbosity < 2)
		return 0; /* silence lots of chatter in the logs */
	else if(error != 0) {
		log_err_addr("http connect", strerror(error),
			&c->repinfo.addr, c->repinfo.addrlen);
#else /* USE_WINSOCK */
	/* examine error */
	if(error == WSAEINPROGRESS)
		return 1;
	else if(error == WSAEWOULDBLOCK) {
		ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE);
		return 1;
	} else if(error != 0 && verbosity < 2)
		return 0;
	else if(error != 0) {
		log_err_addr("http connect", wsa_strerror(error),
			&c->repinfo.addr, c->repinfo.addrlen);
#endif /* USE_WINSOCK */
		/* shared by both platform branches: the connect failed */
		return 0;
	}
	/* keep on processing this socket */
	return 2;
}

/** write more data for http (with ssl) */
/* Writes the unsent part of c->buffer with SSL_write and advances the
 * buffer position by the number of bytes written. Returns 0 on error or
 * when the connection is closed, 1 otherwise (also when nothing could be
 * written yet). Without HAVE_SSL it always returns 0. */
static int
ssl_http_write_more(struct comm_point* c)
{
#ifdef HAVE_SSL
	int r;
	log_assert(sldns_buffer_remaining(c->buffer) > 0);
	ERR_clear_error();
	r = SSL_write(c->ssl, (void*)sldns_buffer_current(c->buffer),
		(int)sldns_buffer_remaining(c->buffer));
	if(r <= 0) {
		int want = SSL_get_error(c->ssl, r);
		if(want == SSL_ERROR_ZERO_RETURN) {
			return 0; /* closed */
		} else if(want == SSL_ERROR_WANT_READ) {
			/* the write needs a read first: record that in the
			 * handshake state and listen for readability */
			c->ssl_shake_state = comm_ssl_shake_hs_read;
			comm_point_listen_for_rw(c, 1, 0);
			return 1; /* wait for read condition */
		} else if(want == SSL_ERROR_WANT_WRITE) {
			return 1; /* write more later */
		} else if(want == SSL_ERROR_SYSCALL) {
#ifdef EPIPE
			if(errno == EPIPE && verbosity < 2)
				return 0; /* silence 'broken pipe' */
#endif
			if(errno != 0)
				log_err("SSL_write syscall: %s",
					strerror(errno));
			return 0;
		}
		log_crypto_err("could not SSL_write");
		return 0;
	}
	sldns_buffer_skip(c->buffer, (ssize_t)r);
	return 1;
#else
	(void)c;
	return 0;
#endif /* HAVE_SSL */
}

/** write more data for http */
/* Plain socket variant: sends the unsent part of c->buffer and advances
 * the buffer position by the number of bytes sent. Returns 0 on a send
 * error, 1 otherwise (also when the send has to be retried later). */
static int
http_write_more(int fd, struct comm_point* c)
{
	ssize_t r;
	log_assert(sldns_buffer_remaining(c->buffer) > 0);
	r = send(fd, (void*)sldns_buffer_current(c->buffer),
		sldns_buffer_remaining(c->buffer), 0);
	if(r == -1) {
#ifndef USE_WINSOCK
		if(errno == EINTR || errno == EAGAIN)
			return 1;
#else
		if(WSAGetLastError() == WSAEINPROGRESS)
			return 1;
if(WSAGetLastError() == WSAEWOULDBLOCK) {
			ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE);
			return 1;
		}
#endif
		log_err_addr("http send r", sock_strerror(errno),
			&c->repinfo.addr, c->repinfo.addrlen);
		return 0;
	}
	/* r bytes were sent, move the buffer position past them */
	sldns_buffer_skip(c->buffer, r);
	return 1;
}

#ifdef HAVE_NGHTTP2
/**
 * nghttp2 send callback: write up to len bytes from buf to the connection,
 * through SSL_write when the comm point has an ssl object and through send
 * otherwise.
 * @param buf: bytes to write.
 * @param len: number of bytes in buf.
 * @param cb_arg: the struct http2_session.
 * @return number of bytes written, or NGHTTP2_ERR_WOULDBLOCK or
 *	NGHTTP2_ERR_CALLBACK_FAILURE.
 */
ssize_t http2_send_cb(nghttp2_session* ATTR_UNUSED(session), const uint8_t* buf,
	size_t len, int ATTR_UNUSED(flags), void* cb_arg)
{
	ssize_t ret;
	struct http2_session* h2_session = (struct http2_session*)cb_arg;
	log_assert(h2_session->c->type == comm_http);
	log_assert(h2_session->c->h2_session);

#ifdef HAVE_SSL
	if(h2_session->c->ssl) {
		int r;
		ERR_clear_error();
		r = SSL_write(h2_session->c->ssl, buf, len);
		if(r <= 0) {
			/* map the SSL error onto the nghttp2 return codes */
			int want = SSL_get_error(h2_session->c->ssl, r);
			if(want == SSL_ERROR_ZERO_RETURN) {
				return NGHTTP2_ERR_CALLBACK_FAILURE;
			} else if(want == SSL_ERROR_WANT_READ) {
				/* the write needs a read first: record that
				 * in the handshake state and listen for
				 * readability */
				h2_session->c->ssl_shake_state = comm_ssl_shake_hs_read;
				comm_point_listen_for_rw(h2_session->c, 1, 0);
				return NGHTTP2_ERR_WOULDBLOCK;
			} else if(want == SSL_ERROR_WANT_WRITE) {
				return NGHTTP2_ERR_WOULDBLOCK;
			} else if(want == SSL_ERROR_SYSCALL) {
#ifdef EPIPE
				/* broken pipe is only logged at verbosity 2
				 * and up */
				if(errno == EPIPE && verbosity < 2)
					return NGHTTP2_ERR_CALLBACK_FAILURE;
#endif
				if(errno != 0)
					log_err("SSL_write syscall: %s",
						strerror(errno));
				return NGHTTP2_ERR_CALLBACK_FAILURE;
			}
			log_crypto_err("could not SSL_write");
			return NGHTTP2_ERR_CALLBACK_FAILURE;
		}
		return r;
	}
#endif /* HAVE_SSL */

	/* no ssl object on the comm point: write to the socket directly */
	ret = send(h2_session->c->fd, buf, len, 0);
	if(ret == 0) {
		return NGHTTP2_ERR_CALLBACK_FAILURE;
	} else if(ret < 0) {
#ifndef USE_WINSOCK
		if(errno == EINTR || errno == EAGAIN)
			return NGHTTP2_ERR_WOULDBLOCK;
#ifdef EPIPE
		if(errno == EPIPE && verbosity < 2)
			return NGHTTP2_ERR_CALLBACK_FAILURE;
#endif
#ifdef ECONNRESET
		if(errno == ECONNRESET && verbosity < 2)
			return NGHTTP2_ERR_CALLBACK_FAILURE;
#endif
		log_err_addr("could not http2 write: %s", strerror(errno),
			&h2_session->c->repinfo.addr,
			h2_session->c->repinfo.addrlen);
#else /* USE_WINSOCK */
		if(WSAGetLastError() ==
WSAENOTCONN)
			return NGHTTP2_ERR_WOULDBLOCK;
		if(WSAGetLastError() == WSAEINPROGRESS)
			return NGHTTP2_ERR_WOULDBLOCK;
		if(WSAGetLastError() == WSAEWOULDBLOCK) {
			ub_winsock_tcp_wouldblock(h2_session->c->ev->ev,
				UB_EV_WRITE);
			return NGHTTP2_ERR_WOULDBLOCK;
		}
		/* connection resets are only logged at verbosity 2 and up */
		if(WSAGetLastError() == WSAECONNRESET && verbosity < 2)
			return NGHTTP2_ERR_CALLBACK_FAILURE;
		log_err_addr("could not http2 write: %s",
			wsa_strerror(WSAGetLastError()),
			&h2_session->c->repinfo.addr,
			h2_session->c->repinfo.addrlen);
#endif
		return NGHTTP2_ERR_CALLBACK_FAILURE;
	}
	return ret;
}
#endif /* HAVE_NGHTTP2 */

/** Handle http2 writing */
/* Lets nghttp2 send its pending data. Returns 0 when the send failed or
 * when the session wants neither read nor write, 1 to keep the connection
 * open. Without HAVE_NGHTTP2 it always returns 0. */
static int
comm_point_http2_handle_write(int ATTR_UNUSED(fd), struct comm_point* c)
{
#ifdef HAVE_NGHTTP2
	int ret;
	log_assert(c->h2_session);

	ret = nghttp2_session_send(c->h2_session->session);
	if(ret) {
		verbose(VERB_QUERY, "http2: session_send failed, "
			"error: %s", nghttp2_strerror(ret));
		return 0;
	}

	if(nghttp2_session_want_read(c->h2_session->session)) {
		/* nghttp2 expects input: switch the comm point to reading
		 * and re-register the event with the tcp timeout */
		c->tcp_is_reading = 1;
		comm_point_stop_listening(c);
		comm_point_start_listening(c, -1, c->tcp_timeout_msec);
	} else if(!nghttp2_session_want_write(c->h2_session->session))
		return 0; /* connection can be closed */
	return 1;
#else
	(void)c;
	return 0;
#endif
}

/**
 * Handle http writing callback.
 * @param fd: file descriptor of socket.
 * @param c: comm point to write buffer out of.
 * @return: 0 on error
 */
static int
comm_point_http_handle_write(int fd, struct comm_point* c)
{
	log_assert(c->type == comm_http);
	log_assert(fd != -1);

	/* check pending connect errors, if that fails, we wait for more,
	 * or we can continue to write contents */
	if(c->tcp_check_nb_connect) {
		int r = http_check_connect(fd, c);
		if(r == 0) return 0;
		if(r == 1) return 1;
		/* connect completed, no need to check it again */
		c->tcp_check_nb_connect = 0;
	}
	/* if we are in ssl handshake, handle SSL handshake */
#ifdef HAVE_SSL
	if(c->ssl && c->ssl_shake_state != comm_ssl_shake_none) {
		if(!ssl_handshake(c))
			return 0;
		/* handshake still in progress, wait for the next event */
		if(c->ssl_shake_state != comm_ssl_shake_none)
			return 1;
	}
#endif /* HAVE_SSL */
	/* a write event while the comm point is in read mode is ignored */
	if(c->tcp_is_reading)
		return 1;

	if(c->use_h2) {
		return comm_point_http2_handle_write(fd, c);
	}

	/* http version is <= http/1.1 */

	if(c->http_min_version >= http_version_2) {
		/* HTTP/2 failed, not allowed to use lower version. */
		return 0;
	}

	/* if we are writing, write more */
	if(c->ssl) {
		if(!ssl_http_write_more(c))
			return 0;
	} else {
		if(!http_write_more(fd, c))
			return 0;
	}

	/* we write a single buffer contents, that can contain
	 * the http request, and then flip to read the results */
	/* see if write is done */
	if(sldns_buffer_remaining(c->buffer) == 0) {
		sldns_buffer_clear(c->buffer);
		if(c->tcp_do_toggle_rw)
			c->tcp_is_reading = 1;
		c->tcp_byte_count = 0;
		/* switch from listening(write) to listening(read) */
		comm_point_stop_listening(c);
		comm_point_start_listening(c, -1, -1);
	}
	return 1;
}

/**
 * Event callback for http comm points. Handles exactly one of timeout,
 * read or write per invocation, tested in that order. When a handler
 * reports failure the comm point is reclaimed and, unless tcp_do_close is
 * set, the user callback is told about the timeout or close.
 * @param fd: file descriptor that the event fired for.
 * @param event: UB_EV_* bits that fired.
 * @param arg: the struct comm_point.
 */
void
comm_point_http_handle_callback(int fd, short event, void* arg)
{
	struct comm_point* c = (struct comm_point*)arg;
	log_assert(c->type == comm_http);
	ub_comm_base_now(c->ev->base);

	if(event&UB_EV_TIMEOUT) {
		verbose(VERB_QUERY, "http took too long, dropped");
		reclaim_http_handler(c);
		if(!c->tcp_do_close) {
			fptr_ok(fptr_whitelist_comm_point(c->callback));
			(void)(*c->callback)(c, c->cb_arg,
				NETEVENT_TIMEOUT, NULL);
		}
		return;
	}
	if(event&UB_EV_READ) {
		if(!comm_point_http_handle_read(fd, c)) {
			reclaim_http_handler(c);
			if(!c->tcp_do_close) {
fptr_ok(fptr_whitelist_comm_point(
					c->callback));
				(void)(*c->callback)(c, c->cb_arg,
					NETEVENT_CLOSED, NULL);
			}
		}
		return;
	}
	if(event&UB_EV_WRITE) {
		if(!comm_point_http_handle_write(fd, c)) {
			reclaim_http_handler(c);
			if(!c->tcp_do_close) {
				fptr_ok(fptr_whitelist_comm_point(
					c->callback));
				(void)(*c->callback)(c, c->cb_arg,
					NETEVENT_CLOSED, NULL);
			}
		}
		return;
	}
	log_err("Ignored event %d for httphdl.", event);
}

/**
 * Event callback for comm_local comm points. On a read event the data is
 * read with comm_point_tcp_handle_read (third argument 1); when that
 * fails the user callback is called with NETEVENT_CLOSED. Other events
 * are logged and ignored.
 * @param fd: file descriptor that the event fired for.
 * @param event: UB_EV_* bits that fired.
 * @param arg: the struct comm_point.
 */
void comm_point_local_handle_callback(int fd, short event, void* arg)
{
	struct comm_point* c = (struct comm_point*)arg;
	log_assert(c->type == comm_local);
	ub_comm_base_now(c->ev->base);

	if(event&UB_EV_READ) {
		if(!comm_point_tcp_handle_read(fd, c, 1)) {
			fptr_ok(fptr_whitelist_comm_point(c->callback));
			(void)(*c->callback)(c, c->cb_arg, NETEVENT_CLOSED,
				NULL);
		}
		return;
	}
	log_err("Ignored event %d for localhdl.", event);
}

/**
 * Event callback for comm_raw comm points. No I/O is done here; the user
 * callback is invoked with NETEVENT_TIMEOUT when the timeout bit is set
 * and with NETEVENT_NOERROR otherwise.
 * @param event: UB_EV_* bits that fired.
 * @param arg: the struct comm_point.
 */
void comm_point_raw_handle_callback(int ATTR_UNUSED(fd),
	short event, void* arg)
{
	struct comm_point* c = (struct comm_point*)arg;
	int err = NETEVENT_NOERROR;
	log_assert(c->type == comm_raw);
	ub_comm_base_now(c->ev->base);

	if(event&UB_EV_TIMEOUT)
		err = NETEVENT_TIMEOUT;
	fptr_ok(fptr_whitelist_comm_point_raw(c->callback));
	(void)(*c->callback)(c, c->cb_arg, err, NULL);
}

/**
 * Create a UDP comm point that uses comm_point_udp_callback as its event
 * handler. The buffer is supplied by the caller and only referenced. The
 * event is added right away unless fd is -1.
 * @param base: comm base the event is registered with.
 * @param fd: socket, or -1 to create the comm point without adding the event.
 * @param buffer: packet buffer stored in the comm point.
 * @param callback: user callback stored in the comm point.
 * @param callback_arg: argument stored for the user callback.
 * @return the comm point, or NULL on failure.
 */
struct comm_point*
comm_point_create_udp(struct comm_base *base, int fd, sldns_buffer* buffer,
	comm_point_callback_type* callback, void* callback_arg)
{
	struct comm_point* c = (struct comm_point*)calloc(1,
		sizeof(struct comm_point));
	short evbits;
	if(!c)
		return NULL;
	c->ev = (struct internal_event*)calloc(1,
		sizeof(struct internal_event));
	if(!c->ev) {
		free(c);
		return NULL;
	}
	c->ev->base = base;
	c->fd = fd;
	c->buffer = buffer;
	c->timeout = NULL;
	c->tcp_is_reading = 0;
	c->tcp_byte_count = 0;
	c->tcp_parent = NULL;
	c->max_tcp_count = 0;
	c->cur_tcp_count = 0;
	c->tcp_handlers = NULL;
	c->tcp_free = NULL;
	c->type = comm_udp;
	c->tcp_do_close = 0;
	c->do_not_close = 0;
	c->tcp_do_toggle_rw = 0;
	c->tcp_check_nb_connect = 0;
#ifdef USE_MSG_FASTOPEN
	c->tcp_do_fastopen = 0;
#endif
#ifdef USE_DNSCRYPT
	c->dnscrypt = 0;
	c->dnscrypt_buffer = buffer;
#endif
	c->inuse = 0;
	c->callback = callback;
	c->cb_arg = callback_arg;
	evbits = UB_EV_READ | UB_EV_PERSIST;
	/* ub_event stuff */
	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
		comm_point_udp_callback, c);
	if(c->ev->ev == NULL) {
		log_err("could not baseset udp event");
		comm_point_delete(c);
		return NULL;
	}
	if(fd!=-1 && ub_event_add(c->ev->ev, c->timeout) != 0 ) {
		log_err("could not add udp event");
		comm_point_delete(c);
		return NULL;
	}
	return c;
}

/**
 * Create a UDP comm point that uses comm_point_udp_ancil_callback as its
 * event handler; otherwise set up like comm_point_create_udp.
 * @param base: comm base the event is registered with.
 * @param fd: socket, or -1 to create the comm point without adding the event.
 * @param buffer: packet buffer stored in the comm point.
 * @param callback: user callback stored in the comm point.
 * @param callback_arg: argument stored for the user callback.
 * @return the comm point, or NULL on failure.
 */
struct comm_point*
comm_point_create_udp_ancil(struct comm_base *base, int fd,
	sldns_buffer* buffer,
	comm_point_callback_type* callback, void* callback_arg)
{
	struct comm_point* c = (struct comm_point*)calloc(1,
		sizeof(struct comm_point));
	short evbits;
	if(!c)
		return NULL;
	c->ev = (struct internal_event*)calloc(1,
		sizeof(struct internal_event));
	if(!c->ev) {
		free(c);
		return NULL;
	}
	c->ev->base = base;
	c->fd = fd;
	c->buffer = buffer;
	c->timeout = NULL;
	c->tcp_is_reading = 0;
	c->tcp_byte_count = 0;
	c->tcp_parent = NULL;
	c->max_tcp_count = 0;
	c->cur_tcp_count = 0;
	c->tcp_handlers = NULL;
	c->tcp_free = NULL;
	c->type = comm_udp;
	c->tcp_do_close = 0;
	c->do_not_close = 0;
#ifdef USE_DNSCRYPT
	c->dnscrypt = 0;
	c->dnscrypt_buffer = buffer;
#endif
	c->inuse = 0;
	c->tcp_do_toggle_rw = 0;
	c->tcp_check_nb_connect = 0;
#ifdef USE_MSG_FASTOPEN
	c->tcp_do_fastopen = 0;
#endif
	c->callback = callback;
	c->cb_arg = callback_arg;
	evbits = UB_EV_READ | UB_EV_PERSIST;
	/* ub_event stuff */
	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
		comm_point_udp_ancil_callback, c);
	if(c->ev->ev == NULL) {
		log_err("could not baseset udp event");
		comm_point_delete(c);
		return NULL;
	}
	if(fd!=-1 && ub_event_add(c->ev->ev, c->timeout) != 0 ) {
		log_err("could not add udp event");
		comm_point_delete(c);
		return NULL;
	}
	return c;
}

/* Creates one TCP handler comm point for an accept comm point. The
 * handler gets its own buffer of bufsize bytes and its own timeout
 * structure, inherits the timeout and connection limit from the parent,
 * and is pushed onto the parent's free list. The event is created but
 * not added here. Returns NULL on failure. */
static struct comm_point*
comm_point_create_tcp_handler(struct comm_base *base,
	struct comm_point* parent, size_t bufsize,
	struct sldns_buffer* spoolbuf,
comm_point_callback_type* callback, void* callback_arg)
{
	struct comm_point* c = (struct comm_point*)calloc(1,
		sizeof(struct comm_point));
	short evbits;
	if(!c)
		return NULL;
	c->ev = (struct internal_event*)calloc(1,
		sizeof(struct internal_event));
	if(!c->ev) {
		free(c);
		return NULL;
	}
	c->ev->base = base;
	/* no socket yet; the handler starts out without a file descriptor */
	c->fd = -1;
	c->buffer = sldns_buffer_new(bufsize);
	if(!c->buffer) {
		free(c->ev);
		free(c);
		return NULL;
	}
	c->timeout = (struct timeval*)malloc(sizeof(struct timeval));
	if(!c->timeout) {
		sldns_buffer_free(c->buffer);
		free(c->ev);
		free(c);
		return NULL;
	}
	c->tcp_is_reading = 0;
	c->tcp_byte_count = 0;
	/* timeout and connection limit are copied from the accept point */
	c->tcp_parent = parent;
	c->tcp_timeout_msec = parent->tcp_timeout_msec;
	c->tcp_conn_limit = parent->tcp_conn_limit;
	c->tcl_addr = NULL;
	c->tcp_keepalive = 0;
	c->max_tcp_count = 0;
	c->cur_tcp_count = 0;
	c->tcp_handlers = NULL;
	c->tcp_free = NULL;
	c->type = comm_tcp;
	c->tcp_do_close = 0;
	c->do_not_close = 0;
	c->tcp_do_toggle_rw = 1;
	c->tcp_check_nb_connect = 0;
#ifdef USE_MSG_FASTOPEN
	c->tcp_do_fastopen = 0;
#endif
#ifdef USE_DNSCRYPT
	c->dnscrypt = 0;
	/* We don't know just yet if this is a dnscrypt channel. Allocation
	 * will be done when handling the callback.
*/ c->dnscrypt_buffer = c->buffer; #endif c->repinfo.c = c; c->callback = callback; c->cb_arg = callback_arg; if(spoolbuf) { c->tcp_req_info = tcp_req_info_create(spoolbuf); if(!c->tcp_req_info) { log_err("could not create tcp commpoint"); sldns_buffer_free(c->buffer); free(c->timeout); free(c->ev); free(c); return NULL; } c->tcp_req_info->cp = c; c->tcp_do_close = 1; c->tcp_do_toggle_rw = 0; } /* add to parent free list */ c->tcp_free = parent->tcp_free; parent->tcp_free = c; /* ub_event stuff */ evbits = UB_EV_PERSIST | UB_EV_READ | UB_EV_TIMEOUT; c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits, comm_point_tcp_handle_callback, c); if(c->ev->ev == NULL) { log_err("could not basetset tcphdl event"); parent->tcp_free = c->tcp_free; tcp_req_info_delete(c->tcp_req_info); sldns_buffer_free(c->buffer); free(c->timeout); free(c->ev); free(c); return NULL; } return c; } static struct comm_point* comm_point_create_http_handler(struct comm_base *base, struct comm_point* parent, size_t bufsize, int harden_large_queries, uint32_t http_max_streams, char* http_endpoint, comm_point_callback_type* callback, void* callback_arg) { struct comm_point* c = (struct comm_point*)calloc(1, sizeof(struct comm_point)); short evbits; if(!c) return NULL; c->ev = (struct internal_event*)calloc(1, sizeof(struct internal_event)); if(!c->ev) { free(c); return NULL; } c->ev->base = base; c->fd = -1; c->buffer = sldns_buffer_new(bufsize); if(!c->buffer) { free(c->ev); free(c); return NULL; } c->timeout = (struct timeval*)malloc(sizeof(struct timeval)); if(!c->timeout) { sldns_buffer_free(c->buffer); free(c->ev); free(c); return NULL; } c->tcp_is_reading = 0; c->tcp_byte_count = 0; c->tcp_parent = parent; c->tcp_timeout_msec = parent->tcp_timeout_msec; c->tcp_conn_limit = parent->tcp_conn_limit; c->tcl_addr = NULL; c->tcp_keepalive = 0; c->max_tcp_count = 0; c->cur_tcp_count = 0; c->tcp_handlers = NULL; c->tcp_free = NULL; c->type = comm_http; c->tcp_do_close = 1; c->do_not_close = 0; 
c->tcp_do_toggle_rw = 1; /* will be set to 0 after http2 upgrade */ c->tcp_check_nb_connect = 0; #ifdef USE_MSG_FASTOPEN c->tcp_do_fastopen = 0; #endif #ifdef USE_DNSCRYPT c->dnscrypt = 0; c->dnscrypt_buffer = NULL; #endif c->repinfo.c = c; c->callback = callback; c->cb_arg = callback_arg; c->http_min_version = http_version_2; c->http2_stream_max_qbuffer_size = bufsize; if(harden_large_queries && bufsize > 512) c->http2_stream_max_qbuffer_size = 512; c->http2_max_streams = http_max_streams; if(!(c->http_endpoint = strdup(http_endpoint))) { log_err("could not strdup http_endpoint"); sldns_buffer_free(c->buffer); free(c->timeout); free(c->ev); free(c); return NULL; } c->use_h2 = 0; #ifdef HAVE_NGHTTP2 if(!(c->h2_session = http2_session_create(c))) { log_err("could not create http2 session"); free(c->http_endpoint); sldns_buffer_free(c->buffer); free(c->timeout); free(c->ev); free(c); return NULL; } if(!(c->h2_session->callbacks = http2_req_callbacks_create())) { log_err("could not create http2 callbacks"); http2_session_delete(c->h2_session); free(c->http_endpoint); sldns_buffer_free(c->buffer); free(c->timeout); free(c->ev); free(c); return NULL; } #endif /* add to parent free list */ c->tcp_free = parent->tcp_free; parent->tcp_free = c; /* ub_event stuff */ evbits = UB_EV_PERSIST | UB_EV_READ | UB_EV_TIMEOUT; c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits, comm_point_http_handle_callback, c); if(c->ev->ev == NULL) { log_err("could not set http handler event"); parent->tcp_free = c->tcp_free; http2_session_delete(c->h2_session); sldns_buffer_free(c->buffer); free(c->timeout); free(c->ev); free(c); return NULL; } return c; } struct comm_point* comm_point_create_tcp(struct comm_base *base, int fd, int num, int idle_timeout, int harden_large_queries, uint32_t http_max_streams, char* http_endpoint, struct tcl_list* tcp_conn_limit, size_t bufsize, struct sldns_buffer* spoolbuf, enum listen_type port_type, comm_point_callback_type* callback, void* callback_arg) { 
struct comm_point* c = (struct comm_point*)calloc(1, sizeof(struct comm_point)); short evbits; int i; /* first allocate the TCP accept listener */ if(!c) return NULL; c->ev = (struct internal_event*)calloc(1, sizeof(struct internal_event)); if(!c->ev) { free(c); return NULL; } c->ev->base = base; c->fd = fd; c->buffer = NULL; c->timeout = NULL; c->tcp_is_reading = 0; c->tcp_byte_count = 0; c->tcp_timeout_msec = idle_timeout; c->tcp_conn_limit = tcp_conn_limit; c->tcl_addr = NULL; c->tcp_keepalive = 0; c->tcp_parent = NULL; c->max_tcp_count = num; c->cur_tcp_count = 0; c->tcp_handlers = (struct comm_point**)calloc((size_t)num, sizeof(struct comm_point*)); if(!c->tcp_handlers) { free(c->ev); free(c); return NULL; } c->tcp_free = NULL; c->type = comm_tcp_accept; c->tcp_do_close = 0; c->do_not_close = 0; c->tcp_do_toggle_rw = 0; c->tcp_check_nb_connect = 0; #ifdef USE_MSG_FASTOPEN c->tcp_do_fastopen = 0; #endif #ifdef USE_DNSCRYPT c->dnscrypt = 0; c->dnscrypt_buffer = NULL; #endif c->callback = NULL; c->cb_arg = NULL; evbits = UB_EV_READ | UB_EV_PERSIST; /* ub_event stuff */ c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits, comm_point_tcp_accept_callback, c); if(c->ev->ev == NULL) { log_err("could not baseset tcpacc event"); comm_point_delete(c); return NULL; } if (ub_event_add(c->ev->ev, c->timeout) != 0) { log_err("could not add tcpacc event"); comm_point_delete(c); return NULL; } /* now prealloc the handlers */ for(i=0; itcp_handlers[i] = comm_point_create_tcp_handler(base, c, bufsize, spoolbuf, callback, callback_arg); } else if(port_type == listen_type_http) { c->tcp_handlers[i] = comm_point_create_http_handler( base, c, bufsize, harden_large_queries, http_max_streams, http_endpoint, callback, callback_arg); } else { log_err("could not create tcp handler, unknown listen " "type"); return NULL; } if(!c->tcp_handlers[i]) { comm_point_delete(c); return NULL; } } return c; } struct comm_point* comm_point_create_tcp_out(struct comm_base *base, size_t bufsize, 
comm_point_callback_type* callback, void* callback_arg)
{
	/* Outgoing TCP comm point: owns a buffer of bufsize bytes, has no
	 * socket yet (fd -1) and no parent. The event is created for write
	 * events but not added here. Returns NULL on failure. */
	struct comm_point* c = (struct comm_point*)calloc(1,
		sizeof(struct comm_point));
	short evbits;
	if(!c)
		return NULL;
	c->ev = (struct internal_event*)calloc(1,
		sizeof(struct internal_event));
	if(!c->ev) {
		free(c);
		return NULL;
	}
	c->ev->base = base;
	c->fd = -1;
	c->buffer = sldns_buffer_new(bufsize);
	if(!c->buffer) {
		free(c->ev);
		free(c);
		return NULL;
	}
	c->timeout = NULL;
	c->tcp_is_reading = 0;
	c->tcp_byte_count = 0;
	c->tcp_timeout_msec = TCP_QUERY_TIMEOUT;
	c->tcp_conn_limit = NULL;
	c->tcl_addr = NULL;
	c->tcp_keepalive = 0;
	c->tcp_parent = NULL;
	c->max_tcp_count = 0;
	c->cur_tcp_count = 0;
	c->tcp_handlers = NULL;
	c->tcp_free = NULL;
	c->type = comm_tcp;
	c->tcp_do_close = 0;
	c->do_not_close = 0;
	c->tcp_do_toggle_rw = 1;
	/* the connect is nonblocking, so its result has to be checked on
	 * the first write event */
	c->tcp_check_nb_connect = 1;
#ifdef USE_MSG_FASTOPEN
	c->tcp_do_fastopen = 1;
#endif
#ifdef USE_DNSCRYPT
	c->dnscrypt = 0;
	c->dnscrypt_buffer = c->buffer;
#endif
	c->repinfo.c = c;
	c->callback = callback;
	c->cb_arg = callback_arg;
	evbits = UB_EV_PERSIST | UB_EV_WRITE;
	c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits,
		comm_point_tcp_handle_callback, c);
	if(c->ev->ev == NULL)
	{
		log_err("could not baseset tcpout event");
		sldns_buffer_free(c->buffer);
		free(c->ev);
		free(c);
		return NULL;
	}

	return c;
}

/**
 * Create an outgoing HTTP comm point. It owns a buffer of bufsize bytes,
 * has no socket yet (fd -1) and starts in the state of reading the initial
 * http headers. The temp buffer is supplied by the caller and only
 * referenced. The event is created for write events but not added here.
 * @param base: comm base the event is registered with.
 * @param bufsize: size of the comm point's own buffer.
 * @param callback: user callback stored in the comm point.
 * @param callback_arg: argument stored for the user callback.
 * @param temp: buffer stored as http_temp.
 * @return the comm point, or NULL on failure.
 */
struct comm_point*
comm_point_create_http_out(struct comm_base *base, size_t bufsize,
	comm_point_callback_type* callback, void* callback_arg,
	sldns_buffer* temp)
{
	struct comm_point* c = (struct comm_point*)calloc(1,
		sizeof(struct comm_point));
	short evbits;
	if(!c)
		return NULL;
	c->ev = (struct internal_event*)calloc(1,
		sizeof(struct internal_event));
	if(!c->ev) {
		free(c);
		return NULL;
	}
	c->ev->base = base;
	c->fd = -1;
	c->buffer = sldns_buffer_new(bufsize);
	if(!c->buffer) {
		free(c->ev);
		free(c);
		return NULL;
	}
	c->timeout = NULL;
	c->tcp_is_reading = 0;
	c->tcp_byte_count = 0;
	c->tcp_parent = NULL;
	c->max_tcp_count = 0;
	c->cur_tcp_count = 0;
	c->tcp_handlers = NULL;
	c->tcp_free = NULL;
	c->type = comm_http;
c->tcp_do_close = 0; c->do_not_close = 0; c->tcp_do_toggle_rw = 1; c->tcp_check_nb_connect = 1; c->http_in_headers = 1; c->http_in_chunk_headers = 0; c->http_is_chunked = 0; c->http_temp = temp; #ifdef USE_MSG_FASTOPEN c->tcp_do_fastopen = 1; #endif #ifdef USE_DNSCRYPT c->dnscrypt = 0; c->dnscrypt_buffer = c->buffer; #endif c->repinfo.c = c; c->callback = callback; c->cb_arg = callback_arg; evbits = UB_EV_PERSIST | UB_EV_WRITE; c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits, comm_point_http_handle_callback, c); if(c->ev->ev == NULL) { log_err("could not baseset tcpout event"); #ifdef HAVE_SSL SSL_free(c->ssl); #endif sldns_buffer_free(c->buffer); free(c->ev); free(c); return NULL; } return c; } struct comm_point* comm_point_create_local(struct comm_base *base, int fd, size_t bufsize, comm_point_callback_type* callback, void* callback_arg) { struct comm_point* c = (struct comm_point*)calloc(1, sizeof(struct comm_point)); short evbits; if(!c) return NULL; c->ev = (struct internal_event*)calloc(1, sizeof(struct internal_event)); if(!c->ev) { free(c); return NULL; } c->ev->base = base; c->fd = fd; c->buffer = sldns_buffer_new(bufsize); if(!c->buffer) { free(c->ev); free(c); return NULL; } c->timeout = NULL; c->tcp_is_reading = 1; c->tcp_byte_count = 0; c->tcp_parent = NULL; c->max_tcp_count = 0; c->cur_tcp_count = 0; c->tcp_handlers = NULL; c->tcp_free = NULL; c->type = comm_local; c->tcp_do_close = 0; c->do_not_close = 1; c->tcp_do_toggle_rw = 0; c->tcp_check_nb_connect = 0; #ifdef USE_MSG_FASTOPEN c->tcp_do_fastopen = 0; #endif #ifdef USE_DNSCRYPT c->dnscrypt = 0; c->dnscrypt_buffer = c->buffer; #endif c->callback = callback; c->cb_arg = callback_arg; /* ub_event stuff */ evbits = UB_EV_PERSIST | UB_EV_READ; c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits, comm_point_local_handle_callback, c); if(c->ev->ev == NULL) { log_err("could not baseset localhdl event"); free(c->ev); free(c); return NULL; } if (ub_event_add(c->ev->ev, c->timeout) != 0) { 
log_err("could not add localhdl event"); ub_event_free(c->ev->ev); free(c->ev); free(c); return NULL; } return c; } struct comm_point* comm_point_create_raw(struct comm_base* base, int fd, int writing, comm_point_callback_type* callback, void* callback_arg) { struct comm_point* c = (struct comm_point*)calloc(1, sizeof(struct comm_point)); short evbits; if(!c) return NULL; c->ev = (struct internal_event*)calloc(1, sizeof(struct internal_event)); if(!c->ev) { free(c); return NULL; } c->ev->base = base; c->fd = fd; c->buffer = NULL; c->timeout = NULL; c->tcp_is_reading = 0; c->tcp_byte_count = 0; c->tcp_parent = NULL; c->max_tcp_count = 0; c->cur_tcp_count = 0; c->tcp_handlers = NULL; c->tcp_free = NULL; c->type = comm_raw; c->tcp_do_close = 0; c->do_not_close = 1; c->tcp_do_toggle_rw = 0; c->tcp_check_nb_connect = 0; #ifdef USE_MSG_FASTOPEN c->tcp_do_fastopen = 0; #endif #ifdef USE_DNSCRYPT c->dnscrypt = 0; c->dnscrypt_buffer = c->buffer; #endif c->callback = callback; c->cb_arg = callback_arg; /* ub_event stuff */ if(writing) evbits = UB_EV_PERSIST | UB_EV_WRITE; else evbits = UB_EV_PERSIST | UB_EV_READ; c->ev->ev = ub_event_new(base->eb->base, c->fd, evbits, comm_point_raw_handle_callback, c); if(c->ev->ev == NULL) { log_err("could not baseset rawhdl event"); free(c->ev); free(c); return NULL; } if (ub_event_add(c->ev->ev, c->timeout) != 0) { log_err("could not add rawhdl event"); ub_event_free(c->ev->ev); free(c->ev); free(c); return NULL; } return c; } void comm_point_close(struct comm_point* c) { if(!c) return; if(c->fd != -1) { verbose(5, "comm_point_close of %d: event_del", c->fd); if(ub_event_del(c->ev->ev) != 0) { log_err("could not event_del on close"); } } tcl_close_connection(c->tcl_addr); if(c->tcp_req_info) tcp_req_info_clear(c->tcp_req_info); if(c->h2_session) http2_session_server_delete(c->h2_session); /* close fd after removing from event lists, or epoll.. 
is messed up */ if(c->fd != -1 && !c->do_not_close) { if(c->type == comm_tcp || c->type == comm_http) { /* delete sticky events for the fd, it gets closed */ ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_READ); ub_winsock_tcp_wouldblock(c->ev->ev, UB_EV_WRITE); } verbose(VERB_ALGO, "close fd %d", c->fd); sock_close(c->fd); } c->fd = -1; } void comm_point_delete(struct comm_point* c) { if(!c) return; if((c->type == comm_tcp || c->type == comm_http) && c->ssl) { #ifdef HAVE_SSL SSL_shutdown(c->ssl); SSL_free(c->ssl); #endif } if(c->type == comm_http && c->http_endpoint) { free(c->http_endpoint); c->http_endpoint = NULL; } comm_point_close(c); if(c->tcp_handlers) { int i; for(i=0; imax_tcp_count; i++) comm_point_delete(c->tcp_handlers[i]); free(c->tcp_handlers); } free(c->timeout); if(c->type == comm_tcp || c->type == comm_local || c->type == comm_http) { sldns_buffer_free(c->buffer); #ifdef USE_DNSCRYPT if(c->dnscrypt && c->dnscrypt_buffer != c->buffer) { sldns_buffer_free(c->dnscrypt_buffer); } #endif if(c->tcp_req_info) { tcp_req_info_delete(c->tcp_req_info); } if(c->h2_session) { http2_session_delete(c->h2_session); } } ub_event_free(c->ev->ev); free(c->ev); free(c); } void comm_point_send_reply(struct comm_reply *repinfo) { struct sldns_buffer* buffer; log_assert(repinfo && repinfo->c); #ifdef USE_DNSCRYPT buffer = repinfo->c->dnscrypt_buffer; if(!dnsc_handle_uncurved_request(repinfo)) { return; } #else buffer = repinfo->c->buffer; #endif if(repinfo->c->type == comm_udp) { if(repinfo->srctype) comm_point_send_udp_msg_if(repinfo->c, buffer, (struct sockaddr*)&repinfo->addr, repinfo->addrlen, repinfo); else comm_point_send_udp_msg(repinfo->c, buffer, - (struct sockaddr*)&repinfo->addr, repinfo->addrlen); + (struct sockaddr*)&repinfo->addr, repinfo->addrlen, 0); #ifdef USE_DNSTAP if(repinfo->c->dtenv != NULL && repinfo->c->dtenv->log_client_response_messages) dt_msg_send_client_response(repinfo->c->dtenv, &repinfo->addr, repinfo->c->type, repinfo->c->buffer); #endif } 
else { #ifdef USE_DNSTAP if(repinfo->c->tcp_parent->dtenv != NULL && repinfo->c->tcp_parent->dtenv->log_client_response_messages) dt_msg_send_client_response(repinfo->c->tcp_parent->dtenv, &repinfo->addr, repinfo->c->type, ( repinfo->c->tcp_req_info ? repinfo->c->tcp_req_info->spool_buffer : repinfo->c->buffer )); #endif if(repinfo->c->tcp_req_info) { tcp_req_info_send_reply(repinfo->c->tcp_req_info); } else if(repinfo->c->use_h2) { if(!http2_submit_dns_response(repinfo->c->h2_session)) { comm_point_drop_reply(repinfo); return; } repinfo->c->h2_stream = NULL; repinfo->c->tcp_is_reading = 0; comm_point_stop_listening(repinfo->c); comm_point_start_listening(repinfo->c, -1, repinfo->c->tcp_timeout_msec); return; } else { comm_point_start_listening(repinfo->c, -1, repinfo->c->tcp_timeout_msec); } } } void comm_point_drop_reply(struct comm_reply* repinfo) { if(!repinfo) return; log_assert(repinfo->c); log_assert(repinfo->c->type != comm_tcp_accept); if(repinfo->c->type == comm_udp) return; if(repinfo->c->tcp_req_info) repinfo->c->tcp_req_info->is_drop = 1; if(repinfo->c->type == comm_http) { if(repinfo->c->h2_session) { repinfo->c->h2_session->is_drop = 1; if(!repinfo->c->h2_session->postpone_drop) reclaim_http_handler(repinfo->c); return; } reclaim_http_handler(repinfo->c); return; } reclaim_tcp_handler(repinfo->c); } void comm_point_stop_listening(struct comm_point* c) { verbose(VERB_ALGO, "comm point stop listening %d", c->fd); if(ub_event_del(c->ev->ev) != 0) { log_err("event_del error to stoplisten"); } } void comm_point_start_listening(struct comm_point* c, int newfd, int msec) { verbose(VERB_ALGO, "comm point start listening %d (%d msec)", c->fd==-1?newfd:c->fd, msec); if(c->type == comm_tcp_accept && !c->tcp_free) { /* no use to start listening no free slots. */ return; } if(msec != -1 && msec != 0) { if(!c->timeout) { c->timeout = (struct timeval*)malloc(sizeof( struct timeval)); if(!c->timeout) { log_err("cpsl: malloc failed. 
No net read."); return; } } ub_event_add_bits(c->ev->ev, UB_EV_TIMEOUT); #ifndef S_SPLINT_S /* splint fails on struct timeval. */ c->timeout->tv_sec = msec/1000; c->timeout->tv_usec = (msec%1000)*1000; #endif /* S_SPLINT_S */ } if(c->type == comm_tcp || c->type == comm_http) { ub_event_del_bits(c->ev->ev, UB_EV_READ|UB_EV_WRITE); if(c->tcp_write_and_read) { verbose(5, "startlistening %d mode rw", (newfd==-1?c->fd:newfd)); ub_event_add_bits(c->ev->ev, UB_EV_READ|UB_EV_WRITE); } else if(c->tcp_is_reading) { verbose(5, "startlistening %d mode r", (newfd==-1?c->fd:newfd)); ub_event_add_bits(c->ev->ev, UB_EV_READ); } else { verbose(5, "startlistening %d mode w", (newfd==-1?c->fd:newfd)); ub_event_add_bits(c->ev->ev, UB_EV_WRITE); } } if(newfd != -1) { if(c->fd != -1 && c->fd != newfd) { verbose(5, "cpsl close of fd %d for %d", c->fd, newfd); sock_close(c->fd); } c->fd = newfd; ub_event_set_fd(c->ev->ev, c->fd); } if(ub_event_add(c->ev->ev, msec==0?NULL:c->timeout) != 0) { log_err("event_add failed. in cpsl."); } } void comm_point_listen_for_rw(struct comm_point* c, int rd, int wr) { verbose(VERB_ALGO, "comm point listen_for_rw %d %d", c->fd, wr); if(ub_event_del(c->ev->ev) != 0) { log_err("event_del error to cplf"); } ub_event_del_bits(c->ev->ev, UB_EV_READ|UB_EV_WRITE); if(rd) ub_event_add_bits(c->ev->ev, UB_EV_READ); if(wr) ub_event_add_bits(c->ev->ev, UB_EV_WRITE); if(ub_event_add(c->ev->ev, c->timeout) != 0) { log_err("event_add failed. 
in cplf."); } } size_t comm_point_get_mem(struct comm_point* c) { size_t s; if(!c) return 0; s = sizeof(*c) + sizeof(*c->ev); if(c->timeout) s += sizeof(*c->timeout); if(c->type == comm_tcp || c->type == comm_local) { s += sizeof(*c->buffer) + sldns_buffer_capacity(c->buffer); #ifdef USE_DNSCRYPT s += sizeof(*c->dnscrypt_buffer); if(c->buffer != c->dnscrypt_buffer) { s += sldns_buffer_capacity(c->dnscrypt_buffer); } #endif } if(c->type == comm_tcp_accept) { int i; for(i=0; imax_tcp_count; i++) s += comm_point_get_mem(c->tcp_handlers[i]); } return s; } struct comm_timer* comm_timer_create(struct comm_base* base, void (*cb)(void*), void* cb_arg) { struct internal_timer *tm = (struct internal_timer*)calloc(1, sizeof(struct internal_timer)); if(!tm) { log_err("malloc failed"); return NULL; } tm->super.ev_timer = tm; tm->base = base; tm->super.callback = cb; tm->super.cb_arg = cb_arg; tm->ev = ub_event_new(base->eb->base, -1, UB_EV_TIMEOUT, comm_timer_callback, &tm->super); if(tm->ev == NULL) { log_err("timer_create: event_base_set failed."); free(tm); return NULL; } return &tm->super; } void comm_timer_disable(struct comm_timer* timer) { if(!timer) return; ub_timer_del(timer->ev_timer->ev); timer->ev_timer->enabled = 0; } void comm_timer_set(struct comm_timer* timer, struct timeval* tv) { log_assert(tv); if(timer->ev_timer->enabled) comm_timer_disable(timer); if(ub_timer_add(timer->ev_timer->ev, timer->ev_timer->base->eb->base, comm_timer_callback, timer, tv) != 0) log_err("comm_timer_set: evtimer_add failed."); timer->ev_timer->enabled = 1; } void comm_timer_delete(struct comm_timer* timer) { if(!timer) return; comm_timer_disable(timer); /* Free the sub struct timer->ev_timer derived from the super struct timer. * i.e. 
assert(timer == timer->ev_timer) */ ub_event_free(timer->ev_timer->ev); free(timer->ev_timer); } void comm_timer_callback(int ATTR_UNUSED(fd), short event, void* arg) { struct comm_timer* tm = (struct comm_timer*)arg; if(!(event&UB_EV_TIMEOUT)) return; ub_comm_base_now(tm->ev_timer->base); tm->ev_timer->enabled = 0; fptr_ok(fptr_whitelist_comm_timer(tm->callback)); (*tm->callback)(tm->cb_arg); } int comm_timer_is_set(struct comm_timer* timer) { return (int)timer->ev_timer->enabled; } size_t comm_timer_get_mem(struct comm_timer* ATTR_UNUSED(timer)) { return sizeof(struct internal_timer); } struct comm_signal* comm_signal_create(struct comm_base* base, void (*callback)(int, void*), void* cb_arg) { struct comm_signal* com = (struct comm_signal*)malloc( sizeof(struct comm_signal)); if(!com) { log_err("malloc failed"); return NULL; } com->base = base; com->callback = callback; com->cb_arg = cb_arg; com->ev_signal = NULL; return com; } void comm_signal_callback(int sig, short event, void* arg) { struct comm_signal* comsig = (struct comm_signal*)arg; if(!(event & UB_EV_SIGNAL)) return; ub_comm_base_now(comsig->base); fptr_ok(fptr_whitelist_comm_signal(comsig->callback)); (*comsig->callback)(sig, comsig->cb_arg); } int comm_signal_bind(struct comm_signal* comsig, int sig) { struct internal_signal* entry = (struct internal_signal*)calloc(1, sizeof(struct internal_signal)); if(!entry) { log_err("malloc failed"); return 0; } log_assert(comsig); /* add signal event */ entry->ev = ub_signal_new(comsig->base->eb->base, sig, comm_signal_callback, comsig); if(entry->ev == NULL) { log_err("Could not create signal event"); free(entry); return 0; } if(ub_signal_add(entry->ev, NULL) != 0) { log_err("Could not add signal handler"); ub_event_free(entry->ev); free(entry); return 0; } /* link into list */ entry->next = comsig->ev_signal; comsig->ev_signal = entry; return 1; } void comm_signal_delete(struct comm_signal* comsig) { struct internal_signal* p, *np; if(!comsig) return; 
p=comsig->ev_signal; while(p) { np = p->next; ub_signal_del(p->ev); ub_event_free(p->ev); free(p); p = np; } free(comsig); } Index: head/contrib/unbound/util/netevent.h =================================================================== --- head/contrib/unbound/util/netevent.h (revision 368750) +++ head/contrib/unbound/util/netevent.h (revision 368751) @@ -1,988 +1,989 @@ /* * util/netevent.h - event notification * * Copyright (c) 2007, NLnet Labs. All rights reserved. * * This software is open source. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of the NLNET LABS nor the names of its contributors may * be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /** * \file * * This file contains event notification functions. * * There are three types of communication points * o UDP socket - perthread buffer. * o TCP-accept socket - array of TCP-sockets, socketcount. * o TCP socket - own buffer, parent-TCPaccept, read/write state, * number of bytes read/written, timeout. * * There are sockets aimed towards our clients and towards the internet. * o frontside - aimed towards our clients, queries come in, answers back. * o behind - aimed towards internet, to the authoritative DNS servers. * * Several event types are available: * o comm_base - for thread safety of the comm points, one per thread. * o comm_point - udp and tcp networking, with callbacks. * o comm_timer - a timeout with callback. * o comm_signal - callbacks when signal is caught. * o comm_reply - holds reply info during networking callback. * */ #ifndef NET_EVENT_H #define NET_EVENT_H #include "dnscrypt/dnscrypt.h" #ifdef HAVE_NGHTTP2_NGHTTP2_H #include #endif struct sldns_buffer; struct comm_point; struct comm_reply; struct tcl_list; struct ub_event_base; struct mesh_state; struct mesh_area; /* internal event notification data storage structure. 
*/ struct internal_event; struct internal_base; struct internal_timer; /* A sub struct of the comm_timer super struct */ enum listen_type; /** callback from communication point function type */ typedef int comm_point_callback_type(struct comm_point*, void*, int, struct comm_reply*); /** to pass no_error to callback function */ #define NETEVENT_NOERROR 0 /** to pass closed connection to callback function */ #define NETEVENT_CLOSED -1 /** to pass timeout happened to callback function */ #define NETEVENT_TIMEOUT -2 /** to pass fallback from capsforID to callback function; 0x20 failed */ #define NETEVENT_CAPSFAIL -3 /** to pass done transfer to callback function; http file is complete */ #define NETEVENT_DONE -4 /** to pass write of the write packet is done to callback function * used when tcp_write_and_read is enabled */ #define NETEVENT_PKT_WRITTEN -5 /** timeout to slow accept calls when not possible, in msec. */ #define NETEVENT_SLOW_ACCEPT_TIME 2000 /** * A communication point dispatcher. Thread specific. */ struct comm_base { /** behind the scenes structure. with say libevent info. alloced */ struct internal_base* eb; /** callback to stop listening on accept sockets, * performed when accept() will not function properly */ void (*stop_accept)(void*); /** callback to start listening on accept sockets, performed * after stop_accept() then a timeout has passed. */ void (*start_accept)(void*); /** user argument for stop_accept and start_accept functions */ void* cb_arg; }; /** * Reply information for a communication point. */ struct comm_reply { /** the comm_point with fd to send reply on to. 
*/ struct comm_point* c; /** the address (for UDP based communication) */ struct sockaddr_storage addr; /** length of address */ socklen_t addrlen; /** return type 0 (none), 4(IP4), 6(IP6) */ int srctype; /* DnsCrypt context */ #ifdef USE_DNSCRYPT uint8_t client_nonce[crypto_box_HALF_NONCEBYTES]; uint8_t nmkey[crypto_box_BEFORENMBYTES]; const dnsccert *dnsc_cert; int is_dnscrypted; #endif /** the return source interface data */ union { #ifdef IPV6_PKTINFO struct in6_pktinfo v6info; #endif #ifdef IP_PKTINFO struct in_pktinfo v4info; #elif defined(IP_RECVDSTADDR) struct in_addr v4addr; #endif } /** variable with return source data */ pktinfo; /** max udp size for udp packets */ size_t max_udp_size; }; /** * Communication point to the network * These behaviours can be accomplished by setting the flags * and passing return values from the callback. * udp frontside: called after readdone. sendafter. * tcp frontside: called readdone, sendafter. close. * udp behind: called after readdone. No send after. * tcp behind: write done, read done, then called. No send after. */ struct comm_point { /** behind the scenes structure, with say libevent info. alloced. */ struct internal_event* ev; /** file descriptor for communication point */ int fd; /** timeout (NULL if it does not). Malloced. */ struct timeval* timeout; /** buffer pointer. 
Either to perthread, or own buffer or NULL */ struct sldns_buffer* buffer; /* -------- TCP Handler -------- */ /** Read/Write state for TCP */ int tcp_is_reading; /** The current read/write count for TCP */ size_t tcp_byte_count; /** parent communication point (for TCP sockets) */ struct comm_point* tcp_parent; /** sockaddr from peer, for TCP handlers */ struct comm_reply repinfo; /* -------- TCP Accept -------- */ /** the number of TCP handlers for this tcp-accept socket */ int max_tcp_count; /** current number of tcp handler in-use for this accept socket */ int cur_tcp_count; /** malloced array of tcp handlers for a tcp-accept, of size max_tcp_count. */ struct comm_point** tcp_handlers; /** linked list of free tcp_handlers to use for new queries. For tcp_accept the first entry, for tcp_handlers the next one. */ struct comm_point* tcp_free; /* -------- SSL TCP DNS ------- */ /** the SSL object with rw bio (owned) or for commaccept ctx ref */ void* ssl; /** handshake state for init and renegotiate */ enum { /** no handshake, it has been done */ comm_ssl_shake_none = 0, /** ssl initial handshake wants to read */ comm_ssl_shake_read, /** ssl initial handshake wants to write */ comm_ssl_shake_write, /** ssl_write wants to read */ comm_ssl_shake_hs_read, /** ssl_read wants to write */ comm_ssl_shake_hs_write } ssl_shake_state; /* -------- HTTP ------- */ /** Do not allow connection to use HTTP version lower than this. 0=no * minimum. 
*/ enum { http_version_none = 0, http_version_2 = 2 } http_min_version; /** http endpoint */ char* http_endpoint; /* -------- HTTP/1.1 ------- */ /** Currently reading in http headers */ int http_in_headers; /** Currently reading in chunk headers, 0=not, 1=firstline, 2=unused * (more lines), 3=trailer headers after chunk */ int http_in_chunk_headers; /** chunked transfer */ int http_is_chunked; /** http temp buffer (shared buffer for temporary work) */ struct sldns_buffer* http_temp; /** http stored content in buffer */ size_t http_stored; /* -------- HTTP/2 ------- */ /** http2 session */ struct http2_session* h2_session; /** set to 1 if h2 is negotiated to be used (using alpn) */ int use_h2; /** stream currently being handled */ struct http2_stream* h2_stream; /** maximum allowed query buffer size, per stream */ size_t http2_stream_max_qbuffer_size; /** maximum number of HTTP/2 streams per connection. Send in HTTP/2 * SETTINGS frame. */ uint32_t http2_max_streams; /* -------- dnstap ------- */ /** the dnstap environment */ struct dt_env* dtenv; /** is this a UDP, TCP-accept or TCP socket. */ enum comm_point_type { /** UDP socket - handle datagrams. */ comm_udp, /** TCP accept socket - only creates handlers if readable. */ comm_tcp_accept, /** TCP handler socket - handle byteperbyte readwrite. */ comm_tcp, /** HTTP handler socket */ comm_http, /** AF_UNIX socket - for internal commands. */ comm_local, /** raw - not DNS format - for pipe readers and writers */ comm_raw } /** variable with type of socket, UDP,TCP-accept,TCP,pipe */ type; /* ---------- Behaviour ----------- */ /** if set the connection is NOT closed on delete. */ int do_not_close; /** if set, the connection is closed on error, on timeout, and after read/write completes. No callback is done. */ int tcp_do_close; /** flag that indicates the stream is both written and read from. */ int tcp_write_and_read; /** byte count for written length over write channel, for when * tcp_write_and_read is enabled. 
When tcp_write_and_read is enabled, * this is the counter for writing, the one for reading is in the * commpoint.buffer sldns buffer. The counter counts from 0 to * 2+tcp_write_pkt_len, and includes the tcp length bytes. */ size_t tcp_write_byte_count; /** packet to write currently over the write channel. for when * tcp_write_and_read is enabled. When tcp_write_and_read is enabled, * this is the buffer for the written packet, the commpoint.buffer * sldns buffer is the buffer for the received packet. */ uint8_t* tcp_write_pkt; /** length of tcp_write_pkt in bytes */ size_t tcp_write_pkt_len; /** if set try to read another packet again (over connection with * multiple packets), once set, tries once, then zero again, * so set it in the packet complete section. * The pointer itself has to be set before the callback is invoked, * when you set things up, and continue to exist also after the * commpoint is closed and deleted in your callback. So that after * the callback cleans up netevent can see what it has to do. * Or leave NULL if it is not used at all. */ int* tcp_more_read_again; /** if set try to write another packet (over connection with * multiple packets), once set, tries once, then zero again, * so set it in the packet complete section. * The pointer itself has to be set before the callback is invoked, * when you set things up, and continue to exist also after the * commpoint is closed and deleted in your callback. So that after * the callback cleans up netevent can see what it has to do. * Or leave NULL if it is not used at all. */ int* tcp_more_write_again; /** if set, read/write completes: read/write state of tcp is toggled. buffer reset/bytecount reset. this flag cleared. So that when that is done the callback is called. 
*/ int tcp_do_toggle_rw; /** timeout in msec for TCP wait times for this connection */ int tcp_timeout_msec; /** if set, tcp keepalive is enabled on this connection */ int tcp_keepalive; /** if set, checks for pending error from nonblocking connect() call.*/ int tcp_check_nb_connect; /** if set, check for connection limit on tcp accept. */ struct tcl_list* tcp_conn_limit; /** the entry for the connection. */ struct tcl_addr* tcl_addr; /** the structure to keep track of open requests on this channel */ struct tcp_req_info* tcp_req_info; #ifdef USE_MSG_FASTOPEN /** used to track if the sendto() call should be done when using TFO. */ int tcp_do_fastopen; #endif #ifdef USE_DNSCRYPT /** Is this a dnscrypt channel */ int dnscrypt; /** encrypted buffer pointer. Either to perthread, or own buffer or NULL */ struct sldns_buffer* dnscrypt_buffer; #endif /** number of queries outstanding on this socket, used by * outside network for udp ports */ int inuse; /** callback when done. tcp_accept does not get called back, is NULL then. If a timeout happens, callback with timeout=1 is called. If an error happens, callback is called with error set nonzero. If not NETEVENT_NOERROR, it is an errno value. If the connection is closed (by remote end) then the callback is called with error set to NETEVENT_CLOSED=-1. If a timeout happens on the connection, the error is set to NETEVENT_TIMEOUT=-2. The reply_info can be copied if the reply needs to happen at a later time. It consists of a struct with commpoint and address. It can be passed to a msg send routine some time later. Note the reply information is temporary and must be copied. NULL is passed for_reply info, in cases where error happened. declare as: int my_callback(struct comm_point* c, void* my_arg, int error, struct comm_reply *reply_info); if the routine returns 0, nothing is done. Notzero, the buffer will be sent back to client. For UDP this is done without changing the commpoint. In TCP it sets write state. 
*/ comm_point_callback_type* callback; /** argument to pass to callback. */ void *cb_arg; }; /** * Structure only for making timeout events. */ struct comm_timer { /** the internal event stuff (derived) */ struct internal_timer* ev_timer; /** callback function, takes user arg only */ void (*callback)(void*); /** callback user argument */ void* cb_arg; }; /** * Structure only for signal events. */ struct comm_signal { /** the communication base */ struct comm_base* base; /** the internal event stuff */ struct internal_signal* ev_signal; /** callback function, takes signal number and user arg */ void (*callback)(int, void*); /** callback user argument */ void* cb_arg; }; /** * Create a new comm base. * @param sigs: if true it attempts to create a default loop for * signal handling. * @return: the new comm base. NULL on error. */ struct comm_base* comm_base_create(int sigs); /** * Create comm base that uses the given ub_event_base (underlying pluggable * event mechanism pointer). * @param base: underlying pluggable event base. * @return: the new comm base. NULL on error. */ struct comm_base* comm_base_create_event(struct ub_event_base* base); /** * Delete comm base structure but not the underlying lib event base. * All comm points must have been deleted. * @param b: the base to delete. */ void comm_base_delete_no_base(struct comm_base* b); /** * Destroy a comm base. * All comm points must have been deleted. * @param b: the base to delete. */ void comm_base_delete(struct comm_base* b); /** * Obtain two pointers. The pointers never change (until base_delete()). * The pointers point to time values that are updated regularly. * @param b: the communication base that will update the time values. * @param tt: pointer to time in seconds is returned. * @param tv: pointer to time in microseconds is returned. */ void comm_base_timept(struct comm_base* b, time_t** tt, struct timeval** tv); /** * Dispatch the comm base events. * @param b: the communication to perform. 
*/ void comm_base_dispatch(struct comm_base* b); /** * Exit from dispatch loop. * @param b: the communication base that is in dispatch(). */ void comm_base_exit(struct comm_base* b); /** * Set the slow_accept mode handlers. You can not provide these if you do * not perform accept() calls. * @param b: comm base * @param stop_accept: function that stops listening to accept fds. * @param start_accept: function that resumes listening to accept fds. * @param arg: callback arg to pass to the functions. */ void comm_base_set_slow_accept_handlers(struct comm_base* b, void (*stop_accept)(void*), void (*start_accept)(void*), void* arg); /** * Access internal data structure (for util/tube.c on windows) * @param b: comm base * @return ub_event_base. */ struct ub_event_base* comm_base_internal(struct comm_base* b); /** * Create an UDP comm point. Calls malloc. * setups the structure with the parameters you provide. * @param base: in which base to alloc the commpoint. * @param fd : file descriptor of open UDP socket. * @param buffer: shared buffer by UDP sockets from this thread. * @param callback: callback function pointer. * @param callback_arg: will be passed to your callback function. * @return: returns the allocated communication point. NULL on error. * Sets timeout to NULL. Turns off TCP options. */ struct comm_point* comm_point_create_udp(struct comm_base* base, int fd, struct sldns_buffer* buffer, comm_point_callback_type* callback, void* callback_arg); /** * Create an UDP with ancillary data comm point. Calls malloc. * Uses recvmsg instead of recv to get udp message. * setups the structure with the parameters you provide. * @param base: in which base to alloc the commpoint. * @param fd : file descriptor of open UDP socket. * @param buffer: shared buffer by UDP sockets from this thread. * @param callback: callback function pointer. * @param callback_arg: will be passed to your callback function. * @return: returns the allocated communication point. NULL on error. 
* Sets timeout to NULL. Turns off TCP options. */ struct comm_point* comm_point_create_udp_ancil(struct comm_base* base, int fd, struct sldns_buffer* buffer, comm_point_callback_type* callback, void* callback_arg); /** * Create a TCP listener comm point. Calls malloc. * Setups the structure with the parameters you provide. * Also Creates TCP Handlers, pre allocated for you. * Uses the parameters you provide. * @param base: in which base to alloc the commpoint. * @param fd: file descriptor of open TCP socket set to listen nonblocking. * @param num: becomes max_tcp_count, the routine allocates that * many tcp handler commpoints. * @param idle_timeout: TCP idle timeout in ms. * @param harden_large_queries: whether query size should be limited. * @param http_max_streams: maximum number of HTTP/2 streams per connection. * @param http_endpoint: HTTP endpoint to service queries on * @param tcp_conn_limit: TCP connection limit info. * @param bufsize: size of buffer to create for handlers. * @param spoolbuf: shared spool buffer for tcp_req_info structures. * or NULL to not create those structures in the tcp handlers. * @param port_type: the type of port we are creating a TCP listener for. Used * to select handler type to use. * @param callback: callback function pointer for TCP handlers. * @param callback_arg: will be passed to your callback function. * @return: returns the TCP listener commpoint. You can find the * TCP handlers in the array inside the listener commpoint. * returns NULL on error. * Inits timeout to NULL. All handlers are on the free list. */ struct comm_point* comm_point_create_tcp(struct comm_base* base, int fd, int num, int idle_timeout, int harden_large_queries, uint32_t http_max_streams, char* http_endpoint, struct tcl_list* tcp_conn_limit, size_t bufsize, struct sldns_buffer* spoolbuf, enum listen_type port_type, comm_point_callback_type* callback, void* callback_arg); /** * Create an outgoing TCP commpoint. No file descriptor is opened, left at -1. 
* @param base: in which base to alloc the commpoint. * @param bufsize: size of buffer to create for handlers. * @param callback: callback function pointer for the handler. * @param callback_arg: will be passed to your callback function. * @return: the commpoint or NULL on error. */ struct comm_point* comm_point_create_tcp_out(struct comm_base* base, size_t bufsize, comm_point_callback_type* callback, void* callback_arg); /** * Create an outgoing HTTP commpoint. No file descriptor is opened, left at -1. * @param base: in which base to alloc the commpoint. * @param bufsize: size of buffer to create for handlers. * @param callback: callback function pointer for the handler. * @param callback_arg: will be passed to your callback function. * @param temp: sldns buffer, shared between other http_out commpoints, for * temporary data when performing callbacks. * @return: the commpoint or NULL on error. */ struct comm_point* comm_point_create_http_out(struct comm_base* base, size_t bufsize, comm_point_callback_type* callback, void* callback_arg, struct sldns_buffer* temp); /** * Create commpoint to listen to a local domain file descriptor. * @param base: in which base to alloc the commpoint. * @param fd: file descriptor of open AF_UNIX socket set to listen nonblocking. * @param bufsize: size of buffer to create for handlers. * @param callback: callback function pointer for the handler. * @param callback_arg: will be passed to your callback function. * @return: the commpoint or NULL on error. */ struct comm_point* comm_point_create_local(struct comm_base* base, int fd, size_t bufsize, comm_point_callback_type* callback, void* callback_arg); /** * Create commpoint to listen to a local domain pipe descriptor. * @param base: in which base to alloc the commpoint. * @param fd: file descriptor. * @param writing: true if you want to listen to writes, false for reads. * @param callback: callback function pointer for the handler. 
* @param callback_arg: will be passed to your callback function. * @return: the commpoint or NULL on error. */ struct comm_point* comm_point_create_raw(struct comm_base* base, int fd, int writing, comm_point_callback_type* callback, void* callback_arg); /** * Close a comm point fd. * @param c: comm point to close. */ void comm_point_close(struct comm_point* c); /** * Close and deallocate (free) the comm point. If the comm point is * a tcp-accept point, also its tcp-handler points are deleted. * @param c: comm point to delete. */ void comm_point_delete(struct comm_point* c); /** * Send reply. Put message into commpoint buffer. * @param repinfo: The reply info copied from a commpoint callback call. */ void comm_point_send_reply(struct comm_reply* repinfo); /** * Drop reply. Cleans up. * @param repinfo: The reply info copied from a commpoint callback call. */ void comm_point_drop_reply(struct comm_reply* repinfo); /** * Send an udp message over a commpoint. * @param c: commpoint to send it from. * @param packet: what to send. * @param addr: where to send it to. If NULL, send is performed, * for connected sockets, to the connected address. * @param addrlen: length of addr. + * @param is_connected: if the UDP socket is connect()ed. * @return: false on a failure. */ int comm_point_send_udp_msg(struct comm_point* c, struct sldns_buffer* packet, - struct sockaddr* addr, socklen_t addrlen); + struct sockaddr* addr, socklen_t addrlen,int is_connected); /** * Stop listening for input on the commpoint. No callbacks will happen. * @param c: commpoint to disable. The fd is not closed. */ void comm_point_stop_listening(struct comm_point* c); /** * Start listening again for input on the comm point. * @param c: commpoint to enable again. * @param newfd: new fd, or -1 to leave fd be. * @param msec: timeout in milliseconds, or -1 for no (change to the) timeout. * So seconds*1000. 
*/ void comm_point_start_listening(struct comm_point* c, int newfd, int msec); /** * Stop listening and start listening again for reading or writing. * @param c: commpoint * @param rd: if true, listens for reading. * @param wr: if true, listens for writing. */ void comm_point_listen_for_rw(struct comm_point* c, int rd, int wr); /** * Get size of memory used by comm point. * For TCP handlers this includes subhandlers. * For UDP handlers, this does not include the (shared) UDP buffer. * @param c: commpoint. * @return size in bytes. */ size_t comm_point_get_mem(struct comm_point* c); /** * create timer. Not active upon creation. * @param base: event handling base. * @param cb: callback function: void myfunc(void* myarg); * @param cb_arg: user callback argument. * @return: the new timer or NULL on error. */ struct comm_timer* comm_timer_create(struct comm_base* base, void (*cb)(void*), void* cb_arg); /** * disable timer. Stops callbacks from happening. * @param timer: to disable. */ void comm_timer_disable(struct comm_timer* timer); /** * reset timevalue for timer. * @param timer: timer to (re)set. * @param tv: when the timer should activate. if NULL timer is disabled. */ void comm_timer_set(struct comm_timer* timer, struct timeval* tv); /** * delete timer. * @param timer: to delete. */ void comm_timer_delete(struct comm_timer* timer); /** * see if timeout has been set to a value. * @param timer: the timer to examine. * @return: false if disabled or not set. */ int comm_timer_is_set(struct comm_timer* timer); /** * Get size of memory used by comm timer. * @param timer: the timer to examine. * @return size in bytes. */ size_t comm_timer_get_mem(struct comm_timer* timer); /** * Create a signal handler. Call signal_bind() later to bind to a signal. * @param base: communication base to use. * @param callback: called when signal is caught. * @param cb_arg: user argument to callback * @return: the signal struct or NULL on error. 
*/ struct comm_signal* comm_signal_create(struct comm_base* base, void (*callback)(int, void*), void* cb_arg); /** * Bind signal struct to catch a signal. A single comm_signal can be bound * to multiple signals, by calling comm_signal_bind multiple times. * @param comsig: the communication point, with callback information. * @param sig: signal number. * @return: true on success, false on error. */ int comm_signal_bind(struct comm_signal* comsig, int sig); /** * Delete the signal communication point. * @param comsig: signal communication point to delete. */ void comm_signal_delete(struct comm_signal* comsig); /** * Perform accept(2) with error checking. * @param c: commpoint with accept fd. * @param addr: remote end returned here. * @param addrlen: length of remote end returned here. * @return new fd, or -1 on error. * If -1, an error message has been printed if necessary, simply drop * out of the reading handler. */ int comm_point_perform_accept(struct comm_point* c, struct sockaddr_storage* addr, socklen_t* addrlen); /**** internal routines ****/ /** * This routine is published for checks and tests, and is only used internally. * Handle libevent callback for udp comm point. * @param fd: file descriptor. * @param event: event bits from libevent: * EV_READ, EV_WRITE, EV_SIGNAL, EV_TIMEOUT. * @param arg: the comm_point structure. */ void comm_point_udp_callback(int fd, short event, void* arg); /** * This routine is published for checks and tests, and is only used internally. * Handle libevent callback for udp ancillary data comm point. * @param fd: file descriptor. * @param event: event bits from libevent: * EV_READ, EV_WRITE, EV_SIGNAL, EV_TIMEOUT. * @param arg: the comm_point structure. */ void comm_point_udp_ancil_callback(int fd, short event, void* arg); /** * This routine is published for checks and tests, and is only used internally. * handle libevent callback for tcp accept comm point * @param fd: file descriptor. 
* @param event: event bits from libevent: * EV_READ, EV_WRITE, EV_SIGNAL, EV_TIMEOUT. * @param arg: the comm_point structure. */ void comm_point_tcp_accept_callback(int fd, short event, void* arg); /** * This routine is published for checks and tests, and is only used internally. * Handle libevent callback for tcp data comm point. * @param fd: file descriptor. * @param event: event bits from libevent: * EV_READ, EV_WRITE, EV_SIGNAL, EV_TIMEOUT. * @param arg: the comm_point structure. */ void comm_point_tcp_handle_callback(int fd, short event, void* arg); /** * This routine is published for checks and tests, and is only used internally. * Handle libevent callback for tcp data comm point (presumably the http * variant, going by the function name; the original text is identical to * the tcp handler's -- TODO confirm). * @param fd: file descriptor. * @param event: event bits from libevent: * EV_READ, EV_WRITE, EV_SIGNAL, EV_TIMEOUT. * @param arg: the comm_point structure. */ void comm_point_http_handle_callback(int fd, short event, void* arg); /** * HTTP2 session. HTTP2 related info per comm point. */ struct http2_session { /** first item in list of streams */ struct http2_stream* first_stream; #ifdef HAVE_NGHTTP2 /** nghttp2 session */ nghttp2_session *session; /** store nghttp2 callbacks for easy reuse */ nghttp2_session_callbacks* callbacks; #endif /** comm point containing buffer used to build answer in worker or * module */ struct comm_point* c; /** session is instructed to get dropped (comm port will be closed) */ int is_drop; /** postpone dropping the session, can be used to prevent dropping * while being in a callback */ int postpone_drop; }; /** enum of HTTP status; each enumerator's value is its numeric status code */ enum http_status { HTTP_STATUS_OK = 200, HTTP_STATUS_BAD_REQUEST = 400, HTTP_STATUS_NOT_FOUND = 404, HTTP_STATUS_PAYLOAD_TOO_LARGE = 413, HTTP_STATUS_URI_TOO_LONG = 414, HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE = 415, HTTP_STATUS_NOT_IMPLEMENTED = 501 }; /** * HTTP stream. Part of list of HTTP2 streams per session. 
*/ struct http2_stream { /** next stream in list per session */ struct http2_stream* next; /** previous stream in list per session */ struct http2_stream* prev; /** HTTP2 stream ID is an unsigned 31-bit integer */ int32_t stream_id; /** HTTP method used for this stream */ enum { HTTP_METHOD_POST = 1, HTTP_METHOD_GET, HTTP_METHOD_UNSUPPORTED } http_method; /** message contains invalid content type */ int invalid_content_type; /** message body content length */ size_t content_length; /** HTTP response status */ enum http_status status; /** request for non-existing endpoint */ int invalid_endpoint; /** query in request is too large */ int query_too_large; /** buffer to store query into. Can't use session shared buffer as query * can arrive in parts, intertwined with frames for other queries. */ struct sldns_buffer* qbuffer; /** buffer to store response into. Can't use shared buffer as a next * query read callback can overwrite it before it is sent out. */ struct sldns_buffer* rbuffer; /** mesh area containing mesh state */ struct mesh_area* mesh; /** mesh state for query. Used to remove mesh reply before closing * stream. */ struct mesh_state* mesh_state; }; #ifdef HAVE_NGHTTP2 /** nghttp2 receive callback. Read from SSL connection into nghttp2 buffer */ ssize_t http2_recv_cb(nghttp2_session* session, uint8_t* buf, size_t len, int flags, void* cb_arg); /** nghttp2 send callback. Send from nghttp2 buffer to ssl socket */ ssize_t http2_send_cb(nghttp2_session* session, const uint8_t* buf, size_t len, int flags, void* cb_arg); /** nghttp2 callback on closing stream */ int http2_stream_close_cb(nghttp2_session* session, int32_t stream_id, uint32_t error_code, void* cb_arg); #endif /** * Create new http2 stream * @param stream_id: ID for stream to create. 
* @return malloc'ed stream, NULL on error */ struct http2_stream* http2_stream_create(int32_t stream_id); /** * Add new stream to session linked list. * @param h2_session: http2 session to add stream to. * @param h2_stream: stream to add to session list. */ void http2_session_add_stream(struct http2_session* h2_session, struct http2_stream* h2_stream); /** * Add mesh state to stream, to be able to remove the mesh reply on stream * closure. * @param h2_stream: stream to add the mesh state to. * @param mesh: mesh area; presumably the area that contains m (TODO confirm * against the implementation). * @param m: mesh state to add. */ void http2_stream_add_meshstate(struct http2_stream* h2_stream, struct mesh_area* mesh, struct mesh_state* m); /** * This routine is published for checks and tests, and is only used internally. * Handle libevent callback for timer comm. * @param fd: file descriptor (always -1). * @param event: event bits from libevent: * EV_READ, EV_WRITE, EV_SIGNAL, EV_TIMEOUT. * @param arg: the comm_timer structure. */ void comm_timer_callback(int fd, short event, void* arg); /** * This routine is published for checks and tests, and is only used internally. * Handle libevent callback for signal comm. * @param fd: file descriptor (used for the signal number). * @param event: event bits from libevent: * EV_READ, EV_WRITE, EV_SIGNAL, EV_TIMEOUT. * @param arg: the internal commsignal structure. */ void comm_signal_callback(int fd, short event, void* arg); /** * This routine is published for checks and tests, and is only used internally. * Libevent callback for AF_UNIX fds. * @param fd: file descriptor. * @param event: event bits from libevent: * EV_READ, EV_WRITE, EV_SIGNAL, EV_TIMEOUT. * @param arg: the comm_point structure. */ void comm_point_local_handle_callback(int fd, short event, void* arg); /** * This routine is published for checks and tests, and is only used internally. * libevent callback for raw fd access. * @param fd: file descriptor. * @param event: event bits from libevent: * EV_READ, EV_WRITE, EV_SIGNAL, EV_TIMEOUT. * @param arg: the comm_point structure. 
*/ void comm_point_raw_handle_callback(int fd, short event, void* arg); /** * This routine is published for checks and tests, and is only used internally. * Libevent callback for timeout on slow accept. * @param fd: file descriptor. * @param event: event bits from libevent: * EV_READ, EV_WRITE, EV_SIGNAL, EV_TIMEOUT. * @param arg: the comm_point structure. */ void comm_base_handle_slow_accept(int fd, short event, void* arg); #ifdef USE_WINSOCK /** * Callback for openssl BIO to detect WSAEWOULDBLOCK on windows and notify * the winsock_event of this, for proper TCP nonblocking implementation. * @param c: comm_point, fd must be set, its struct event is registered. * @param ssl: openssl SSL, fd must be set so it has a bio. */ void comm_point_tcp_win_bio_cb(struct comm_point* c, void* ssl); #endif /** * See if errno for tcp connect has to be logged or not. This uses errno. * @param addr: apart from checking errno, the addr is checked for ip4mapped * and broadcast type, hence passed. * @param addrlen: length of the addr parameter. * @return true if it needs to be logged. */ int tcp_connect_errno_needs_log(struct sockaddr* addr, socklen_t addrlen); #ifdef HAVE_SSL /** * True if the ssl handshake error has to be squelched from the logs. * @param err: the error returned by the openssl routine, ERR_get_error. * This is a packed structure with elements that are examined. * @return true if the error is squelched (not logged). */ int squelch_err_ssl_handshake(unsigned long err); #endif #endif /* NET_EVENT_H */ Index: head/contrib/unbound =================================================================== --- head/contrib/unbound (revision 368750) +++ head/contrib/unbound (revision 368751) Property changes on: head/contrib/unbound ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /vendor/unbound/dist:r368746