Index: head/sbin/pfctl/pfctl_optimize.c =================================================================== --- head/sbin/pfctl/pfctl_optimize.c (revision 308456) +++ head/sbin/pfctl/pfctl_optimize.c (revision 308457) @@ -1,1656 +1,1656 @@ /* $OpenBSD: pfctl_optimize.c,v 1.17 2008/05/06 03:45:21 mpf Exp $ */ /* * Copyright (c) 2004 Mike Frantzen * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pfctl_parser.h" #include "pfctl.h" /* The size at which a table becomes faster than individual rules */ #define TABLE_THRESHOLD 6 /* #define OPT_DEBUG 1 */ #ifdef OPT_DEBUG # define DEBUG(str, v...) \ printf("%s: " str "\n", __FUNCTION__ , ## v) #else # define DEBUG(str, v...) ((void)0) #endif /* * A container that lets us sort a superblock to optimize the skip step jumps */ struct pf_skip_step { int ps_count; /* number of items */ TAILQ_HEAD( , pf_opt_rule) ps_rules; TAILQ_ENTRY(pf_skip_step) ps_entry; }; /* * A superblock is a block of adjacent rules of similar action. If there * are five PASS rules in a row, they all become members of a superblock. * Once we have a superblock, we are free to re-order any rules within it * in order to improve performance; if a packet is passed, it doesn't matter * who passed it. */ struct superblock { TAILQ_HEAD( , pf_opt_rule) sb_rules; TAILQ_ENTRY(superblock) sb_entry; struct superblock *sb_profiled_block; TAILQ_HEAD(skiplist, pf_skip_step) sb_skipsteps[PF_SKIP_COUNT]; }; TAILQ_HEAD(superblocks, superblock); /* * Description of the PF rule structure. */ enum { - BARRIER, /* the presence of the field puts the rule in it's own block */ + BARRIER, /* the presence of the field puts the rule in its own block */ BREAK, /* the field may not differ between rules in a superblock */ NOMERGE, /* the field may not differ between rules when combined */ COMBINED, /* the field may itself be combined with other rules */ DC, /* we just don't care about the field */ NEVER}; /* we should never see this field set?!? */ static struct pf_rule_field { const char *prf_name; int prf_type; size_t prf_offset; size_t prf_size; } pf_rule_desc[] = { #define PF_RULE_FIELD(field, ty) \ {#field, \ ty, \ offsetof(struct pf_rule, field), \ sizeof(((struct pf_rule *)0)->field)} /* - * The presence of these fields in a rule put the rule in it's own + * The presence of these fields in a rule put the rule in its own * superblock. Thus it will not be optimized. It also prevents the * rule from being re-ordered at all. */ PF_RULE_FIELD(label, BARRIER), PF_RULE_FIELD(prob, BARRIER), PF_RULE_FIELD(max_states, BARRIER), PF_RULE_FIELD(max_src_nodes, BARRIER), PF_RULE_FIELD(max_src_states, BARRIER), PF_RULE_FIELD(max_src_conn, BARRIER), PF_RULE_FIELD(max_src_conn_rate, BARRIER), PF_RULE_FIELD(anchor, BARRIER), /* for now */ /* * These fields must be the same between all rules in the same superblock. * These rules are allowed to be re-ordered but only among like rules. * For instance we can re-order all 'tag "foo"' rules because they have the * same tag. But we can not re-order between a 'tag "foo"' and a * 'tag "bar"' since that would change the meaning of the ruleset. */ PF_RULE_FIELD(tagname, BREAK), PF_RULE_FIELD(keep_state, BREAK), PF_RULE_FIELD(qname, BREAK), PF_RULE_FIELD(pqname, BREAK), PF_RULE_FIELD(rt, BREAK), PF_RULE_FIELD(allow_opts, BREAK), PF_RULE_FIELD(rule_flag, BREAK), PF_RULE_FIELD(action, BREAK), PF_RULE_FIELD(log, BREAK), PF_RULE_FIELD(quick, BREAK), PF_RULE_FIELD(return_ttl, BREAK), PF_RULE_FIELD(overload_tblname, BREAK), PF_RULE_FIELD(flush, BREAK), PF_RULE_FIELD(rpool, BREAK), PF_RULE_FIELD(logif, BREAK), /* * Any fields not listed in this structure act as BREAK fields */ /* * These fields must not differ when we merge two rules together but * their difference isn't enough to put the rules in different superblocks. * There are no problems re-ordering any rules with these fields. */ PF_RULE_FIELD(af, NOMERGE), PF_RULE_FIELD(ifnot, NOMERGE), PF_RULE_FIELD(ifname, NOMERGE), /* hack for IF groups */ PF_RULE_FIELD(match_tag_not, NOMERGE), PF_RULE_FIELD(match_tagname, NOMERGE), PF_RULE_FIELD(os_fingerprint, NOMERGE), PF_RULE_FIELD(timeout, NOMERGE), PF_RULE_FIELD(return_icmp, NOMERGE), PF_RULE_FIELD(return_icmp6, NOMERGE), PF_RULE_FIELD(uid, NOMERGE), PF_RULE_FIELD(gid, NOMERGE), PF_RULE_FIELD(direction, NOMERGE), PF_RULE_FIELD(proto, NOMERGE), PF_RULE_FIELD(type, NOMERGE), PF_RULE_FIELD(code, NOMERGE), PF_RULE_FIELD(flags, NOMERGE), PF_RULE_FIELD(flagset, NOMERGE), PF_RULE_FIELD(tos, NOMERGE), PF_RULE_FIELD(src.port, NOMERGE), PF_RULE_FIELD(dst.port, NOMERGE), PF_RULE_FIELD(src.port_op, NOMERGE), PF_RULE_FIELD(dst.port_op, NOMERGE), PF_RULE_FIELD(src.neg, NOMERGE), PF_RULE_FIELD(dst.neg, NOMERGE), /* These fields can be merged */ PF_RULE_FIELD(src.addr, COMBINED), PF_RULE_FIELD(dst.addr, COMBINED), /* We just don't care about these fields. They're set by the kernel */ PF_RULE_FIELD(skip, DC), PF_RULE_FIELD(evaluations, DC), PF_RULE_FIELD(packets, DC), PF_RULE_FIELD(bytes, DC), PF_RULE_FIELD(kif, DC), PF_RULE_FIELD(states_cur, DC), PF_RULE_FIELD(states_tot, DC), PF_RULE_FIELD(src_nodes, DC), PF_RULE_FIELD(nr, DC), PF_RULE_FIELD(entries, DC), PF_RULE_FIELD(qid, DC), PF_RULE_FIELD(pqid, DC), PF_RULE_FIELD(anchor_relative, DC), PF_RULE_FIELD(anchor_wildcard, DC), PF_RULE_FIELD(tag, DC), PF_RULE_FIELD(match_tag, DC), PF_RULE_FIELD(overload_tbl, DC), /* These fields should never be set in a PASS/BLOCK rule */ PF_RULE_FIELD(natpass, NEVER), PF_RULE_FIELD(max_mss, NEVER), PF_RULE_FIELD(min_ttl, NEVER), PF_RULE_FIELD(set_tos, NEVER), }; int add_opt_table(struct pfctl *, struct pf_opt_tbl **, sa_family_t, struct pf_rule_addr *); int addrs_combineable(struct pf_rule_addr *, struct pf_rule_addr *); int addrs_equal(struct pf_rule_addr *, struct pf_rule_addr *); int block_feedback(struct pfctl *, struct superblock *); int combine_rules(struct pfctl *, struct superblock *); void comparable_rule(struct pf_rule *, const struct pf_rule *, int); int construct_superblocks(struct pfctl *, struct pf_opt_queue *, struct superblocks *); void exclude_supersets(struct pf_rule *, struct pf_rule *); int interface_group(const char *); int load_feedback_profile(struct pfctl *, struct superblocks *); int optimize_superblock(struct pfctl *, struct superblock *); int pf_opt_create_table(struct pfctl *, struct pf_opt_tbl *); void remove_from_skipsteps(struct skiplist *, struct superblock *, struct pf_opt_rule *, struct pf_skip_step *); int remove_identical_rules(struct pfctl *, struct superblock *); int reorder_rules(struct pfctl *, struct superblock *, int); int rules_combineable(struct pf_rule *, struct pf_rule *); void skip_append(struct superblock *, int, struct pf_skip_step *, struct pf_opt_rule *); int skip_compare(int, struct pf_skip_step *, struct pf_opt_rule *); void skip_init(void); int skip_cmp_af(struct pf_rule *, struct pf_rule *); int skip_cmp_dir(struct pf_rule *, struct pf_rule *); int skip_cmp_dst_addr(struct pf_rule *, struct pf_rule *); int skip_cmp_dst_port(struct pf_rule *, struct pf_rule *); int skip_cmp_ifp(struct pf_rule *, struct pf_rule *); int skip_cmp_proto(struct pf_rule *, struct pf_rule *); int skip_cmp_src_addr(struct pf_rule *, struct pf_rule *); int skip_cmp_src_port(struct pf_rule *, struct pf_rule *); int superblock_inclusive(struct superblock *, struct pf_opt_rule *); void superblock_free(struct pfctl *, struct superblock *); static int (*skip_comparitors[PF_SKIP_COUNT])(struct pf_rule *, struct pf_rule *); static const char *skip_comparitors_names[PF_SKIP_COUNT]; #define PF_SKIP_COMPARITORS { \ { "ifp", PF_SKIP_IFP, skip_cmp_ifp }, \ { "dir", PF_SKIP_DIR, skip_cmp_dir }, \ { "af", PF_SKIP_AF, skip_cmp_af }, \ { "proto", PF_SKIP_PROTO, skip_cmp_proto }, \ { "saddr", PF_SKIP_SRC_ADDR, skip_cmp_src_addr }, \ { "sport", PF_SKIP_SRC_PORT, skip_cmp_src_port }, \ { "daddr", PF_SKIP_DST_ADDR, skip_cmp_dst_addr }, \ { "dport", PF_SKIP_DST_PORT, skip_cmp_dst_port } \ } static struct pfr_buffer table_buffer; static int table_identifier; int pfctl_optimize_ruleset(struct pfctl *pf, struct pf_ruleset *rs) { struct superblocks superblocks; struct pf_opt_queue opt_queue; struct superblock *block; struct pf_opt_rule *por; struct pf_rule *r; struct pf_rulequeue *old_rules; DEBUG("optimizing ruleset"); memset(&table_buffer, 0, sizeof(table_buffer)); skip_init(); TAILQ_INIT(&opt_queue); old_rules = rs->rules[PF_RULESET_FILTER].active.ptr; rs->rules[PF_RULESET_FILTER].active.ptr = rs->rules[PF_RULESET_FILTER].inactive.ptr; rs->rules[PF_RULESET_FILTER].inactive.ptr = old_rules; /* * XXX expanding the pf_opt_rule format throughout pfctl might allow * us to avoid all this copying. */ while ((r = TAILQ_FIRST(rs->rules[PF_RULESET_FILTER].inactive.ptr)) != NULL) { TAILQ_REMOVE(rs->rules[PF_RULESET_FILTER].inactive.ptr, r, entries); if ((por = calloc(1, sizeof(*por))) == NULL) err(1, "calloc"); memcpy(&por->por_rule, r, sizeof(*r)); if (TAILQ_FIRST(&r->rpool.list) != NULL) { TAILQ_INIT(&por->por_rule.rpool.list); pfctl_move_pool(&r->rpool, &por->por_rule.rpool); } else bzero(&por->por_rule.rpool, sizeof(por->por_rule.rpool)); TAILQ_INSERT_TAIL(&opt_queue, por, por_entry); } TAILQ_INIT(&superblocks); if (construct_superblocks(pf, &opt_queue, &superblocks)) goto error; if (pf->optimize & PF_OPTIMIZE_PROFILE) { if (load_feedback_profile(pf, &superblocks)) goto error; } TAILQ_FOREACH(block, &superblocks, sb_entry) { if (optimize_superblock(pf, block)) goto error; } rs->anchor->refcnt = 0; while ((block = TAILQ_FIRST(&superblocks))) { TAILQ_REMOVE(&superblocks, block, sb_entry); while ((por = TAILQ_FIRST(&block->sb_rules))) { TAILQ_REMOVE(&block->sb_rules, por, por_entry); por->por_rule.nr = rs->anchor->refcnt++; if ((r = calloc(1, sizeof(*r))) == NULL) err(1, "calloc"); memcpy(r, &por->por_rule, sizeof(*r)); TAILQ_INIT(&r->rpool.list); pfctl_move_pool(&por->por_rule.rpool, &r->rpool); TAILQ_INSERT_TAIL( rs->rules[PF_RULESET_FILTER].active.ptr, r, entries); free(por); } free(block); } return (0); error: while ((por = TAILQ_FIRST(&opt_queue))) { TAILQ_REMOVE(&opt_queue, por, por_entry); if (por->por_src_tbl) { pfr_buf_clear(por->por_src_tbl->pt_buf); free(por->por_src_tbl->pt_buf); free(por->por_src_tbl); } if (por->por_dst_tbl) { pfr_buf_clear(por->por_dst_tbl->pt_buf); free(por->por_dst_tbl->pt_buf); free(por->por_dst_tbl); } free(por); } while ((block = TAILQ_FIRST(&superblocks))) { TAILQ_REMOVE(&superblocks, block, sb_entry); superblock_free(pf, block); } return (1); } /* * Go ahead and optimize a superblock */ int optimize_superblock(struct pfctl *pf, struct superblock *block) { #ifdef OPT_DEBUG struct pf_opt_rule *por; #endif /* OPT_DEBUG */ /* We have a few optimization passes: * 1) remove duplicate rules or rules that are a subset of other * rules * 2) combine otherwise identical rules with different IP addresses * into a single rule and put the addresses in a table. * 3) re-order the rules to improve kernel skip steps * 4) re-order the 'quick' rules based on feedback from the * active ruleset statistics * * XXX combine_rules() doesn't combine v4 and v6 rules. would just * have to keep af in the table container, make af 'COMBINE' and * twiddle the af on the merged rule * XXX maybe add a weighting to the metric on skipsteps when doing * reordering. sometimes two sequential tables will be better * that four consecutive interfaces. * XXX need to adjust the skipstep count of everything after PROTO, * since they aren't actually checked on a proto mismatch in * pf_test_{tcp, udp, icmp}() * XXX should i treat proto=0, af=0 or dir=0 special in skepstep * calculation since they are a DC? * XXX keep last skiplist of last superblock to influence this * superblock. '5 inet6 log' should make '3 inet6' come before '4 * inet' in the next superblock. * XXX would be useful to add tables for ports * XXX we can also re-order some mutually exclusive superblocks to * try merging superblocks before any of these optimization passes. * for instance a single 'log in' rule in the middle of non-logging * out rules. */ /* shortcut. there will be a lot of 1-rule superblocks */ if (!TAILQ_NEXT(TAILQ_FIRST(&block->sb_rules), por_entry)) return (0); #ifdef OPT_DEBUG printf("--- Superblock ---\n"); TAILQ_FOREACH(por, &block->sb_rules, por_entry) { printf(" "); print_rule(&por->por_rule, por->por_rule.anchor ? por->por_rule.anchor->name : "", 1, 0); } #endif /* OPT_DEBUG */ if (remove_identical_rules(pf, block)) return (1); if (combine_rules(pf, block)) return (1); if ((pf->optimize & PF_OPTIMIZE_PROFILE) && TAILQ_FIRST(&block->sb_rules)->por_rule.quick && block->sb_profiled_block) { if (block_feedback(pf, block)) return (1); } else if (reorder_rules(pf, block, 0)) { return (1); } /* * Don't add any optimization passes below reorder_rules(). It will * have divided superblocks into smaller blocks for further refinement * and doesn't put them back together again. What once was a true * superblock might have been split into multiple superblocks. */ #ifdef OPT_DEBUG printf("--- END Superblock ---\n"); #endif /* OPT_DEBUG */ return (0); } /* * Optimization pass #1: remove identical rules */ int remove_identical_rules(struct pfctl *pf, struct superblock *block) { struct pf_opt_rule *por1, *por2, *por_next, *por2_next; struct pf_rule a, a2, b, b2; for (por1 = TAILQ_FIRST(&block->sb_rules); por1; por1 = por_next) { por_next = TAILQ_NEXT(por1, por_entry); for (por2 = por_next; por2; por2 = por2_next) { por2_next = TAILQ_NEXT(por2, por_entry); comparable_rule(&a, &por1->por_rule, DC); comparable_rule(&b, &por2->por_rule, DC); memcpy(&a2, &a, sizeof(a2)); memcpy(&b2, &b, sizeof(b2)); exclude_supersets(&a, &b); exclude_supersets(&b2, &a2); if (memcmp(&a, &b, sizeof(a)) == 0) { DEBUG("removing identical rule nr%d = *nr%d*", por1->por_rule.nr, por2->por_rule.nr); TAILQ_REMOVE(&block->sb_rules, por2, por_entry); if (por_next == por2) por_next = TAILQ_NEXT(por1, por_entry); free(por2); } else if (memcmp(&a2, &b2, sizeof(a2)) == 0) { DEBUG("removing identical rule *nr%d* = nr%d", por1->por_rule.nr, por2->por_rule.nr); TAILQ_REMOVE(&block->sb_rules, por1, por_entry); free(por1); break; } } } return (0); } /* * Optimization pass #2: combine similar rules with different addresses * into a single rule and a table */ int combine_rules(struct pfctl *pf, struct superblock *block) { struct pf_opt_rule *p1, *p2, *por_next; int src_eq, dst_eq; if ((pf->loadopt & PFCTL_FLAG_TABLE) == 0) { warnx("Must enable table loading for optimizations"); return (1); } /* First we make a pass to combine the rules. O(n log n) */ TAILQ_FOREACH(p1, &block->sb_rules, por_entry) { for (p2 = TAILQ_NEXT(p1, por_entry); p2; p2 = por_next) { por_next = TAILQ_NEXT(p2, por_entry); src_eq = addrs_equal(&p1->por_rule.src, &p2->por_rule.src); dst_eq = addrs_equal(&p1->por_rule.dst, &p2->por_rule.dst); if (src_eq && !dst_eq && p1->por_src_tbl == NULL && p2->por_dst_tbl == NULL && p2->por_src_tbl == NULL && rules_combineable(&p1->por_rule, &p2->por_rule) && addrs_combineable(&p1->por_rule.dst, &p2->por_rule.dst)) { DEBUG("can combine rules nr%d = nr%d", p1->por_rule.nr, p2->por_rule.nr); if (p1->por_dst_tbl == NULL && add_opt_table(pf, &p1->por_dst_tbl, p1->por_rule.af, &p1->por_rule.dst)) return (1); if (add_opt_table(pf, &p1->por_dst_tbl, p1->por_rule.af, &p2->por_rule.dst)) return (1); p2->por_dst_tbl = p1->por_dst_tbl; if (p1->por_dst_tbl->pt_rulecount >= TABLE_THRESHOLD) { TAILQ_REMOVE(&block->sb_rules, p2, por_entry); free(p2); } } else if (!src_eq && dst_eq && p1->por_dst_tbl == NULL && p2->por_src_tbl == NULL && p2->por_dst_tbl == NULL && rules_combineable(&p1->por_rule, &p2->por_rule) && addrs_combineable(&p1->por_rule.src, &p2->por_rule.src)) { DEBUG("can combine rules nr%d = nr%d", p1->por_rule.nr, p2->por_rule.nr); if (p1->por_src_tbl == NULL && add_opt_table(pf, &p1->por_src_tbl, p1->por_rule.af, &p1->por_rule.src)) return (1); if (add_opt_table(pf, &p1->por_src_tbl, p1->por_rule.af, &p2->por_rule.src)) return (1); p2->por_src_tbl = p1->por_src_tbl; if (p1->por_src_tbl->pt_rulecount >= TABLE_THRESHOLD) { TAILQ_REMOVE(&block->sb_rules, p2, por_entry); free(p2); } } } } /* * Then we make a final pass to create a valid table name and * insert the name into the rules. */ for (p1 = TAILQ_FIRST(&block->sb_rules); p1; p1 = por_next) { por_next = TAILQ_NEXT(p1, por_entry); assert(p1->por_src_tbl == NULL || p1->por_dst_tbl == NULL); if (p1->por_src_tbl && p1->por_src_tbl->pt_rulecount >= TABLE_THRESHOLD) { if (p1->por_src_tbl->pt_generated) { /* This rule is included in a table */ TAILQ_REMOVE(&block->sb_rules, p1, por_entry); free(p1); continue; } p1->por_src_tbl->pt_generated = 1; if ((pf->opts & PF_OPT_NOACTION) == 0 && pf_opt_create_table(pf, p1->por_src_tbl)) return (1); pf->tdirty = 1; if (pf->opts & PF_OPT_VERBOSE) print_tabledef(p1->por_src_tbl->pt_name, PFR_TFLAG_CONST, 1, &p1->por_src_tbl->pt_nodes); memset(&p1->por_rule.src.addr, 0, sizeof(p1->por_rule.src.addr)); p1->por_rule.src.addr.type = PF_ADDR_TABLE; strlcpy(p1->por_rule.src.addr.v.tblname, p1->por_src_tbl->pt_name, sizeof(p1->por_rule.src.addr.v.tblname)); pfr_buf_clear(p1->por_src_tbl->pt_buf); free(p1->por_src_tbl->pt_buf); p1->por_src_tbl->pt_buf = NULL; } if (p1->por_dst_tbl && p1->por_dst_tbl->pt_rulecount >= TABLE_THRESHOLD) { if (p1->por_dst_tbl->pt_generated) { /* This rule is included in a table */ TAILQ_REMOVE(&block->sb_rules, p1, por_entry); free(p1); continue; } p1->por_dst_tbl->pt_generated = 1; if ((pf->opts & PF_OPT_NOACTION) == 0 && pf_opt_create_table(pf, p1->por_dst_tbl)) return (1); pf->tdirty = 1; if (pf->opts & PF_OPT_VERBOSE) print_tabledef(p1->por_dst_tbl->pt_name, PFR_TFLAG_CONST, 1, &p1->por_dst_tbl->pt_nodes); memset(&p1->por_rule.dst.addr, 0, sizeof(p1->por_rule.dst.addr)); p1->por_rule.dst.addr.type = PF_ADDR_TABLE; strlcpy(p1->por_rule.dst.addr.v.tblname, p1->por_dst_tbl->pt_name, sizeof(p1->por_rule.dst.addr.v.tblname)); pfr_buf_clear(p1->por_dst_tbl->pt_buf); free(p1->por_dst_tbl->pt_buf); p1->por_dst_tbl->pt_buf = NULL; } } return (0); } /* * Optimization pass #3: re-order rules to improve skip steps */ int reorder_rules(struct pfctl *pf, struct superblock *block, int depth) { struct superblock *newblock; struct pf_skip_step *skiplist; struct pf_opt_rule *por; int i, largest, largest_list, rule_count = 0; TAILQ_HEAD( , pf_opt_rule) head; /* * Calculate the best-case skip steps. We put each rule in a list * of other rules with common fields */ for (i = 0; i < PF_SKIP_COUNT; i++) { TAILQ_FOREACH(por, &block->sb_rules, por_entry) { TAILQ_FOREACH(skiplist, &block->sb_skipsteps[i], ps_entry) { if (skip_compare(i, skiplist, por) == 0) break; } if (skiplist == NULL) { if ((skiplist = calloc(1, sizeof(*skiplist))) == NULL) err(1, "calloc"); TAILQ_INIT(&skiplist->ps_rules); TAILQ_INSERT_TAIL(&block->sb_skipsteps[i], skiplist, ps_entry); } skip_append(block, i, skiplist, por); } } TAILQ_FOREACH(por, &block->sb_rules, por_entry) rule_count++; /* * Now we're going to ignore any fields that are identical between * all of the rules in the superblock and those fields which differ * between every rule in the superblock. */ largest = 0; for (i = 0; i < PF_SKIP_COUNT; i++) { skiplist = TAILQ_FIRST(&block->sb_skipsteps[i]); if (skiplist->ps_count == rule_count) { DEBUG("(%d) original skipstep '%s' is all rules", depth, skip_comparitors_names[i]); skiplist->ps_count = 0; } else if (skiplist->ps_count == 1) { skiplist->ps_count = 0; } else { DEBUG("(%d) original skipstep '%s' largest jump is %d", depth, skip_comparitors_names[i], skiplist->ps_count); if (skiplist->ps_count > largest) largest = skiplist->ps_count; } } if (largest == 0) { /* Ugh. There is NO commonality in the superblock on which * optimize the skipsteps optimization. */ goto done; } /* * Now we're going to empty the superblock rule list and re-create * it based on a more optimal skipstep order. */ TAILQ_INIT(&head); while ((por = TAILQ_FIRST(&block->sb_rules))) { TAILQ_REMOVE(&block->sb_rules, por, por_entry); TAILQ_INSERT_TAIL(&head, por, por_entry); } while (!TAILQ_EMPTY(&head)) { largest = 1; /* * Find the most useful skip steps remaining */ for (i = 0; i < PF_SKIP_COUNT; i++) { skiplist = TAILQ_FIRST(&block->sb_skipsteps[i]); if (skiplist->ps_count > largest) { largest = skiplist->ps_count; largest_list = i; } } if (largest <= 1) { /* * Nothing useful left. Leave remaining rules in order. */ DEBUG("(%d) no more commonality for skip steps", depth); while ((por = TAILQ_FIRST(&head))) { TAILQ_REMOVE(&head, por, por_entry); TAILQ_INSERT_TAIL(&block->sb_rules, por, por_entry); } } else { /* * There is commonality. Extract those common rules * and place them in the ruleset adjacent to each * other. */ skiplist = TAILQ_FIRST(&block->sb_skipsteps[ largest_list]); DEBUG("(%d) skipstep '%s' largest jump is %d @ #%d", depth, skip_comparitors_names[largest_list], largest, TAILQ_FIRST(&TAILQ_FIRST(&block-> sb_skipsteps [largest_list])->ps_rules)-> por_rule.nr); TAILQ_REMOVE(&block->sb_skipsteps[largest_list], skiplist, ps_entry); /* * There may be further commonality inside these * rules. So we'll split them off into they're own * superblock and pass it back into the optimizer. */ if (skiplist->ps_count > 2) { if ((newblock = calloc(1, sizeof(*newblock))) == NULL) { warn("calloc"); return (1); } TAILQ_INIT(&newblock->sb_rules); for (i = 0; i < PF_SKIP_COUNT; i++) TAILQ_INIT(&newblock->sb_skipsteps[i]); TAILQ_INSERT_BEFORE(block, newblock, sb_entry); DEBUG("(%d) splitting off %d rules from superblock @ #%d", depth, skiplist->ps_count, TAILQ_FIRST(&skiplist->ps_rules)-> por_rule.nr); } else { newblock = block; } while ((por = TAILQ_FIRST(&skiplist->ps_rules))) { TAILQ_REMOVE(&head, por, por_entry); TAILQ_REMOVE(&skiplist->ps_rules, por, por_skip_entry[largest_list]); TAILQ_INSERT_TAIL(&newblock->sb_rules, por, por_entry); /* Remove this rule from all other skiplists */ remove_from_skipsteps(&block->sb_skipsteps[ largest_list], block, por, skiplist); } free(skiplist); if (newblock != block) if (reorder_rules(pf, newblock, depth + 1)) return (1); } } done: for (i = 0; i < PF_SKIP_COUNT; i++) { while ((skiplist = TAILQ_FIRST(&block->sb_skipsteps[i]))) { TAILQ_REMOVE(&block->sb_skipsteps[i], skiplist, ps_entry); free(skiplist); } } return (0); } /* * Optimization pass #4: re-order 'quick' rules based on feedback from the * currently running ruleset */ int block_feedback(struct pfctl *pf, struct superblock *block) { TAILQ_HEAD( , pf_opt_rule) queue; struct pf_opt_rule *por1, *por2; u_int64_t total_count = 0; struct pf_rule a, b; /* * Walk through all of the profiled superblock's rules and copy * the counters onto our rules. */ TAILQ_FOREACH(por1, &block->sb_profiled_block->sb_rules, por_entry) { comparable_rule(&a, &por1->por_rule, DC); total_count += por1->por_rule.packets[0] + por1->por_rule.packets[1]; TAILQ_FOREACH(por2, &block->sb_rules, por_entry) { if (por2->por_profile_count) continue; comparable_rule(&b, &por2->por_rule, DC); if (memcmp(&a, &b, sizeof(a)) == 0) { por2->por_profile_count = por1->por_rule.packets[0] + por1->por_rule.packets[1]; break; } } } superblock_free(pf, block->sb_profiled_block); block->sb_profiled_block = NULL; /* * Now we pull all of the rules off the superblock and re-insert them * in sorted order. */ TAILQ_INIT(&queue); while ((por1 = TAILQ_FIRST(&block->sb_rules)) != NULL) { TAILQ_REMOVE(&block->sb_rules, por1, por_entry); TAILQ_INSERT_TAIL(&queue, por1, por_entry); } while ((por1 = TAILQ_FIRST(&queue)) != NULL) { TAILQ_REMOVE(&queue, por1, por_entry); /* XXX I should sort all of the unused rules based on skip steps */ TAILQ_FOREACH(por2, &block->sb_rules, por_entry) { if (por1->por_profile_count > por2->por_profile_count) { TAILQ_INSERT_BEFORE(por2, por1, por_entry); break; } } #ifdef __FreeBSD__ if (por2 == NULL) #else if (por2 == TAILQ_END(&block->sb_rules)) #endif TAILQ_INSERT_TAIL(&block->sb_rules, por1, por_entry); } return (0); } /* * Load the current ruleset from the kernel and try to associate them with * the ruleset we're optimizing. */ int load_feedback_profile(struct pfctl *pf, struct superblocks *superblocks) { struct superblock *block, *blockcur; struct superblocks prof_superblocks; struct pf_opt_rule *por; struct pf_opt_queue queue; struct pfioc_rule pr; struct pf_rule a, b; int nr, mnr; TAILQ_INIT(&queue); TAILQ_INIT(&prof_superblocks); memset(&pr, 0, sizeof(pr)); pr.rule.action = PF_PASS; if (ioctl(pf->dev, DIOCGETRULES, &pr)) { warn("DIOCGETRULES"); return (1); } mnr = pr.nr; DEBUG("Loading %d active rules for a feedback profile", mnr); for (nr = 0; nr < mnr; ++nr) { struct pf_ruleset *rs; if ((por = calloc(1, sizeof(*por))) == NULL) { warn("calloc"); return (1); } pr.nr = nr; if (ioctl(pf->dev, DIOCGETRULE, &pr)) { warn("DIOCGETRULES"); return (1); } memcpy(&por->por_rule, &pr.rule, sizeof(por->por_rule)); rs = pf_find_or_create_ruleset(pr.anchor_call); por->por_rule.anchor = rs->anchor; if (TAILQ_EMPTY(&por->por_rule.rpool.list)) memset(&por->por_rule.rpool, 0, sizeof(por->por_rule.rpool)); TAILQ_INSERT_TAIL(&queue, por, por_entry); /* XXX pfctl_get_pool(pf->dev, &pr.rule.rpool, nr, pr.ticket, * PF_PASS, pf->anchor) ??? * ... pfctl_clear_pool(&pr.rule.rpool) */ } if (construct_superblocks(pf, &queue, &prof_superblocks)) return (1); /* * Now we try to associate the active ruleset's superblocks with * the superblocks we're compiling. */ block = TAILQ_FIRST(superblocks); blockcur = TAILQ_FIRST(&prof_superblocks); while (block && blockcur) { comparable_rule(&a, &TAILQ_FIRST(&block->sb_rules)->por_rule, BREAK); comparable_rule(&b, &TAILQ_FIRST(&blockcur->sb_rules)->por_rule, BREAK); if (memcmp(&a, &b, sizeof(a)) == 0) { /* The two superblocks lined up */ block->sb_profiled_block = blockcur; } else { DEBUG("superblocks don't line up between #%d and #%d", TAILQ_FIRST(&block->sb_rules)->por_rule.nr, TAILQ_FIRST(&blockcur->sb_rules)->por_rule.nr); break; } block = TAILQ_NEXT(block, sb_entry); blockcur = TAILQ_NEXT(blockcur, sb_entry); } /* Free any superblocks we couldn't link */ while (blockcur) { block = TAILQ_NEXT(blockcur, sb_entry); superblock_free(pf, blockcur); blockcur = block; } return (0); } /* * Compare a rule to a skiplist to see if the rule is a member */ int skip_compare(int skipnum, struct pf_skip_step *skiplist, struct pf_opt_rule *por) { struct pf_rule *a, *b; if (skipnum >= PF_SKIP_COUNT || skipnum < 0) errx(1, "skip_compare() out of bounds"); a = &por->por_rule; b = &TAILQ_FIRST(&skiplist->ps_rules)->por_rule; return ((skip_comparitors[skipnum])(a, b)); } /* * Add a rule to a skiplist */ void skip_append(struct superblock *superblock, int skipnum, struct pf_skip_step *skiplist, struct pf_opt_rule *por) { struct pf_skip_step *prev; skiplist->ps_count++; TAILQ_INSERT_TAIL(&skiplist->ps_rules, por, por_skip_entry[skipnum]); /* Keep the list of skiplists sorted by whichever is larger */ while ((prev = TAILQ_PREV(skiplist, skiplist, ps_entry)) && prev->ps_count < skiplist->ps_count) { TAILQ_REMOVE(&superblock->sb_skipsteps[skipnum], skiplist, ps_entry); TAILQ_INSERT_BEFORE(prev, skiplist, ps_entry); } } /* * Remove a rule from the other skiplist calculations. */ void remove_from_skipsteps(struct skiplist *head, struct superblock *block, struct pf_opt_rule *por, struct pf_skip_step *active_list) { struct pf_skip_step *sk, *next; struct pf_opt_rule *p2; int i, found; for (i = 0; i < PF_SKIP_COUNT; i++) { sk = TAILQ_FIRST(&block->sb_skipsteps[i]); if (sk == NULL || sk == active_list || sk->ps_count <= 1) continue; found = 0; do { TAILQ_FOREACH(p2, &sk->ps_rules, por_skip_entry[i]) if (p2 == por) { TAILQ_REMOVE(&sk->ps_rules, p2, por_skip_entry[i]); found = 1; sk->ps_count--; break; } } while (!found && (sk = TAILQ_NEXT(sk, ps_entry))); if (found && sk) { /* Does this change the sorting order? */ while ((next = TAILQ_NEXT(sk, ps_entry)) && next->ps_count > sk->ps_count) { TAILQ_REMOVE(head, sk, ps_entry); TAILQ_INSERT_AFTER(head, next, sk, ps_entry); } #ifdef OPT_DEBUG next = TAILQ_NEXT(sk, ps_entry); assert(next == NULL || next->ps_count <= sk->ps_count); #endif /* OPT_DEBUG */ } } } /* Compare two rules AF field for skiplist construction */ int skip_cmp_af(struct pf_rule *a, struct pf_rule *b) { if (a->af != b->af || a->af == 0) return (1); return (0); } /* Compare two rules DIRECTION field for skiplist construction */ int skip_cmp_dir(struct pf_rule *a, struct pf_rule *b) { if (a->direction == 0 || a->direction != b->direction) return (1); return (0); } /* Compare two rules DST Address field for skiplist construction */ int skip_cmp_dst_addr(struct pf_rule *a, struct pf_rule *b) { if (a->dst.neg != b->dst.neg || a->dst.addr.type != b->dst.addr.type) return (1); /* XXX if (a->proto != b->proto && a->proto != 0 && b->proto != 0 * && (a->proto == IPPROTO_TCP || a->proto == IPPROTO_UDP || * a->proto == IPPROTO_ICMP * return (1); */ switch (a->dst.addr.type) { case PF_ADDR_ADDRMASK: if (memcmp(&a->dst.addr.v.a.addr, &b->dst.addr.v.a.addr, sizeof(a->dst.addr.v.a.addr)) || memcmp(&a->dst.addr.v.a.mask, &b->dst.addr.v.a.mask, sizeof(a->dst.addr.v.a.mask)) || (a->dst.addr.v.a.addr.addr32[0] == 0 && a->dst.addr.v.a.addr.addr32[1] == 0 && a->dst.addr.v.a.addr.addr32[2] == 0 && a->dst.addr.v.a.addr.addr32[3] == 0)) return (1); return (0); case PF_ADDR_DYNIFTL: if (strcmp(a->dst.addr.v.ifname, b->dst.addr.v.ifname) != 0 || a->dst.addr.iflags != a->dst.addr.iflags || memcmp(&a->dst.addr.v.a.mask, &b->dst.addr.v.a.mask, sizeof(a->dst.addr.v.a.mask))) return (1); return (0); case PF_ADDR_NOROUTE: case PF_ADDR_URPFFAILED: return (0); case PF_ADDR_TABLE: return (strcmp(a->dst.addr.v.tblname, b->dst.addr.v.tblname)); } return (1); } /* Compare two rules DST port field for skiplist construction */ int skip_cmp_dst_port(struct pf_rule *a, struct pf_rule *b) { /* XXX if (a->proto != b->proto && a->proto != 0 && b->proto != 0 * && (a->proto == IPPROTO_TCP || a->proto == IPPROTO_UDP || * a->proto == IPPROTO_ICMP * return (1); */ if (a->dst.port_op == PF_OP_NONE || a->dst.port_op != b->dst.port_op || a->dst.port[0] != b->dst.port[0] || a->dst.port[1] != b->dst.port[1]) return (1); return (0); } /* Compare two rules IFP field for skiplist construction */ int skip_cmp_ifp(struct pf_rule *a, struct pf_rule *b) { if (strcmp(a->ifname, b->ifname) || a->ifname[0] == '\0') return (1); return (a->ifnot != b->ifnot); } /* Compare two rules PROTO field for skiplist construction */ int skip_cmp_proto(struct pf_rule *a, struct pf_rule *b) { return (a->proto != b->proto || a->proto == 0); } /* Compare two rules SRC addr field for skiplist construction */ int skip_cmp_src_addr(struct pf_rule *a, struct pf_rule *b) { if (a->src.neg != b->src.neg || a->src.addr.type != b->src.addr.type) return (1); /* XXX if (a->proto != b->proto && a->proto != 0 && b->proto != 0 * && (a->proto == IPPROTO_TCP || a->proto == IPPROTO_UDP || * a->proto == IPPROTO_ICMP * return (1); */ switch (a->src.addr.type) { case PF_ADDR_ADDRMASK: if (memcmp(&a->src.addr.v.a.addr, &b->src.addr.v.a.addr, sizeof(a->src.addr.v.a.addr)) || memcmp(&a->src.addr.v.a.mask, &b->src.addr.v.a.mask, sizeof(a->src.addr.v.a.mask)) || (a->src.addr.v.a.addr.addr32[0] == 0 && a->src.addr.v.a.addr.addr32[1] == 0 && a->src.addr.v.a.addr.addr32[2] == 0 && a->src.addr.v.a.addr.addr32[3] == 0)) return (1); return (0); case PF_ADDR_DYNIFTL: if (strcmp(a->src.addr.v.ifname, b->src.addr.v.ifname) != 0 || a->src.addr.iflags != a->src.addr.iflags || memcmp(&a->src.addr.v.a.mask, &b->src.addr.v.a.mask, sizeof(a->src.addr.v.a.mask))) return (1); return (0); case PF_ADDR_NOROUTE: case PF_ADDR_URPFFAILED: return (0); case PF_ADDR_TABLE: return (strcmp(a->src.addr.v.tblname, b->src.addr.v.tblname)); } return (1); } /* Compare two rules SRC port field for skiplist construction */ int skip_cmp_src_port(struct pf_rule *a, struct pf_rule *b) { if (a->src.port_op == PF_OP_NONE || a->src.port_op != b->src.port_op || a->src.port[0] != b->src.port[0] || a->src.port[1] != b->src.port[1]) return (1); /* XXX if (a->proto != b->proto && a->proto != 0 && b->proto != 0 * && (a->proto == IPPROTO_TCP || a->proto == IPPROTO_UDP || * a->proto == IPPROTO_ICMP * return (1); */ return (0); } void skip_init(void) { struct { char *name; int skipnum; int (*func)(struct pf_rule *, struct pf_rule *); } comps[] = PF_SKIP_COMPARITORS; int skipnum, i; for (skipnum = 0; skipnum < PF_SKIP_COUNT; skipnum++) { for (i = 0; i < sizeof(comps)/sizeof(*comps); i++) if (comps[i].skipnum == skipnum) { skip_comparitors[skipnum] = comps[i].func; skip_comparitors_names[skipnum] = comps[i].name; } } for (skipnum = 0; skipnum < PF_SKIP_COUNT; skipnum++) if (skip_comparitors[skipnum] == NULL) errx(1, "Need to add skip step comparitor to pfctl?!"); } /* * Add a host/netmask to a table */ int add_opt_table(struct pfctl *pf, struct pf_opt_tbl **tbl, sa_family_t af, struct pf_rule_addr *addr) { #ifdef OPT_DEBUG char buf[128]; #endif /* OPT_DEBUG */ static int tablenum = 0; struct node_host node_host; if (*tbl == NULL) { if ((*tbl = calloc(1, sizeof(**tbl))) == NULL || ((*tbl)->pt_buf = calloc(1, sizeof(*(*tbl)->pt_buf))) == NULL) err(1, "calloc"); (*tbl)->pt_buf->pfrb_type = PFRB_ADDRS; SIMPLEQ_INIT(&(*tbl)->pt_nodes); /* This is just a temporary table name */ snprintf((*tbl)->pt_name, sizeof((*tbl)->pt_name), "%s%d", PF_OPT_TABLE_PREFIX, tablenum++); DEBUG("creating table <%s>", (*tbl)->pt_name); } memset(&node_host, 0, sizeof(node_host)); node_host.af = af; node_host.addr = addr->addr; #ifdef OPT_DEBUG DEBUG("<%s> adding %s/%d", (*tbl)->pt_name, inet_ntop(af, &node_host.addr.v.a.addr, buf, sizeof(buf)), unmask(&node_host.addr.v.a.mask, af)); #endif /* OPT_DEBUG */ if (append_addr_host((*tbl)->pt_buf, &node_host, 0, 0)) { warn("failed to add host"); return (1); } if (pf->opts & PF_OPT_VERBOSE) { struct node_tinit *ti; if ((ti = calloc(1, sizeof(*ti))) == NULL) err(1, "malloc"); if ((ti->host = malloc(sizeof(*ti->host))) == NULL) err(1, "malloc"); memcpy(ti->host, &node_host, sizeof(*ti->host)); SIMPLEQ_INSERT_TAIL(&(*tbl)->pt_nodes, ti, entries); } (*tbl)->pt_rulecount++; if ((*tbl)->pt_rulecount == TABLE_THRESHOLD) DEBUG("table <%s> now faster than skip steps", (*tbl)->pt_name); return (0); } /* * Do the dirty work of choosing an unused table name and creating it. * (be careful with the table name, it might already be used in another anchor) */ int pf_opt_create_table(struct pfctl *pf, struct pf_opt_tbl *tbl) { static int tablenum; struct pfr_table *t; if (table_buffer.pfrb_type == 0) { /* Initialize the list of tables */ table_buffer.pfrb_type = PFRB_TABLES; for (;;) { pfr_buf_grow(&table_buffer, table_buffer.pfrb_size); table_buffer.pfrb_size = table_buffer.pfrb_msize; if (pfr_get_tables(NULL, table_buffer.pfrb_caddr, &table_buffer.pfrb_size, PFR_FLAG_ALLRSETS)) err(1, "pfr_get_tables"); if (table_buffer.pfrb_size <= table_buffer.pfrb_msize) break; } table_identifier = arc4random(); } /* XXX would be *really* nice to avoid duplicating identical tables */ /* Now we have to pick a table name that isn't used */ again: DEBUG("translating temporary table <%s> to <%s%x_%d>", tbl->pt_name, PF_OPT_TABLE_PREFIX, table_identifier, tablenum); snprintf(tbl->pt_name, sizeof(tbl->pt_name), "%s%x_%d", PF_OPT_TABLE_PREFIX, table_identifier, tablenum); PFRB_FOREACH(t, &table_buffer) { if (strcasecmp(t->pfrt_name, tbl->pt_name) == 0) { /* Collision. Try again */ DEBUG("wow, table <%s> in use. trying again", tbl->pt_name); table_identifier = arc4random(); goto again; } } tablenum++; if (pfctl_define_table(tbl->pt_name, PFR_TFLAG_CONST, 1, pf->astack[0]->name, tbl->pt_buf, pf->astack[0]->ruleset.tticket)) { warn("failed to create table %s in %s", tbl->pt_name, pf->astack[0]->name); return (1); } return (0); } /* * Partition the flat ruleset into a list of distinct superblocks */ int construct_superblocks(struct pfctl *pf, struct pf_opt_queue *opt_queue, struct superblocks *superblocks) { struct superblock *block = NULL; struct pf_opt_rule *por; int i; while (!TAILQ_EMPTY(opt_queue)) { por = TAILQ_FIRST(opt_queue); TAILQ_REMOVE(opt_queue, por, por_entry); if (block == NULL || !superblock_inclusive(block, por)) { if ((block = calloc(1, sizeof(*block))) == NULL) { warn("calloc"); return (1); } TAILQ_INIT(&block->sb_rules); for (i = 0; i < PF_SKIP_COUNT; i++) TAILQ_INIT(&block->sb_skipsteps[i]); TAILQ_INSERT_TAIL(superblocks, block, sb_entry); } TAILQ_INSERT_TAIL(&block->sb_rules, por, por_entry); } return (0); } /* * Compare two rule addresses */ int addrs_equal(struct pf_rule_addr *a, struct pf_rule_addr *b) { if (a->neg != b->neg) return (0); return (memcmp(&a->addr, &b->addr, sizeof(a->addr)) == 0); } /* * The addresses are not equal, but can we combine them into one table? */ int addrs_combineable(struct pf_rule_addr *a, struct pf_rule_addr *b) { if (a->addr.type != PF_ADDR_ADDRMASK || b->addr.type != PF_ADDR_ADDRMASK) return (0); if (a->neg != b->neg || a->port_op != b->port_op || a->port[0] != b->port[0] || a->port[1] != b->port[1]) return (0); return (1); } /* * Are we allowed to combine these two rules */ int rules_combineable(struct pf_rule *p1, struct pf_rule *p2) { struct pf_rule a, b; comparable_rule(&a, p1, COMBINED); comparable_rule(&b, p2, COMBINED); return (memcmp(&a, &b, sizeof(a)) == 0); } /* * Can a rule be included inside a superblock */ int superblock_inclusive(struct superblock *block, struct pf_opt_rule *por) { struct pf_rule a, b; int i, j; /* First check for hard breaks */ for (i = 0; i < sizeof(pf_rule_desc)/sizeof(*pf_rule_desc); i++) { if (pf_rule_desc[i].prf_type == BARRIER) { for (j = 0; j < pf_rule_desc[i].prf_size; j++) if (((char *)&por->por_rule)[j + pf_rule_desc[i].prf_offset] != 0) return (0); } } /* per-rule src-track is also a hard break */ if (por->por_rule.rule_flag & PFRULE_RULESRCTRACK) return (0); /* * Have to handle interface groups separately. Consider the following * rules: * block on EXTIFS to any port 22 * pass on em0 to any port 22 * (where EXTIFS is an arbitrary interface group) * The optimizer may decide to re-order the pass rule in front of the * block rule. But what if EXTIFS includes em0??? Such a reordering * would change the meaning of the ruleset. * We can't just lookup the EXTIFS group and check if em0 is a member * because the user is allowed to add interfaces to a group during * runtime. * Ergo interface groups become a defacto superblock break :-( */ if (interface_group(por->por_rule.ifname) || interface_group(TAILQ_FIRST(&block->sb_rules)->por_rule.ifname)) { if (strcasecmp(por->por_rule.ifname, TAILQ_FIRST(&block->sb_rules)->por_rule.ifname) != 0) return (0); } comparable_rule(&a, &TAILQ_FIRST(&block->sb_rules)->por_rule, NOMERGE); comparable_rule(&b, &por->por_rule, NOMERGE); if (memcmp(&a, &b, sizeof(a)) == 0) return (1); #ifdef OPT_DEBUG for (i = 0; i < sizeof(por->por_rule); i++) { int closest = -1; if (((u_int8_t *)&a)[i] != ((u_int8_t *)&b)[i]) { for (j = 0; j < sizeof(pf_rule_desc) / sizeof(*pf_rule_desc); j++) { if (i >= pf_rule_desc[j].prf_offset && i < pf_rule_desc[j].prf_offset + pf_rule_desc[j].prf_size) { DEBUG("superblock break @ %d due to %s", por->por_rule.nr, pf_rule_desc[j].prf_name); return (0); } if (i > pf_rule_desc[j].prf_offset) { if (closest == -1 || i-pf_rule_desc[j].prf_offset < i-pf_rule_desc[closest].prf_offset) closest = j; } } if (closest >= 0) DEBUG("superblock break @ %d on %s+%xh", por->por_rule.nr, pf_rule_desc[closest].prf_name, i - pf_rule_desc[closest].prf_offset - pf_rule_desc[closest].prf_size); else DEBUG("superblock break @ %d on field @ %d", por->por_rule.nr, i); return (0); } } #endif /* OPT_DEBUG */ return (0); } /* * Figure out if an interface name is an actual interface or actually a * group of interfaces. */ int interface_group(const char *ifname) { if (ifname == NULL || !ifname[0]) return (0); /* Real interfaces must end in a number, interface groups do not */ if (isdigit(ifname[strlen(ifname) - 1])) return (0); else return (1); } /* * Make a rule that can directly compared by memcmp() */ void comparable_rule(struct pf_rule *dst, const struct pf_rule *src, int type) { int i; /* * To simplify the comparison, we just zero out the fields that are * allowed to be different and then do a simple memcmp() */ memcpy(dst, src, sizeof(*dst)); for (i = 0; i < sizeof(pf_rule_desc)/sizeof(*pf_rule_desc); i++) if (pf_rule_desc[i].prf_type >= type) { #ifdef OPT_DEBUG assert(pf_rule_desc[i].prf_type != NEVER || *(((char *)dst) + pf_rule_desc[i].prf_offset) == 0); #endif /* OPT_DEBUG */ memset(((char *)dst) + pf_rule_desc[i].prf_offset, 0, pf_rule_desc[i].prf_size); } } /* * Remove superset information from two rules so we can directly compare them * with memcmp() */ void exclude_supersets(struct pf_rule *super, struct pf_rule *sub) { if (super->ifname[0] == '\0') memset(sub->ifname, 0, sizeof(sub->ifname)); if (super->direction == PF_INOUT) sub->direction = PF_INOUT; if ((super->proto == 0 || super->proto == sub->proto) && super->flags == 0 && super->flagset == 0 && (sub->flags || sub->flagset)) { sub->flags = super->flags; sub->flagset = super->flagset; } if (super->proto == 0) sub->proto = 0; if (super->src.port_op == 0) { sub->src.port_op = 0; sub->src.port[0] = 0; sub->src.port[1] = 0; } if (super->dst.port_op == 0) { sub->dst.port_op = 0; sub->dst.port[0] = 0; sub->dst.port[1] = 0; } if (super->src.addr.type == PF_ADDR_ADDRMASK && !super->src.neg && !sub->src.neg && super->src.addr.v.a.mask.addr32[0] == 0 && super->src.addr.v.a.mask.addr32[1] == 0 && super->src.addr.v.a.mask.addr32[2] == 0 && super->src.addr.v.a.mask.addr32[3] == 0) memset(&sub->src.addr, 0, sizeof(sub->src.addr)); else if (super->src.addr.type == PF_ADDR_ADDRMASK && sub->src.addr.type == PF_ADDR_ADDRMASK && super->src.neg == sub->src.neg && super->af == sub->af && unmask(&super->src.addr.v.a.mask, super->af) < unmask(&sub->src.addr.v.a.mask, sub->af) && super->src.addr.v.a.addr.addr32[0] == (sub->src.addr.v.a.addr.addr32[0] & super->src.addr.v.a.mask.addr32[0]) && super->src.addr.v.a.addr.addr32[1] == (sub->src.addr.v.a.addr.addr32[1] & super->src.addr.v.a.mask.addr32[1]) && super->src.addr.v.a.addr.addr32[2] == (sub->src.addr.v.a.addr.addr32[2] & super->src.addr.v.a.mask.addr32[2]) && super->src.addr.v.a.addr.addr32[3] == (sub->src.addr.v.a.addr.addr32[3] & super->src.addr.v.a.mask.addr32[3])) { /* sub->src.addr is a subset of super->src.addr/mask */ memcpy(&sub->src.addr, &super->src.addr, sizeof(sub->src.addr)); } if (super->dst.addr.type == PF_ADDR_ADDRMASK && !super->dst.neg && !sub->dst.neg && super->dst.addr.v.a.mask.addr32[0] == 0 && super->dst.addr.v.a.mask.addr32[1] == 0 && super->dst.addr.v.a.mask.addr32[2] == 0 && super->dst.addr.v.a.mask.addr32[3] == 0) memset(&sub->dst.addr, 0, sizeof(sub->dst.addr)); else if (super->dst.addr.type == PF_ADDR_ADDRMASK && sub->dst.addr.type == PF_ADDR_ADDRMASK && super->dst.neg == sub->dst.neg && super->af == sub->af && unmask(&super->dst.addr.v.a.mask, super->af) < unmask(&sub->dst.addr.v.a.mask, sub->af) && super->dst.addr.v.a.addr.addr32[0] == (sub->dst.addr.v.a.addr.addr32[0] & super->dst.addr.v.a.mask.addr32[0]) && super->dst.addr.v.a.addr.addr32[1] == (sub->dst.addr.v.a.addr.addr32[1] & super->dst.addr.v.a.mask.addr32[1]) && super->dst.addr.v.a.addr.addr32[2] == (sub->dst.addr.v.a.addr.addr32[2] & super->dst.addr.v.a.mask.addr32[2]) && super->dst.addr.v.a.addr.addr32[3] == (sub->dst.addr.v.a.addr.addr32[3] & super->dst.addr.v.a.mask.addr32[3])) { /* sub->dst.addr is a subset of super->dst.addr/mask */ memcpy(&sub->dst.addr, &super->dst.addr, sizeof(sub->dst.addr)); } if (super->af == 0) sub->af = 0; } void superblock_free(struct pfctl *pf, struct superblock *block) { struct pf_opt_rule *por; while ((por = TAILQ_FIRST(&block->sb_rules))) { TAILQ_REMOVE(&block->sb_rules, por, por_entry); if (por->por_src_tbl) { if (por->por_src_tbl->pt_buf) { pfr_buf_clear(por->por_src_tbl->pt_buf); free(por->por_src_tbl->pt_buf); } free(por->por_src_tbl); } if (por->por_dst_tbl) { if (por->por_dst_tbl->pt_buf) { pfr_buf_clear(por->por_dst_tbl->pt_buf); free(por->por_dst_tbl->pt_buf); } free(por->por_dst_tbl); } free(por); } if (block->sb_profiled_block) superblock_free(pf, block->sb_profiled_block); free(block); } Index: head/sys/amd64/vmm/io/vlapic.c =================================================================== --- head/sys/amd64/vmm/io/vlapic.c (revision 308456) +++ head/sys/amd64/vmm/io/vlapic.c (revision 308457) @@ -1,1654 +1,1654 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_ktr.h" #include "vmm_stat.h" #include "vlapic.h" #include "vlapic_priv.h" #include "vioapic.h" #define PRIO(x) ((x) >> 4) #define VLAPIC_VERSION (16) #define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0) /* * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the * vlapic_callout_handler() and vcpu accesses to: * - timer_freq_bt, timer_period_bt, timer_fire_bt * - timer LVT register */ #define VLAPIC_TIMER_LOCK(vlapic) mtx_lock_spin(&((vlapic)->timer_mtx)) #define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx)) #define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx)) /* * APIC timer frequency: * - arbitrary but chosen to be in the ballpark of contemporary hardware. * - power-of-two to avoid loss of precision when converted to a bintime. */ #define VLAPIC_BUS_FREQ (128 * 1024 * 1024) static __inline uint32_t vlapic_get_id(struct vlapic *vlapic) { if (x2apic(vlapic)) return (vlapic->vcpuid); else return (vlapic->vcpuid << 24); } static uint32_t x2apic_ldr(struct vlapic *vlapic) { int apicid; uint32_t ldr; apicid = vlapic_get_id(vlapic); ldr = 1 << (apicid & 0xf); ldr |= (apicid & 0xffff0) << 12; return (ldr); } void vlapic_dfr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; if (x2apic(vlapic)) { VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x", lapic->dfr); lapic->dfr = 0; return; } lapic->dfr &= APIC_DFR_MODEL_MASK; lapic->dfr |= APIC_DFR_RESERVED; if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model"); else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model"); else VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr); } void vlapic_ldr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; /* LDR is read-only in x2apic mode */ if (x2apic(vlapic)) { VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x", lapic->ldr); lapic->ldr = x2apic_ldr(vlapic); } else { lapic->ldr &= ~APIC_LDR_RESERVED; VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr); } } void vlapic_id_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; /* * We don't allow the ID register to be modified so reset it back to * its default value. */ lapic = vlapic->apic_page; lapic->id = vlapic_get_id(vlapic); } static int vlapic_timer_divisor(uint32_t dcr) { switch (dcr & 0xB) { case APIC_TDCR_1: return (1); case APIC_TDCR_2: return (2); case APIC_TDCR_4: return (4); case APIC_TDCR_8: return (8); case APIC_TDCR_16: return (16); case APIC_TDCR_32: return (32); case APIC_TDCR_64: return (64); case APIC_TDCR_128: return (128); default: panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr); } } #if 0 static inline void vlapic_dump_lvt(uint32_t offset, uint32_t *lvt) { printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset, *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS, *lvt & APIC_LVTT_M); } #endif static uint32_t vlapic_get_ccr(struct vlapic *vlapic) { struct bintime bt_now, bt_rem; struct LAPIC *lapic; uint32_t ccr; ccr = 0; lapic = vlapic->apic_page; VLAPIC_TIMER_LOCK(vlapic); if (callout_active(&vlapic->callout)) { /* * If the timer is scheduled to expire in the future then * compute the value of 'ccr' based on the remaining time. */ binuptime(&bt_now); if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) { bt_rem = vlapic->timer_fire_bt; bintime_sub(&bt_rem, &bt_now); ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt); ccr += bt_rem.frac / vlapic->timer_freq_bt.frac; } } KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, " "icr_timer is %#x", ccr, lapic->icr_timer)); VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x", ccr, lapic->icr_timer); VLAPIC_TIMER_UNLOCK(vlapic); return (ccr); } void vlapic_dcr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; int divisor; lapic = vlapic->apic_page; VLAPIC_TIMER_LOCK(vlapic); divisor = vlapic_timer_divisor(lapic->dcr_timer); VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d", lapic->dcr_timer, divisor); /* * Update the timer frequency and the timer period. * * XXX changes to the frequency divider will not take effect until * the timer is reloaded. */ FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt); vlapic->timer_period_bt = vlapic->timer_freq_bt; bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer); VLAPIC_TIMER_UNLOCK(vlapic); } void vlapic_esr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; lapic->esr = vlapic->esr_pending; vlapic->esr_pending = 0; } int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) { struct LAPIC *lapic; uint32_t *irrptr, *tmrptr, mask; int idx; KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector)); lapic = vlapic->apic_page; if (!(lapic->svr & APIC_SVR_ENABLE)) { VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring " "interrupt %d", vector); return (0); } if (vector < 16) { vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR); VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d", vector); return (1); } if (vlapic->ops.set_intr_ready) return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level)); idx = (vector / 32) * 4; mask = 1 << (vector % 32); irrptr = &lapic->irr0; atomic_set_int(&irrptr[idx], mask); /* * Verify that the trigger-mode of the interrupt matches with * the vlapic TMR registers. */ tmrptr = &lapic->tmr0; if ((tmrptr[idx] & mask) != (level ? mask : 0)) { VLAPIC_CTR3(vlapic, "vlapic TMR[%d] is 0x%08x but " "interrupt is %s-triggered", idx / 4, tmrptr[idx], level ? "level" : "edge"); } VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); return (1); } static __inline uint32_t * vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset) { struct LAPIC *lapic = vlapic->apic_page; int i; switch (offset) { case APIC_OFFSET_CMCI_LVT: return (&lapic->lvt_cmci); case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: i = (offset - APIC_OFFSET_TIMER_LVT) >> 2; return ((&lapic->lvt_timer) + i);; default: panic("vlapic_get_lvt: invalid LVT\n"); } } static __inline int lvt_off_to_idx(uint32_t offset) { int index; switch (offset) { case APIC_OFFSET_CMCI_LVT: index = APIC_LVT_CMCI; break; case APIC_OFFSET_TIMER_LVT: index = APIC_LVT_TIMER; break; case APIC_OFFSET_THERM_LVT: index = APIC_LVT_THERMAL; break; case APIC_OFFSET_PERF_LVT: index = APIC_LVT_PMC; break; case APIC_OFFSET_LINT0_LVT: index = APIC_LVT_LINT0; break; case APIC_OFFSET_LINT1_LVT: index = APIC_LVT_LINT1; break; case APIC_OFFSET_ERROR_LVT: index = APIC_LVT_ERROR; break; default: index = -1; break; } KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: " "invalid lvt index %d for offset %#x", index, offset)); return (index); } static __inline uint32_t vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset) { int idx; uint32_t val; idx = lvt_off_to_idx(offset); val = atomic_load_acq_32(&vlapic->lvt_last[idx]); return (val); } void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset) { uint32_t *lvtptr, mask, val; struct LAPIC *lapic; int idx; lapic = vlapic->apic_page; lvtptr = vlapic_get_lvtptr(vlapic, offset); val = *lvtptr; idx = lvt_off_to_idx(offset); if (!(lapic->svr & APIC_SVR_ENABLE)) val |= APIC_LVT_M; mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR; switch (offset) { case APIC_OFFSET_TIMER_LVT: mask |= APIC_LVTT_TM; break; case APIC_OFFSET_ERROR_LVT: break; case APIC_OFFSET_LINT0_LVT: case APIC_OFFSET_LINT1_LVT: mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP; /* FALLTHROUGH */ default: mask |= APIC_LVT_DM; break; } val &= mask; *lvtptr = val; atomic_store_rel_32(&vlapic->lvt_last[idx], val); } static void vlapic_mask_lvts(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; lapic->lvt_cmci |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT); lapic->lvt_timer |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT); lapic->lvt_thermal |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT); lapic->lvt_pcint |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT); lapic->lvt_lint0 |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT); lapic->lvt_lint1 |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT); lapic->lvt_error |= APIC_LVT_M; vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT); } static int vlapic_fire_lvt(struct vlapic *vlapic, uint32_t lvt) { uint32_t vec, mode; if (lvt & APIC_LVT_M) return (0); vec = lvt & APIC_LVT_VECTOR; mode = lvt & APIC_LVT_DM; switch (mode) { case APIC_LVT_DM_FIXED: if (vec < 16) { vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); return (0); } if (vlapic_set_intr_ready(vlapic, vec, false)) vcpu_notify_event(vlapic->vm, vlapic->vcpuid, true); break; case APIC_LVT_DM_NMI: vm_inject_nmi(vlapic->vm, vlapic->vcpuid); break; case APIC_LVT_DM_EXTINT: vm_inject_extint(vlapic->vm, vlapic->vcpuid); break; default: // Other modes ignored return (0); } return (1); } #if 1 static void dump_isrvec_stk(struct vlapic *vlapic) { int i; uint32_t *isrptr; isrptr = &vlapic->apic_page->isr0; for (i = 0; i < 8; i++) printf("ISR%d 0x%08x\n", i, isrptr[i * 4]); for (i = 0; i <= vlapic->isrvec_stk_top; i++) printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]); } #endif /* * Algorithm adopted from section "Interrupt, Task and Processor Priority" * in Intel Architecture Manual Vol 3a. */ static void vlapic_update_ppr(struct vlapic *vlapic) { int isrvec, tpr, ppr; /* * Note that the value on the stack at index 0 is always 0. * * This is a placeholder for the value of ISRV when none of the * bits is set in the ISRx registers. */ isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top]; tpr = vlapic->apic_page->tpr; #if 1 { int i, lastprio, curprio, vector, idx; uint32_t *isrptr; if (vlapic->isrvec_stk_top == 0 && isrvec != 0) panic("isrvec_stk is corrupted: %d", isrvec); /* * Make sure that the priority of the nested interrupts is * always increasing. */ lastprio = -1; for (i = 1; i <= vlapic->isrvec_stk_top; i++) { curprio = PRIO(vlapic->isrvec_stk[i]); if (curprio <= lastprio) { dump_isrvec_stk(vlapic); panic("isrvec_stk does not satisfy invariant"); } lastprio = curprio; } /* * Make sure that each bit set in the ISRx registers has a * corresponding entry on the isrvec stack. */ i = 1; isrptr = &vlapic->apic_page->isr0; for (vector = 0; vector < 256; vector++) { idx = (vector / 32) * 4; if (isrptr[idx] & (1 << (vector % 32))) { if (i > vlapic->isrvec_stk_top || vlapic->isrvec_stk[i] != vector) { dump_isrvec_stk(vlapic); panic("ISR and isrvec_stk out of sync"); } i++; } } } #endif if (PRIO(tpr) >= PRIO(isrvec)) ppr = tpr; else ppr = isrvec & 0xf0; vlapic->apic_page->ppr = ppr; VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); } static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt"); static void vlapic_process_eoi(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *isrptr, *tmrptr; int i, idx, bitpos, vector; isrptr = &lapic->isr0; tmrptr = &lapic->tmr0; for (i = 7; i >= 0; i--) { idx = i * 4; bitpos = fls(isrptr[idx]); if (bitpos-- != 0) { if (vlapic->isrvec_stk_top <= 0) { panic("invalid vlapic isrvec_stk_top %d", vlapic->isrvec_stk_top); } isrptr[idx] &= ~(1 << bitpos); vector = i * 32 + bitpos; VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "EOI vector %d", vector); VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); vlapic->isrvec_stk_top--; vlapic_update_ppr(vlapic); if ((tmrptr[idx] & (1 << bitpos)) != 0) { vioapic_process_eoi(vlapic->vm, vlapic->vcpuid, vector); } return; } } VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "Gratuitous EOI"); vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_GRATUITOUS_EOI, 1); } static __inline int vlapic_get_lvt_field(uint32_t lvt, uint32_t mask) { return (lvt & mask); } static __inline int vlapic_periodic_timer(struct vlapic *vlapic) { uint32_t lvt; lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC)); } static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic"); void vlapic_set_error(struct vlapic *vlapic, uint32_t mask) { uint32_t lvt; vlapic->esr_pending |= mask; if (vlapic->esr_firing) return; vlapic->esr_firing = 1; // The error LVT always uses the fixed delivery mode. lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT); if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) { vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1); } vlapic->esr_firing = 0; } static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); static void vlapic_fire_timer(struct vlapic *vlapic) { uint32_t lvt; KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked")); // The timer LVT always uses the fixed delivery mode. lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) { VLAPIC_CTR0(vlapic, "vlapic timer fired"); vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1); } } static VMM_STAT(VLAPIC_INTR_CMC, "corrected machine check interrupts generated by vlapic"); void vlapic_fire_cmci(struct vlapic *vlapic) { uint32_t lvt; lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT); if (vlapic_fire_lvt(vlapic, lvt)) { vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1); } } static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1, "lvts triggered"); int vlapic_trigger_lvt(struct vlapic *vlapic, int vector) { uint32_t lvt; if (vlapic_enabled(vlapic) == false) { /* * When the local APIC is global/hardware disabled, * LINT[1:0] pins are configured as INTR and NMI pins, * respectively. */ switch (vector) { case APIC_LVT_LINT0: vm_inject_extint(vlapic->vm, vlapic->vcpuid); break; case APIC_LVT_LINT1: vm_inject_nmi(vlapic->vm, vlapic->vcpuid); break; default: break; } return (0); } switch (vector) { case APIC_LVT_LINT0: lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT0_LVT); break; case APIC_LVT_LINT1: lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT1_LVT); break; case APIC_LVT_TIMER: lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); lvt |= APIC_LVT_DM_FIXED; break; case APIC_LVT_ERROR: lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT); lvt |= APIC_LVT_DM_FIXED; break; case APIC_LVT_PMC: lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_PERF_LVT); break; case APIC_LVT_THERMAL: lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_THERM_LVT); break; case APIC_LVT_CMCI: lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT); break; default: return (EINVAL); } if (vlapic_fire_lvt(vlapic, lvt)) { vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, LVTS_TRIGGERRED, vector, 1); } return (0); } static void vlapic_callout_handler(void *arg) { struct vlapic *vlapic; struct bintime bt, btnow; sbintime_t rem_sbt; vlapic = arg; VLAPIC_TIMER_LOCK(vlapic); if (callout_pending(&vlapic->callout)) /* callout was reset */ goto done; if (!callout_active(&vlapic->callout)) /* callout was stopped */ goto done; callout_deactivate(&vlapic->callout); vlapic_fire_timer(vlapic); if (vlapic_periodic_timer(vlapic)) { binuptime(&btnow); KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=), ("vlapic callout at %#lx.%#lx, expected at %#lx.#%lx", btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec, vlapic->timer_fire_bt.frac)); /* * Compute the delta between when the timer was supposed to * fire and the present time. */ bt = btnow; bintime_sub(&bt, &vlapic->timer_fire_bt); rem_sbt = bttosbt(vlapic->timer_period_bt); if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) { /* * Adjust the time until the next countdown downward * to account for the lost time. */ rem_sbt -= bttosbt(bt); } else { /* * If the delta is greater than the timer period then * just reset our time base instead of trying to catch * up. */ vlapic->timer_fire_bt = btnow; VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu " "usecs, period is %lu usecs - resetting time base", bttosbt(bt) / SBT_1US, bttosbt(vlapic->timer_period_bt) / SBT_1US); } bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); callout_reset_sbt(&vlapic->callout, rem_sbt, 0, vlapic_callout_handler, vlapic, 0); } done: VLAPIC_TIMER_UNLOCK(vlapic); } void vlapic_icrtmr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; sbintime_t sbt; uint32_t icr_timer; VLAPIC_TIMER_LOCK(vlapic); lapic = vlapic->apic_page; icr_timer = lapic->icr_timer; vlapic->timer_period_bt = vlapic->timer_freq_bt; bintime_mul(&vlapic->timer_period_bt, icr_timer); if (icr_timer != 0) { binuptime(&vlapic->timer_fire_bt); bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt); sbt = bttosbt(vlapic->timer_period_bt); callout_reset_sbt(&vlapic->callout, sbt, 0, vlapic_callout_handler, vlapic, 0); } else callout_stop(&vlapic->callout); VLAPIC_TIMER_UNLOCK(vlapic); } /* * This function populates 'dmask' with the set of vcpus that match the * addressing specified by the (dest, phys, lowprio) tuple. * * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit) * or xAPIC (8-bit) destination field. */ static void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, bool lowprio, bool x2apic_dest) { struct vlapic *vlapic; uint32_t dfr, ldr, ldest, cluster; uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id; cpuset_t amask; int vcpuid; if ((x2apic_dest && dest == 0xffffffff) || (!x2apic_dest && dest == 0xff)) { /* * Broadcast in both logical and physical modes. */ *dmask = vm_active_cpus(vm); return; } if (phys) { /* * Physical mode: destination is APIC ID. */ CPU_ZERO(dmask); vcpuid = vm_apicid2vcpuid(vm, dest); if (vcpuid < VM_MAXCPU) CPU_SET(vcpuid, dmask); } else { /* * In the "Flat Model" the MDA is interpreted as an 8-bit wide * bitmask. This model is only available in the xAPIC mode. */ mda_flat_ldest = dest & 0xff; /* * In the "Cluster Model" the MDA is used to identify a * specific cluster and a set of APICs in that cluster. */ if (x2apic_dest) { mda_cluster_id = dest >> 16; mda_cluster_ldest = dest & 0xffff; } else { mda_cluster_id = (dest >> 4) & 0xf; mda_cluster_ldest = dest & 0xf; } /* * Logical mode: match each APIC that has a bit set - * in it's LDR that matches a bit in the ldest. + * in its LDR that matches a bit in the ldest. */ CPU_ZERO(dmask); amask = vm_active_cpus(vm); while ((vcpuid = CPU_FFS(&amask)) != 0) { vcpuid--; CPU_CLR(vcpuid, &amask); vlapic = vm_lapic(vm, vcpuid); dfr = vlapic->apic_page->dfr; ldr = vlapic->apic_page->ldr; if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT) { ldest = ldr >> 24; mda_ldest = mda_flat_ldest; } else if ((dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER) { if (x2apic(vlapic)) { cluster = ldr >> 16; ldest = ldr & 0xffff; } else { cluster = ldr >> 28; ldest = (ldr >> 24) & 0xf; } if (cluster != mda_cluster_id) continue; mda_ldest = mda_cluster_ldest; } else { /* * Guest has configured a bad logical * model for this vcpu - skip it. */ VLAPIC_CTR1(vlapic, "vlapic has bad logical " "model %x - cannot deliver interrupt", dfr); continue; } if ((mda_ldest & ldest) != 0) { CPU_SET(vcpuid, dmask); if (lowprio) break; } } } } static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu"); static void vlapic_set_tpr(struct vlapic *vlapic, uint8_t val) { struct LAPIC *lapic = vlapic->apic_page; if (lapic->tpr != val) { VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vlapic TPR changed " "from %#x to %#x", lapic->tpr, val); lapic->tpr = val; vlapic_update_ppr(vlapic); } } static uint8_t vlapic_get_tpr(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; return (lapic->tpr); } void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val) { uint8_t tpr; if (val & ~0xf) { vm_inject_gp(vlapic->vm, vlapic->vcpuid); return; } tpr = val << 4; vlapic_set_tpr(vlapic, tpr); } uint64_t vlapic_get_cr8(struct vlapic *vlapic) { uint8_t tpr; tpr = vlapic_get_tpr(vlapic); return (tpr >> 4); } int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) { int i; bool phys; cpuset_t dmask; uint64_t icrval; uint32_t dest, vec, mode; struct vlapic *vlapic2; struct vm_exit *vmexit; struct LAPIC *lapic; lapic = vlapic->apic_page; lapic->icr_lo &= ~APIC_DELSTAT_PEND; icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo; if (x2apic(vlapic)) dest = icrval >> 32; else dest = icrval >> (32 + 24); vec = icrval & APIC_VECTOR_MASK; mode = icrval & APIC_DELMODE_MASK; if (mode == APIC_DELMODE_FIXED && vec < 16) { vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); return (0); } VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec); if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) { switch (icrval & APIC_DEST_MASK) { case APIC_DEST_DESTFLD: phys = ((icrval & APIC_DESTMODE_LOG) == 0); vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, x2apic(vlapic)); break; case APIC_DEST_SELF: CPU_SETOF(vlapic->vcpuid, &dmask); break; case APIC_DEST_ALLISELF: dmask = vm_active_cpus(vlapic->vm); break; case APIC_DEST_ALLESELF: dmask = vm_active_cpus(vlapic->vm); CPU_CLR(vlapic->vcpuid, &dmask); break; default: CPU_ZERO(&dmask); /* satisfy gcc */ break; } while ((i = CPU_FFS(&dmask)) != 0) { i--; CPU_CLR(i, &dmask); if (mode == APIC_DELMODE_FIXED) { lapic_intr_edge(vlapic->vm, i, vec); vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, IPIS_SENT, i, 1); VLAPIC_CTR2(vlapic, "vlapic sending ipi %d " "to vcpuid %d", vec, i); } else { vm_inject_nmi(vlapic->vm, i); VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi " "to vcpuid %d", i); } } return (0); /* handled completely in the kernel */ } if (mode == APIC_DELMODE_INIT) { if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) return (0); if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { vlapic2 = vm_lapic(vlapic->vm, dest); /* move from INIT to waiting-for-SIPI state */ if (vlapic2->boot_state == BS_INIT) { vlapic2->boot_state = BS_SIPI; } return (0); } } if (mode == APIC_DELMODE_STARTUP) { if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { vlapic2 = vm_lapic(vlapic->vm, dest); /* * Ignore SIPIs in any state other than wait-for-SIPI */ if (vlapic2->boot_state != BS_SIPI) return (0); vlapic2->boot_state = BS_RUNNING; *retu = true; vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); vmexit->exitcode = VM_EXITCODE_SPINUP_AP; vmexit->u.spinup_ap.vcpu = dest; vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT; return (0); } } /* * This will cause a return to userland. */ return (1); } void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val) { int vec; KASSERT(x2apic(vlapic), ("SELF_IPI does not exist in xAPIC mode")); vec = val & 0xff; lapic_intr_edge(vlapic->vm, vlapic->vcpuid, vec); vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, IPIS_SENT, vlapic->vcpuid, 1); VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vec); } int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) { struct LAPIC *lapic = vlapic->apic_page; int idx, i, bitpos, vector; uint32_t *irrptr, val; if (vlapic->ops.pending_intr) return ((*vlapic->ops.pending_intr)(vlapic, vecptr)); irrptr = &lapic->irr0; for (i = 7; i >= 0; i--) { idx = i * 4; val = atomic_load_acq_int(&irrptr[idx]); bitpos = fls(val); if (bitpos != 0) { vector = i * 32 + (bitpos - 1); if (PRIO(vector) > PRIO(lapic->ppr)) { VLAPIC_CTR1(vlapic, "pending intr %d", vector); if (vecptr != NULL) *vecptr = vector; return (1); } else break; } } return (0); } void vlapic_intr_accepted(struct vlapic *vlapic, int vector) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *irrptr, *isrptr; int idx, stk_top; if (vlapic->ops.intr_accepted) return ((*vlapic->ops.intr_accepted)(vlapic, vector)); /* * clear the ready bit for vector being accepted in irr * and set the vector as in service in isr. */ idx = (vector / 32) * 4; irrptr = &lapic->irr0; atomic_clear_int(&irrptr[idx], 1 << (vector % 32)); VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted"); isrptr = &lapic->isr0; isrptr[idx] |= 1 << (vector % 32); VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted"); /* * Update the PPR */ vlapic->isrvec_stk_top++; stk_top = vlapic->isrvec_stk_top; if (stk_top >= ISRVEC_STK_SIZE) panic("isrvec_stk_top overflow %d", stk_top); vlapic->isrvec_stk[stk_top] = vector; vlapic_update_ppr(vlapic); } void vlapic_svr_write_handler(struct vlapic *vlapic) { struct LAPIC *lapic; uint32_t old, new, changed; lapic = vlapic->apic_page; new = lapic->svr; old = vlapic->svr_last; vlapic->svr_last = new; changed = old ^ new; if ((changed & APIC_SVR_ENABLE) != 0) { if ((new & APIC_SVR_ENABLE) == 0) { /* * The apic is now disabled so stop the apic timer * and mask all the LVT entries. */ VLAPIC_CTR0(vlapic, "vlapic is software-disabled"); VLAPIC_TIMER_LOCK(vlapic); callout_stop(&vlapic->callout); VLAPIC_TIMER_UNLOCK(vlapic); vlapic_mask_lvts(vlapic); } else { /* * The apic is now enabled so restart the apic timer * if it is configured in periodic mode. */ VLAPIC_CTR0(vlapic, "vlapic is software-enabled"); if (vlapic_periodic_timer(vlapic)) vlapic_icrtmr_write_handler(vlapic); } } } int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, uint64_t *data, bool *retu) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *reg; int i; /* Ignore MMIO accesses in x2APIC mode */ if (x2apic(vlapic) && mmio_access) { VLAPIC_CTR1(vlapic, "MMIO read from offset %#lx in x2APIC mode", offset); *data = 0; goto done; } if (!x2apic(vlapic) && !mmio_access) { /* * XXX Generate GP fault for MSR accesses in xAPIC mode */ VLAPIC_CTR1(vlapic, "x2APIC MSR read from offset %#lx in " "xAPIC mode", offset); *data = 0; goto done; } if (offset > sizeof(*lapic)) { *data = 0; goto done; } offset &= ~3; switch(offset) { case APIC_OFFSET_ID: *data = lapic->id; break; case APIC_OFFSET_VER: *data = lapic->version; break; case APIC_OFFSET_TPR: *data = vlapic_get_tpr(vlapic); break; case APIC_OFFSET_APR: *data = lapic->apr; break; case APIC_OFFSET_PPR: *data = lapic->ppr; break; case APIC_OFFSET_EOI: *data = lapic->eoi; break; case APIC_OFFSET_LDR: *data = lapic->ldr; break; case APIC_OFFSET_DFR: *data = lapic->dfr; break; case APIC_OFFSET_SVR: *data = lapic->svr; break; case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: i = (offset - APIC_OFFSET_ISR0) >> 2; reg = &lapic->isr0; *data = *(reg + i); break; case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: i = (offset - APIC_OFFSET_TMR0) >> 2; reg = &lapic->tmr0; *data = *(reg + i); break; case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: i = (offset - APIC_OFFSET_IRR0) >> 2; reg = &lapic->irr0; *data = atomic_load_acq_int(reg + i); break; case APIC_OFFSET_ESR: *data = lapic->esr; break; case APIC_OFFSET_ICR_LOW: *data = lapic->icr_lo; if (x2apic(vlapic)) *data |= (uint64_t)lapic->icr_hi << 32; break; case APIC_OFFSET_ICR_HI: *data = lapic->icr_hi; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: *data = vlapic_get_lvt(vlapic, offset); #ifdef INVARIANTS reg = vlapic_get_lvtptr(vlapic, offset); KASSERT(*data == *reg, ("inconsistent lvt value at " "offset %#lx: %#lx/%#x", offset, *data, *reg)); #endif break; case APIC_OFFSET_TIMER_ICR: *data = lapic->icr_timer; break; case APIC_OFFSET_TIMER_CCR: *data = vlapic_get_ccr(vlapic); break; case APIC_OFFSET_TIMER_DCR: *data = lapic->dcr_timer; break; case APIC_OFFSET_SELF_IPI: /* * XXX generate a GP fault if vlapic is in x2apic mode */ *data = 0; break; case APIC_OFFSET_RRR: default: *data = 0; break; } done: VLAPIC_CTR2(vlapic, "vlapic read offset %#x, data %#lx", offset, *data); return 0; } int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, uint64_t data, bool *retu) { struct LAPIC *lapic = vlapic->apic_page; uint32_t *regptr; int retval; KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE, ("vlapic_write: invalid offset %#lx", offset)); VLAPIC_CTR2(vlapic, "vlapic write offset %#lx, data %#lx", offset, data); if (offset > sizeof(*lapic)) return (0); /* Ignore MMIO accesses in x2APIC mode */ if (x2apic(vlapic) && mmio_access) { VLAPIC_CTR2(vlapic, "MMIO write of %#lx to offset %#lx " "in x2APIC mode", data, offset); return (0); } /* * XXX Generate GP fault for MSR accesses in xAPIC mode */ if (!x2apic(vlapic) && !mmio_access) { VLAPIC_CTR2(vlapic, "x2APIC MSR write of %#lx to offset %#lx " "in xAPIC mode", data, offset); return (0); } retval = 0; switch(offset) { case APIC_OFFSET_ID: lapic->id = data; vlapic_id_write_handler(vlapic); break; case APIC_OFFSET_TPR: vlapic_set_tpr(vlapic, data & 0xff); break; case APIC_OFFSET_EOI: vlapic_process_eoi(vlapic); break; case APIC_OFFSET_LDR: lapic->ldr = data; vlapic_ldr_write_handler(vlapic); break; case APIC_OFFSET_DFR: lapic->dfr = data; vlapic_dfr_write_handler(vlapic); break; case APIC_OFFSET_SVR: lapic->svr = data; vlapic_svr_write_handler(vlapic); break; case APIC_OFFSET_ICR_LOW: lapic->icr_lo = data; if (x2apic(vlapic)) lapic->icr_hi = data >> 32; retval = vlapic_icrlo_write_handler(vlapic, retu); break; case APIC_OFFSET_ICR_HI: lapic->icr_hi = data; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: regptr = vlapic_get_lvtptr(vlapic, offset); *regptr = data; vlapic_lvt_write_handler(vlapic, offset); break; case APIC_OFFSET_TIMER_ICR: lapic->icr_timer = data; vlapic_icrtmr_write_handler(vlapic); break; case APIC_OFFSET_TIMER_DCR: lapic->dcr_timer = data; vlapic_dcr_write_handler(vlapic); break; case APIC_OFFSET_ESR: vlapic_esr_write_handler(vlapic); break; case APIC_OFFSET_SELF_IPI: if (x2apic(vlapic)) vlapic_self_ipi_handler(vlapic, data); break; case APIC_OFFSET_VER: case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: case APIC_OFFSET_TIMER_CCR: default: // Read only. break; } return (retval); } static void vlapic_reset(struct vlapic *vlapic) { struct LAPIC *lapic; lapic = vlapic->apic_page; bzero(lapic, sizeof(struct LAPIC)); lapic->id = vlapic_get_id(vlapic); lapic->version = VLAPIC_VERSION; lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT); lapic->dfr = 0xffffffff; lapic->svr = APIC_SVR_VECTOR; vlapic_mask_lvts(vlapic); vlapic_reset_tmr(vlapic); lapic->dcr_timer = 0; vlapic_dcr_write_handler(vlapic); if (vlapic->vcpuid == 0) vlapic->boot_state = BS_RUNNING; /* BSP */ else vlapic->boot_state = BS_INIT; /* AP */ vlapic->svr_last = lapic->svr; } void vlapic_init(struct vlapic *vlapic) { KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized")); KASSERT(vlapic->vcpuid >= 0 && vlapic->vcpuid < VM_MAXCPU, ("vlapic_init: vcpuid is not initialized")); KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not " "initialized")); /* * If the vlapic is configured in x2apic mode then it will be * accessed in the critical section via the MSR emulation code. * * Therefore the timer mutex must be a spinlock because blockable * mutexes cannot be acquired in a critical section. */ mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN); callout_init(&vlapic->callout, 1); vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED; if (vlapic->vcpuid == 0) vlapic->msr_apicbase |= APICBASE_BSP; vlapic_reset(vlapic); } void vlapic_cleanup(struct vlapic *vlapic) { callout_drain(&vlapic->callout); } uint64_t vlapic_get_apicbase(struct vlapic *vlapic) { return (vlapic->msr_apicbase); } int vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new) { if (vlapic->msr_apicbase != new) { VLAPIC_CTR2(vlapic, "Changing APIC_BASE MSR from %#lx to %#lx " "not supported", vlapic->msr_apicbase, new); return (-1); } return (0); } void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) { struct vlapic *vlapic; struct LAPIC *lapic; vlapic = vm_lapic(vm, vcpuid); if (state == X2APIC_DISABLED) vlapic->msr_apicbase &= ~APICBASE_X2APIC; else vlapic->msr_apicbase |= APICBASE_X2APIC; /* * Reset the local APIC registers whose values are mode-dependent. * * XXX this works because the APIC mode can be changed only at vcpu * initialization time. */ lapic = vlapic->apic_page; lapic->id = vlapic_get_id(vlapic); if (x2apic(vlapic)) { lapic->ldr = x2apic_ldr(vlapic); lapic->dfr = 0; } else { lapic->ldr = 0; lapic->dfr = 0xffffffff; } if (state == X2APIC_ENABLED) { if (vlapic->ops.enable_x2apic_mode) (*vlapic->ops.enable_x2apic_mode)(vlapic); } } void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, int delmode, int vec) { bool lowprio; int vcpuid; cpuset_t dmask; if (delmode != IOART_DELFIXED && delmode != IOART_DELLOPRI && delmode != IOART_DELEXINT) { VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode); return; } lowprio = (delmode == IOART_DELLOPRI); /* * We don't provide any virtual interrupt redirection hardware so * all interrupts originating from the ioapic or MSI specify the * 'dest' in the legacy xAPIC format. */ vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false); while ((vcpuid = CPU_FFS(&dmask)) != 0) { vcpuid--; CPU_CLR(vcpuid, &dmask); if (delmode == IOART_DELEXINT) { vm_inject_extint(vm, vcpuid); } else { lapic_set_intr(vm, vcpuid, vec, level); } } } void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum) { /* * Post an interrupt to the vcpu currently running on 'hostcpu'. * * This is done by leveraging features like Posted Interrupts (Intel) * Doorbell MSR (AMD AVIC) that avoid a VM exit. * * If neither of these features are available then fallback to * sending an IPI to 'hostcpu'. */ if (vlapic->ops.post_intr) (*vlapic->ops.post_intr)(vlapic, hostcpu); else ipi_cpu(hostcpu, ipinum); } bool vlapic_enabled(struct vlapic *vlapic) { struct LAPIC *lapic = vlapic->apic_page; if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 && (lapic->svr & APIC_SVR_ENABLE) != 0) return (true); else return (false); } static void vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level) { struct LAPIC *lapic; uint32_t *tmrptr, mask; int idx; lapic = vlapic->apic_page; tmrptr = &lapic->tmr0; idx = (vector / 32) * 4; mask = 1 << (vector % 32); if (level) tmrptr[idx] |= mask; else tmrptr[idx] &= ~mask; if (vlapic->ops.set_tmr != NULL) (*vlapic->ops.set_tmr)(vlapic, vector, level); } void vlapic_reset_tmr(struct vlapic *vlapic) { int vector; VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered"); for (vector = 0; vector <= 255; vector++) vlapic_set_tmr(vlapic, vector, false); } void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, int delmode, int vector) { cpuset_t dmask; bool lowprio; KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); /* * A level trigger is valid only for fixed and lowprio delivery modes. */ if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) { VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for " "delivery-mode %d", delmode); return; } lowprio = (delmode == APIC_DELMODE_LOWPRIO); vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false); if (!CPU_ISSET(vlapic->vcpuid, &dmask)) return; VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector); vlapic_set_tmr(vlapic, vector, true); } Index: head/sys/arm/include/profile.h =================================================================== --- head/sys/arm/include/profile.h (revision 308456) +++ head/sys/arm/include/profile.h (revision 308457) @@ -1,130 +1,130 @@ /*- * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)profile.h 8.1 (Berkeley) 6/11/93 * $FreeBSD$ */ #ifndef _MACHINE_PROFILE_H_ #define _MACHINE_PROFILE_H_ /* * Config generates something to tell the compiler to align functions on 32 * byte boundaries. A strict alignment is good for keeping the tables small. */ #define FUNCTION_ALIGNMENT 16 #define _MCOUNT_DECL void mcount typedef u_long fptrdiff_t; /* * Cannot implement mcount in C as GCC will trash the ip register when it * pushes a trapframe. Pity we cannot insert assembly before the function * prologue. */ #ifndef PLTSYM #define PLTSYM #endif #define MCOUNT \ __asm__(".text"); \ __asm__(".align 2"); \ __asm__(".type __mcount ,%function"); \ __asm__(".global __mcount"); \ __asm__("__mcount:"); \ /* \ * Preserve registers that are trashed during mcount \ */ \ __asm__("stmfd sp!, {r0-r3, ip, lr}"); \ /* \ * find the return address for mcount, \ * and the return address for mcount's caller. \ * \ * frompcindex = pc pushed by call into self. \ */ \ __asm__("mov r0, ip"); \ /* \ * selfpc = pc pushed by mcount call \ */ \ __asm__("mov r1, lr"); \ /* \ * Call the real mcount code \ */ \ __asm__("bl mcount"); \ /* \ * Restore registers that were trashed during mcount \ */ \ __asm__("ldmfd sp!, {r0-r3, lr}"); \ /* \ * Return to the caller. Loading lr and pc in one instruction \ - * is deprecated on ARMv7 so we need this on it's own. \ + * is deprecated on ARMv7 so we need this on its own. \ */ \ __asm__("ldmfd sp!, {pc}"); void bintr(void); void btrap(void); void eintr(void); void user(void); #define MCOUNT_FROMPC_USER(pc) \ ((pc < (uintfptr_t)VM_MAXUSER_ADDRESS) ? (uintfptr_t)user : pc) #define MCOUNT_FROMPC_INTR(pc) \ ((pc >= (uintfptr_t)btrap && pc < (uintfptr_t)eintr) ? \ ((pc >= (uintfptr_t)bintr) ? (uintfptr_t)bintr : \ (uintfptr_t)btrap) : ~0U) #ifdef _KERNEL #define MCOUNT_DECL(s) register_t s; #include #include /* * splhigh() and splx() are heavyweight, and call mcount(). Therefore * we disabled interrupts (IRQ, but not FIQ) directly on the CPU. * * We're lucky that the CPSR and 's' both happen to be 'int's. */ #define MCOUNT_ENTER(s) {s = intr_disable(); } /* kill IRQ */ #define MCOUNT_EXIT(s) {intr_restore(s); } /* restore old value */ void mcount(uintfptr_t frompc, uintfptr_t selfpc); #else typedef u_int uintfptr_t; #endif /* _KERNEL */ #endif /* !_MACHINE_PROFILE_H_ */ Index: head/sys/arm/ti/omap4/omap4_prcm_clks.c =================================================================== --- head/sys/arm/ti/omap4/omap4_prcm_clks.c (revision 308456) +++ head/sys/arm/ti/omap4/omap4_prcm_clks.c (revision 308457) @@ -1,1494 +1,1494 @@ /*- * Copyright (c) 2011 * Ben Gray . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY BEN GRAY ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL BEN GRAY BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * This file defines the clock configuration for the OMAP4xxx series of * devices. * * How This is Suppose to Work * =========================== * - There is a top level omap_prcm module that defines all OMAP SoC drivers * should use to enable/disable the system clocks regardless of the version * of OMAP device they are running on. This top level PRCM module is just * a thin shim to chip specific functions that perform the donkey work of * configuring the clock - this file is the 'donkey' for OMAP44xx devices. * * - The key bit in this file is the omap_clk_devmap array, it's * used by the omap_prcm driver to determine what clocks are valid and which * functions to call to manipulate them. * * - In essence you just need to define some callbacks for each of the * clocks and then you're done. * * - The other thing that is worth noting is that when the omap_prcm device * is registered you typically pass in some memory ranges which are the * SYS_MEMORY resources. These resources are in turn allocated using * bus_allocate_resources(...) and the resource handles are passed to all * individual clock callback handlers. * * * * OMAP4 devices are different from the previous OMAP3 devices in that there * is no longer a separate functional and interface clock for each module, * instead there is typically an interface clock that spans many modules. */ #define FREQ_96MHZ 96000000 #define FREQ_64MHZ 64000000 #define FREQ_48MHZ 48000000 #define FREQ_32KHZ 32000 #define PRM_INSTANCE 1 #define CM1_INSTANCE 2 #define CM2_INSTANCE 3 /** * Address offsets from the PRM memory region to the top level clock control * registers. */ #define CKGEN_PRM_OFFSET 0x00000100UL #define MPU_PRM_OFFSET 0x00000300UL #define DSP_PRM_OFFSET 0x00000400UL #define ABE_PRM_OFFSET 0x00000500UL #define ALWAYS_ON_PRM_OFFSET 0x00000600UL #define CORE_PRM_OFFSET 0x00000700UL #define IVAHD_PRM_OFFSET 0x00000F00UL #define CAM_PRM_OFFSET 0x00001000UL #define DSS_PRM_OFFSET 0x00001100UL #define SGX_PRM_OFFSET 0x00001200UL #define L3INIT_PRM_OFFSET 0x00001300UL #define L4PER_PRM_OFFSET 0x00001400UL #define WKUP_PRM_OFFSET 0x00001700UL #define WKUP_CM_OFFSET 0x00001800UL #define EMU_PRM_OFFSET 0x00001900UL #define EMU_CM_OFFSET 0x00001A00UL #define DEVICE_PRM_OFFSET 0x00001B00UL #define INSTR_PRM_OFFSET 0x00001F00UL #define CM_ABE_DSS_SYS_CLKSEL_OFFSET (CKGEN_PRM_OFFSET + 0x0000UL) #define CM_L4_WKUP_CLKSELL_OFFSET (CKGEN_PRM_OFFSET + 0x0008UL) #define CM_ABE_PLL_REF_CLKSEL_OFFSET (CKGEN_PRM_OFFSET + 0x000CUL) #define CM_SYS_CLKSEL_OFFSET (CKGEN_PRM_OFFSET + 0x0010UL) /** * Address offsets from the CM1 memory region to the top level clock control * registers. */ #define CKGEN_CM1_OFFSET 0x00000100UL #define MPU_CM1_OFFSET 0x00000300UL #define DSP_CM1_OFFSET 0x00000400UL #define ABE_CM1_OFFSET 0x00000500UL #define RESTORE_CM1_OFFSET 0x00000E00UL #define INSTR_CM1_OFFSET 0x00000F00UL #define CM_CLKSEL_DPLL_MPU (CKGEN_CM1_OFFSET + 0x006CUL) /** * Address offsets from the CM2 memory region to the top level clock control * registers. */ #define INTRCONN_SOCKET_CM2_OFFSET 0x00000000UL #define CKGEN_CM2_OFFSET 0x00000100UL #define ALWAYS_ON_CM2_OFFSET 0x00000600UL #define CORE_CM2_OFFSET 0x00000700UL #define IVAHD_CM2_OFFSET 0x00000F00UL #define CAM_CM2_OFFSET 0x00001000UL #define DSS_CM2_OFFSET 0x00001100UL #define SGX_CM2_OFFSET 0x00001200UL #define L3INIT_CM2_OFFSET 0x00001300UL #define L4PER_CM2_OFFSET 0x00001400UL #define RESTORE_CM2_OFFSET 0x00001E00UL #define INSTR_CM2_OFFSET 0x00001F00UL #define CLKCTRL_MODULEMODE_MASK 0x00000003UL #define CLKCTRL_MODULEMODE_DISABLE 0x00000000UL #define CLKCTRL_MODULEMODE_AUTO 0x00000001UL #define CLKCTRL_MODULEMODE_ENABLE 0x00000001UL #define CLKCTRL_IDLEST_MASK 0x00030000UL #define CLKCTRL_IDLEST_ENABLED 0x00000000UL #define CLKCTRL_IDLEST_WAKING 0x00010000UL #define CLKCTRL_IDLEST_IDLE 0x00020000UL #define CLKCTRL_IDLEST_DISABLED 0x00030000UL static struct ofw_compat_data compat_data[] = { {"ti,omap4-cm1", (uintptr_t)CM1_INSTANCE}, {"ti,omap4-cm2", (uintptr_t)CM2_INSTANCE}, {"ti,omap4-prm", (uintptr_t)PRM_INSTANCE}, {NULL, (uintptr_t)0}, }; struct omap4_prcm_softc { struct resource *sc_res; int sc_rid; int sc_instance; }; static int omap4_clk_generic_activate(struct ti_clock_dev *clkdev); static int omap4_clk_generic_deactivate(struct ti_clock_dev *clkdev); static int omap4_clk_generic_accessible(struct ti_clock_dev *clkdev); static int omap4_clk_generic_set_source(struct ti_clock_dev *clkdev, clk_src_t clksrc); static int omap4_clk_generic_get_source_freq(struct ti_clock_dev *clkdev, unsigned int *freq); static int omap4_clk_gptimer_set_source(struct ti_clock_dev *clkdev, clk_src_t clksrc); static int omap4_clk_gptimer_get_source_freq(struct ti_clock_dev *clkdev, unsigned int *freq); static int omap4_clk_hsmmc_set_source(struct ti_clock_dev *clkdev, clk_src_t clksrc); static int omap4_clk_hsmmc_get_source_freq(struct ti_clock_dev *clkdev, unsigned int *freq); static int omap4_clk_hsusbhost_set_source(struct ti_clock_dev *clkdev, clk_src_t clksrc); static int omap4_clk_hsusbhost_activate(struct ti_clock_dev *clkdev); static int omap4_clk_hsusbhost_deactivate(struct ti_clock_dev *clkdev); static int omap4_clk_hsusbhost_accessible(struct ti_clock_dev *clkdev); static int omap4_clk_get_sysclk_freq(struct ti_clock_dev *clkdev, unsigned int *freq); static int omap4_clk_get_arm_fclk_freq(struct ti_clock_dev *clkdev, unsigned int *freq); /** * omap_clk_devmap - Array of clock devices available on OMAP4xxx devices * * This map only defines which clocks are valid and the callback functions * for clock activate, deactivate, etc. It is used by the top level omap_prcm * driver. * * The actual details of the clocks (config registers, bit fields, sources, * etc) are in the private g_omap3_clk_details array below. * */ #define OMAP4_GENERIC_CLOCK_DEV(i) \ { .id = (i), \ .clk_activate = omap4_clk_generic_activate, \ .clk_deactivate = omap4_clk_generic_deactivate, \ .clk_set_source = omap4_clk_generic_set_source, \ .clk_accessible = omap4_clk_generic_accessible, \ .clk_get_source_freq = omap4_clk_generic_get_source_freq, \ .clk_set_source_freq = NULL \ } #define OMAP4_GPTIMER_CLOCK_DEV(i) \ { .id = (i), \ .clk_activate = omap4_clk_generic_activate, \ .clk_deactivate = omap4_clk_generic_deactivate, \ .clk_set_source = omap4_clk_gptimer_set_source, \ .clk_accessible = omap4_clk_generic_accessible, \ .clk_get_source_freq = omap4_clk_gptimer_get_source_freq, \ .clk_set_source_freq = NULL \ } #define OMAP4_HSMMC_CLOCK_DEV(i) \ { .id = (i), \ .clk_activate = omap4_clk_generic_activate, \ .clk_deactivate = omap4_clk_generic_deactivate, \ .clk_set_source = omap4_clk_hsmmc_set_source, \ .clk_accessible = omap4_clk_generic_accessible, \ .clk_get_source_freq = omap4_clk_hsmmc_get_source_freq, \ .clk_set_source_freq = NULL \ } #define OMAP4_HSUSBHOST_CLOCK_DEV(i) \ { .id = (i), \ .clk_activate = omap4_clk_hsusbhost_activate, \ .clk_deactivate = omap4_clk_hsusbhost_deactivate, \ .clk_set_source = omap4_clk_hsusbhost_set_source, \ .clk_accessible = omap4_clk_hsusbhost_accessible, \ .clk_get_source_freq = NULL, \ .clk_set_source_freq = NULL \ } struct ti_clock_dev ti_omap4_clk_devmap[] = { /* System clocks */ { .id = SYS_CLK, .clk_activate = NULL, .clk_deactivate = NULL, .clk_set_source = NULL, .clk_accessible = NULL, .clk_get_source_freq = omap4_clk_get_sysclk_freq, .clk_set_source_freq = NULL, }, /* MPU (ARM) core clocks */ { .id = MPU_CLK, .clk_activate = NULL, .clk_deactivate = NULL, .clk_set_source = NULL, .clk_accessible = NULL, .clk_get_source_freq = omap4_clk_get_arm_fclk_freq, .clk_set_source_freq = NULL, }, /* UART device clocks */ OMAP4_GENERIC_CLOCK_DEV(UART1_CLK), OMAP4_GENERIC_CLOCK_DEV(UART2_CLK), OMAP4_GENERIC_CLOCK_DEV(UART3_CLK), OMAP4_GENERIC_CLOCK_DEV(UART4_CLK), /* Timer device source clocks */ OMAP4_GPTIMER_CLOCK_DEV(TIMER1_CLK), OMAP4_GPTIMER_CLOCK_DEV(TIMER2_CLK), OMAP4_GPTIMER_CLOCK_DEV(TIMER3_CLK), OMAP4_GPTIMER_CLOCK_DEV(TIMER4_CLK), OMAP4_GPTIMER_CLOCK_DEV(TIMER5_CLK), OMAP4_GPTIMER_CLOCK_DEV(TIMER6_CLK), OMAP4_GPTIMER_CLOCK_DEV(TIMER7_CLK), OMAP4_GPTIMER_CLOCK_DEV(TIMER8_CLK), OMAP4_GPTIMER_CLOCK_DEV(TIMER9_CLK), OMAP4_GPTIMER_CLOCK_DEV(TIMER10_CLK), OMAP4_GPTIMER_CLOCK_DEV(TIMER11_CLK), /* MMC device clocks (MMC1 and MMC2 can have different input clocks) */ OMAP4_HSMMC_CLOCK_DEV(MMC1_CLK), OMAP4_HSMMC_CLOCK_DEV(MMC2_CLK), OMAP4_GENERIC_CLOCK_DEV(MMC3_CLK), OMAP4_GENERIC_CLOCK_DEV(MMC4_CLK), OMAP4_GENERIC_CLOCK_DEV(MMC5_CLK), /* USB HS (high speed TLL, EHCI and OHCI) */ OMAP4_HSUSBHOST_CLOCK_DEV(USBTLL_CLK), OMAP4_HSUSBHOST_CLOCK_DEV(USBHSHOST_CLK), OMAP4_HSUSBHOST_CLOCK_DEV(USBFSHOST_CLK), OMAP4_HSUSBHOST_CLOCK_DEV(USBP1_PHY_CLK), OMAP4_HSUSBHOST_CLOCK_DEV(USBP2_PHY_CLK), OMAP4_HSUSBHOST_CLOCK_DEV(USBP1_UTMI_CLK), OMAP4_HSUSBHOST_CLOCK_DEV(USBP2_UTMI_CLK), OMAP4_HSUSBHOST_CLOCK_DEV(USBP1_HSIC_CLK), OMAP4_HSUSBHOST_CLOCK_DEV(USBP2_HSIC_CLK), /* GPIO */ OMAP4_GENERIC_CLOCK_DEV(GPIO1_CLK), OMAP4_GENERIC_CLOCK_DEV(GPIO2_CLK), OMAP4_GENERIC_CLOCK_DEV(GPIO3_CLK), OMAP4_GENERIC_CLOCK_DEV(GPIO4_CLK), OMAP4_GENERIC_CLOCK_DEV(GPIO5_CLK), OMAP4_GENERIC_CLOCK_DEV(GPIO6_CLK), /* sDMA */ OMAP4_GENERIC_CLOCK_DEV(SDMA_CLK), /* I2C */ OMAP4_GENERIC_CLOCK_DEV(I2C1_CLK), OMAP4_GENERIC_CLOCK_DEV(I2C2_CLK), OMAP4_GENERIC_CLOCK_DEV(I2C3_CLK), OMAP4_GENERIC_CLOCK_DEV(I2C4_CLK), { INVALID_CLK_IDENT, NULL, NULL, NULL, NULL } }; /** * omap4_clk_details - Stores details for all the different clocks supported * * Whenever an operation on a clock is being performed (activated, deactivated, * etc) this array is looked up to find the correct register and bit(s) we * should be modifying. * */ struct omap4_clk_details { clk_ident_t id; uint32_t instance; uint32_t clksel_reg; int32_t src_freq; uint32_t enable_mode; }; #define OMAP4_GENERIC_CLOCK_DETAILS(i, f, di, r, e) \ { .id = (i), \ .instance = (di), \ .clksel_reg = (r), \ .src_freq = (f), \ .enable_mode = (e), \ } static struct omap4_clk_details g_omap4_clk_details[] = { /* UART */ OMAP4_GENERIC_CLOCK_DETAILS(UART1_CLK, FREQ_48MHZ, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x0140), CLKCTRL_MODULEMODE_ENABLE), OMAP4_GENERIC_CLOCK_DETAILS(UART2_CLK, FREQ_48MHZ, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x0148), CLKCTRL_MODULEMODE_ENABLE), OMAP4_GENERIC_CLOCK_DETAILS(UART3_CLK, FREQ_48MHZ, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x0150), CLKCTRL_MODULEMODE_ENABLE), OMAP4_GENERIC_CLOCK_DETAILS(UART4_CLK, FREQ_48MHZ, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x0158), CLKCTRL_MODULEMODE_ENABLE), /* General purpose timers */ OMAP4_GENERIC_CLOCK_DETAILS(TIMER1_CLK, -1, PRM_INSTANCE, (WKUP_CM_OFFSET + 0x040), CLKCTRL_MODULEMODE_ENABLE), OMAP4_GENERIC_CLOCK_DETAILS(TIMER2_CLK, -1, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x038), CLKCTRL_MODULEMODE_ENABLE), OMAP4_GENERIC_CLOCK_DETAILS(TIMER3_CLK, -1, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x040), CLKCTRL_MODULEMODE_ENABLE), OMAP4_GENERIC_CLOCK_DETAILS(TIMER4_CLK, -1, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x048), CLKCTRL_MODULEMODE_ENABLE), OMAP4_GENERIC_CLOCK_DETAILS(TIMER5_CLK, -1, CM1_INSTANCE, (ABE_CM1_OFFSET + 0x068), CLKCTRL_MODULEMODE_ENABLE), OMAP4_GENERIC_CLOCK_DETAILS(TIMER6_CLK, -1, CM1_INSTANCE, (ABE_CM1_OFFSET + 0x070), CLKCTRL_MODULEMODE_ENABLE), OMAP4_GENERIC_CLOCK_DETAILS(TIMER7_CLK, -1, CM1_INSTANCE, (ABE_CM1_OFFSET + 0x078), CLKCTRL_MODULEMODE_ENABLE), OMAP4_GENERIC_CLOCK_DETAILS(TIMER8_CLK, -1, CM1_INSTANCE, (ABE_CM1_OFFSET + 0x080), CLKCTRL_MODULEMODE_ENABLE), OMAP4_GENERIC_CLOCK_DETAILS(TIMER9_CLK, -1, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x050), CLKCTRL_MODULEMODE_ENABLE), OMAP4_GENERIC_CLOCK_DETAILS(TIMER10_CLK, -1, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x028), CLKCTRL_MODULEMODE_ENABLE), OMAP4_GENERIC_CLOCK_DETAILS(TIMER11_CLK, -1, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x030), CLKCTRL_MODULEMODE_ENABLE), /* HSMMC (MMC1 and MMC2 can have different input clocks) */ OMAP4_GENERIC_CLOCK_DETAILS(MMC1_CLK, -1, CM2_INSTANCE, (L3INIT_CM2_OFFSET + 0x028), /*CLKCTRL_MODULEMODE_ENABLE*/2), OMAP4_GENERIC_CLOCK_DETAILS(MMC2_CLK, -1, CM2_INSTANCE, (L3INIT_CM2_OFFSET + 0x030), /*CLKCTRL_MODULEMODE_ENABLE*/2), OMAP4_GENERIC_CLOCK_DETAILS(MMC3_CLK, FREQ_48MHZ, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x120), /*CLKCTRL_MODULEMODE_ENABLE*/2), OMAP4_GENERIC_CLOCK_DETAILS(MMC4_CLK, FREQ_48MHZ, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x128), /*CLKCTRL_MODULEMODE_ENABLE*/2), OMAP4_GENERIC_CLOCK_DETAILS(MMC5_CLK, FREQ_48MHZ, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x160), /*CLKCTRL_MODULEMODE_ENABLE*/1), /* GPIO modules */ OMAP4_GENERIC_CLOCK_DETAILS(GPIO1_CLK, -1, PRM_INSTANCE, (WKUP_CM_OFFSET + 0x038), CLKCTRL_MODULEMODE_AUTO), OMAP4_GENERIC_CLOCK_DETAILS(GPIO2_CLK, -1, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x060), CLKCTRL_MODULEMODE_AUTO), OMAP4_GENERIC_CLOCK_DETAILS(GPIO3_CLK, -1, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x068), CLKCTRL_MODULEMODE_AUTO), OMAP4_GENERIC_CLOCK_DETAILS(GPIO4_CLK, -1, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x070), CLKCTRL_MODULEMODE_AUTO), OMAP4_GENERIC_CLOCK_DETAILS(GPIO5_CLK, -1, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x078), CLKCTRL_MODULEMODE_AUTO), OMAP4_GENERIC_CLOCK_DETAILS(GPIO6_CLK, -1, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x080), CLKCTRL_MODULEMODE_AUTO), /* sDMA block */ OMAP4_GENERIC_CLOCK_DETAILS(SDMA_CLK, -1, CM2_INSTANCE, (CORE_CM2_OFFSET + 0x300), CLKCTRL_MODULEMODE_AUTO), /* I2C modules */ OMAP4_GENERIC_CLOCK_DETAILS(I2C1_CLK, -1, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x0A0), CLKCTRL_MODULEMODE_ENABLE), OMAP4_GENERIC_CLOCK_DETAILS(I2C2_CLK, -1, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x0A8), CLKCTRL_MODULEMODE_ENABLE), OMAP4_GENERIC_CLOCK_DETAILS(I2C3_CLK, -1, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x0B0), CLKCTRL_MODULEMODE_ENABLE), OMAP4_GENERIC_CLOCK_DETAILS(I2C4_CLK, -1, CM2_INSTANCE, (L4PER_CM2_OFFSET + 0x0B8), CLKCTRL_MODULEMODE_ENABLE), { INVALID_CLK_IDENT, 0, 0, 0, 0 }, }; /** * MAX_MODULE_ENABLE_WAIT - the number of loops to wait for the module to come * alive. * */ #define MAX_MODULE_ENABLE_WAIT 100 /** * ARRAY_SIZE - Macro to return the number of elements in a static const array. * */ #define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) /** * omap4_clk_details - writes a 32-bit value to one of the timer registers * @timer: Timer device context * @off: The offset of a register from the timer register address range * @val: The value to write into the register * * * RETURNS: * nothing */ static struct omap4_clk_details* omap4_clk_details(clk_ident_t id) { struct omap4_clk_details *walker; for (walker = g_omap4_clk_details; walker->id != INVALID_CLK_IDENT; walker++) { if (id == walker->id) return (walker); } return NULL; } static struct omap4_prcm_softc * omap4_prcm_get_instance_softc(int module_instance) { int i, maxunit; devclass_t prcm_devclass; device_t dev; struct omap4_prcm_softc *sc; prcm_devclass = devclass_find("omap4_prcm"); maxunit = devclass_get_maxunit(prcm_devclass); for (i = 0; i < maxunit; i++) { dev = devclass_get_device(prcm_devclass, i); sc = device_get_softc(dev); if (sc->sc_instance == module_instance) return (sc); } return (NULL); } /** * omap4_clk_generic_activate - checks if a module is accessible * @module: identifier for the module to check, see omap3_prcm.h for a list * of possible modules. * Example: OMAP3_MODULE_MMC1 * * * * LOCKING: * Inherits the locks from the omap_prcm driver, no internal locking. * * RETURNS: * Returns 0 on success or a positive error code on failure. */ static int omap4_clk_generic_activate(struct ti_clock_dev *clkdev) { struct omap4_prcm_softc *sc; struct omap4_clk_details* clk_details; struct resource* clk_mem_res; uint32_t clksel; unsigned int i; clk_details = omap4_clk_details(clkdev->id); if (clk_details == NULL) return (ENXIO); sc = omap4_prcm_get_instance_softc(clk_details->instance); if (sc == NULL) return ENXIO; clk_mem_res = sc->sc_res; if (clk_mem_res == NULL) return (EINVAL); /* All the 'generic' clocks have a CLKCTRL register which is more or less * generic - the have at least two fielda called MODULEMODE and IDLEST. */ clksel = bus_read_4(clk_mem_res, clk_details->clksel_reg); clksel &= ~CLKCTRL_MODULEMODE_MASK; clksel |= clk_details->enable_mode; bus_write_4(clk_mem_res, clk_details->clksel_reg, clksel); /* Now poll on the IDLEST register to tell us if the module has come up. * TODO: We need to take into account the parent clocks. */ /* Try MAX_MODULE_ENABLE_WAIT number of times to check if enabled */ for (i = 0; i < MAX_MODULE_ENABLE_WAIT; i++) { clksel = bus_read_4(clk_mem_res, clk_details->clksel_reg); if ((clksel & CLKCTRL_IDLEST_MASK) == CLKCTRL_IDLEST_ENABLED) break; DELAY(10); } /* Check the enabled state */ if ((clksel & CLKCTRL_IDLEST_MASK) != CLKCTRL_IDLEST_ENABLED) { printf("Error: failed to enable module with clock %d\n", clkdev->id); printf("Error: 0x%08x => 0x%08x\n", clk_details->clksel_reg, clksel); return (ETIMEDOUT); } return (0); } /** * omap4_clk_generic_deactivate - checks if a module is accessible * @module: identifier for the module to check, see omap3_prcm.h for a list * of possible modules. * Example: OMAP3_MODULE_MMC1 * * * * LOCKING: * Inherits the locks from the omap_prcm driver, no internal locking. * * RETURNS: * Returns 0 on success or a positive error code on failure. */ static int omap4_clk_generic_deactivate(struct ti_clock_dev *clkdev) { struct omap4_prcm_softc *sc; struct omap4_clk_details* clk_details; struct resource* clk_mem_res; uint32_t clksel; clk_details = omap4_clk_details(clkdev->id); if (clk_details == NULL) return (ENXIO); sc = omap4_prcm_get_instance_softc(clk_details->instance); if (sc == NULL) return ENXIO; clk_mem_res = sc->sc_res; if (clk_mem_res == NULL) return (EINVAL); /* All the 'generic' clocks have a CLKCTRL register which is more or less * generic - the have at least two fielda called MODULEMODE and IDLEST. */ clksel = bus_read_4(clk_mem_res, clk_details->clksel_reg); clksel &= ~CLKCTRL_MODULEMODE_MASK; clksel |= CLKCTRL_MODULEMODE_DISABLE; bus_write_4(clk_mem_res, clk_details->clksel_reg, clksel); return (0); } /** * omap4_clk_generic_set_source - checks if a module is accessible * @module: identifier for the module to check, see omap3_prcm.h for a list * of possible modules. * Example: OMAP3_MODULE_MMC1 * * * * LOCKING: * Inherits the locks from the omap_prcm driver, no internal locking. * * RETURNS: * Returns 0 on success or a positive error code on failure. */ static int omap4_clk_generic_set_source(struct ti_clock_dev *clkdev, clk_src_t clksrc) { return (0); } /** * omap4_clk_generic_accessible - checks if a module is accessible * @module: identifier for the module to check, see omap3_prcm.h for a list * of possible modules. * Example: OMAP3_MODULE_MMC1 * * * * LOCKING: * Inherits the locks from the omap_prcm driver, no internal locking. * * RETURNS: * Returns 0 on success or a negative error code on failure. */ static int omap4_clk_generic_accessible(struct ti_clock_dev *clkdev) { struct omap4_prcm_softc *sc; struct omap4_clk_details* clk_details; struct resource* clk_mem_res; uint32_t clksel; clk_details = omap4_clk_details(clkdev->id); if (clk_details == NULL) return (ENXIO); sc = omap4_prcm_get_instance_softc(clk_details->instance); if (sc == NULL) return ENXIO; clk_mem_res = sc->sc_res; if (clk_mem_res == NULL) return (EINVAL); clksel = bus_read_4(clk_mem_res, clk_details->clksel_reg); /* Check the enabled state */ if ((clksel & CLKCTRL_IDLEST_MASK) != CLKCTRL_IDLEST_ENABLED) return (0); return (1); } /** * omap4_clk_generic_get_source_freq - checks if a module is accessible * @module: identifier for the module to check, see omap3_prcm.h for a list * of possible modules. * Example: OMAP3_MODULE_MMC1 * * * * LOCKING: * Inherits the locks from the omap_prcm driver, no internal locking. * * RETURNS: * Returns 0 on success or a negative error code on failure. */ static int omap4_clk_generic_get_source_freq(struct ti_clock_dev *clkdev, unsigned int *freq ) { struct omap4_clk_details* clk_details = omap4_clk_details(clkdev->id); if (clk_details == NULL) return (ENXIO); /* Simply return the stored frequency */ if (freq) *freq = (unsigned int)clk_details->src_freq; return (0); } /** * omap4_clk_gptimer_set_source - checks if a module is accessible * @module: identifier for the module to check, see omap3_prcm.h for a list * of possible modules. * Example: OMAP3_MODULE_MMC1 * * * * LOCKING: * Inherits the locks from the omap_prcm driver, no internal locking. * * RETURNS: * Returns 0 on success or a negative error code on failure. */ static int omap4_clk_gptimer_set_source(struct ti_clock_dev *clkdev, clk_src_t clksrc) { struct omap4_prcm_softc *sc; struct omap4_clk_details* clk_details; struct resource* clk_mem_res; clk_details = omap4_clk_details(clkdev->id); if (clk_details == NULL) return (ENXIO); sc = omap4_prcm_get_instance_softc(clk_details->instance); if (sc == NULL) return ENXIO; clk_mem_res = sc->sc_res; if (clk_mem_res == NULL) return (EINVAL); /* TODO: Implement */ return (0); } /** * omap4_clk_gptimer_get_source_freq - checks if a module is accessible * @module: identifier for the module to check, see omap3_prcm.h for a list * of possible modules. * Example: OMAP3_MODULE_MMC1 * * * * LOCKING: * Inherits the locks from the omap_prcm driver, no internal locking. * * RETURNS: * Returns 0 on success or a negative error code on failure. */ static int omap4_clk_gptimer_get_source_freq(struct ti_clock_dev *clkdev, unsigned int *freq ) { struct omap4_prcm_softc *sc; struct omap4_clk_details* clk_details; struct resource* clk_mem_res; uint32_t clksel; unsigned int src_freq; clk_details = omap4_clk_details(clkdev->id); if (clk_details == NULL) return (ENXIO); sc = omap4_prcm_get_instance_softc(clk_details->instance); if (sc == NULL) return ENXIO; clk_mem_res = sc->sc_res; if (clk_mem_res == NULL) return (EINVAL); /* Need to read the CLKSEL field to determine the clock source */ clksel = bus_read_4(clk_mem_res, clk_details->clksel_reg); if (clksel & (0x1UL << 24)) src_freq = FREQ_32KHZ; else omap4_clk_get_sysclk_freq(NULL, &src_freq); /* Return the frequency */ if (freq) *freq = src_freq; return (0); } /** * omap4_clk_hsmmc_set_source - sets the source clock (freq) * @clkdev: pointer to the clockdev structure (id field will contain clock id) * * The MMC 1 and 2 clocks can be source from either a 64MHz or 96MHz clock. * * LOCKING: * Inherits the locks from the omap_prcm driver, no internal locking. * * RETURNS: * Returns 0 on success or a negative error code on failure. */ static int omap4_clk_hsmmc_set_source(struct ti_clock_dev *clkdev, clk_src_t clksrc) { struct omap4_prcm_softc *sc; struct omap4_clk_details* clk_details; struct resource* clk_mem_res; uint32_t clksel; clk_details = omap4_clk_details(clkdev->id); if (clk_details == NULL) return (ENXIO); sc = omap4_prcm_get_instance_softc(clk_details->instance); if (sc == NULL) return ENXIO; clk_mem_res = sc->sc_res; if (clk_mem_res == NULL) return (EINVAL); /* For MMC modules 3, 4 & 5 you can't change the freq, it's always 48MHz */ if ((clkdev->id == MMC3_CLK) || (clkdev->id == MMC4_CLK) || (clkdev->id == MMC5_CLK)) { if (clksrc != F48MHZ_CLK) return (EINVAL); return 0; } clksel = bus_read_4(clk_mem_res, clk_details->clksel_reg); /* Bit 24 is set if 96MHz clock or cleared for 64MHz clock */ if (clksrc == F64MHZ_CLK) clksel &= ~(0x1UL << 24); else if (clksrc == F96MHZ_CLK) clksel |= (0x1UL << 24); else return (EINVAL); bus_write_4(clk_mem_res, clk_details->clksel_reg, clksel); return (0); } /** * omap4_clk_hsmmc_get_source_freq - checks if a module is accessible * @clkdev: pointer to the clockdev structure (id field will contain clock id) * * * * LOCKING: * Inherits the locks from the omap_prcm driver, no internal locking. * * RETURNS: * Returns 0 on success or a negative error code on failure. */ static int omap4_clk_hsmmc_get_source_freq(struct ti_clock_dev *clkdev, unsigned int *freq ) { struct omap4_prcm_softc *sc; struct omap4_clk_details* clk_details; struct resource* clk_mem_res; uint32_t clksel; unsigned int src_freq; clk_details = omap4_clk_details(clkdev->id); if (clk_details == NULL) return (ENXIO); sc = omap4_prcm_get_instance_softc(clk_details->instance); if (sc == NULL) return ENXIO; clk_mem_res = sc->sc_res; if (clk_mem_res == NULL) return (EINVAL); switch (clkdev->id) { case MMC1_CLK: case MMC2_CLK: /* Need to read the CLKSEL field to determine the clock source */ clksel = bus_read_4(clk_mem_res, clk_details->clksel_reg); if (clksel & (0x1UL << 24)) src_freq = FREQ_96MHZ; else src_freq = FREQ_64MHZ; break; case MMC3_CLK: case MMC4_CLK: case MMC5_CLK: src_freq = FREQ_48MHZ; break; default: return (EINVAL); } /* Return the frequency */ if (freq) *freq = src_freq; return (0); } /** * omap4_clk_get_sysclk_freq - gets the sysclk frequency * @sc: pointer to the clk module/device context * * Read the clocking information from the power-control/boot-strap registers, * and stored in two global variables. * * RETURNS: * nothing, values are saved in global variables */ static int omap4_clk_get_sysclk_freq(struct ti_clock_dev *clkdev, unsigned int *freq) { uint32_t clksel; uint32_t sysclk; struct omap4_prcm_softc *sc; sc = omap4_prcm_get_instance_softc(PRM_INSTANCE); if (sc == NULL) return ENXIO; /* Read the input clock freq from the configuration register (CM_SYS_CLKSEL) */ clksel = bus_read_4(sc->sc_res, CM_SYS_CLKSEL_OFFSET); switch (clksel & 0x7) { case 0x1: /* 12Mhz */ sysclk = 12000000; break; case 0x3: /* 16.8Mhz */ sysclk = 16800000; break; case 0x4: /* 19.2Mhz */ sysclk = 19200000; break; case 0x5: /* 26Mhz */ sysclk = 26000000; break; case 0x7: /* 38.4Mhz */ sysclk = 38400000; break; default: panic("%s: Invalid clock freq", __func__); } /* Return the value */ if (freq) *freq = sysclk; return (0); } /** * omap4_clk_get_arm_fclk_freq - gets the MPU clock frequency * @clkdev: ignored * @freq: pointer which upon return will contain the freq in hz * @mem_res: array of allocated memory resources * * Reads the frequency setting information registers and returns the value * in the freq variable. * * RETURNS: * returns 0 on success, a positive error code on failure. */ static int omap4_clk_get_arm_fclk_freq(struct ti_clock_dev *clkdev, unsigned int *freq) { uint32_t clksel; uint32_t pll_mult, pll_div; uint32_t mpuclk, sysclk; struct omap4_prcm_softc *sc; sc = omap4_prcm_get_instance_softc(CM1_INSTANCE); if (sc == NULL) return ENXIO; /* Read the clksel register which contains the DPLL multiple and divide * values. These are applied to the sysclk. */ clksel = bus_read_4(sc->sc_res, CM_CLKSEL_DPLL_MPU); pll_mult = ((clksel >> 8) & 0x7ff); pll_div = (clksel & 0x7f) + 1; /* Get the system clock freq */ omap4_clk_get_sysclk_freq(NULL, &sysclk); /* Calculate the MPU freq */ mpuclk = ((uint64_t)sysclk * pll_mult) / pll_div; /* Return the value */ if (freq) *freq = mpuclk; return (0); } /** * omap4_clk_hsusbhost_activate - activates the USB clocks for the given module * @clkdev: pointer to the clock device structure. * @mem_res: array of memory resources allocated by the top level PRCM driver. * * The USB clocking setup seems to be a bit more tricky than the other modules, * to start with the clocking diagram for the HS host module shows 13 different * clocks. So to try and make it easier to follow the clocking activation - * and deactivation is handled in it's own set of callbacks. + * and deactivation is handled in its own set of callbacks. * * LOCKING: * Inherits the locks from the omap_prcm driver, no internal locking. * * RETURNS: * Returns 0 on success or a positive error code on failure. */ struct dpll_param { unsigned int m; unsigned int n; unsigned int m2; unsigned int m3; unsigned int m4; unsigned int m5; unsigned int m6; unsigned int m7; }; /* USB parameters */ struct dpll_param usb_dpll_param[7] = { /* 12M values */ {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, /* 13M values */ {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, /* 16.8M values */ {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, /* 19.2M values */ {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, /* 26M values */ {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, /* 27M values */ {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, /* 38.4M values */ #ifdef CONFIG_OMAP4_SDC {0x32, 0x1, 0x2, 0x0, 0x0, 0x0, 0x0, 0x0}, #else {0x32, 0x1, 0x2, 0x0, 0x0, 0x0, 0x0, 0x0}, #endif }; static int omap4_clk_hsusbhost_activate(struct ti_clock_dev *clkdev) { struct omap4_prcm_softc *sc; struct resource* clk_mem_res; uint32_t clksel_reg_off; uint32_t clksel; unsigned int i; sc = omap4_prcm_get_instance_softc(CM2_INSTANCE); if (sc == NULL) return ENXIO; switch (clkdev->id) { case USBTLL_CLK: /* For the USBTLL module we need to enable the following clocks: * - INIT_L4_ICLK (will be enabled by bootloader) * - TLL_CH0_FCLK * - TLL_CH1_FCLK */ /* We need the CM_L3INIT_HSUSBTLL_CLKCTRL register in CM2 register set */ clk_mem_res = sc->sc_res; clksel_reg_off = L3INIT_CM2_OFFSET + 0x68; /* Enable the module and also enable the optional func clocks for * channels 0 & 1 (is this needed ?) */ clksel = bus_read_4(clk_mem_res, clksel_reg_off); clksel &= ~CLKCTRL_MODULEMODE_MASK; clksel |= CLKCTRL_MODULEMODE_ENABLE; clksel |= (0x1 << 8); /* USB-HOST optional clock: USB_CH0_CLK */ clksel |= (0x1 << 9); /* USB-HOST optional clock: USB_CH1_CLK */ break; case USBHSHOST_CLK: case USBP1_PHY_CLK: case USBP2_PHY_CLK: case USBP1_UTMI_CLK: case USBP2_UTMI_CLK: case USBP1_HSIC_CLK: case USBP2_HSIC_CLK: /* For the USB HS HOST module we need to enable the following clocks: * - INIT_L4_ICLK (will be enabled by bootloader) * - INIT_L3_ICLK (will be enabled by bootloader) * - INIT_48MC_FCLK * - UTMI_ROOT_GFCLK (UTMI only, create a new clock for that ?) * - UTMI_P1_FCLK (UTMI only, create a new clock for that ?) * - UTMI_P2_FCLK (UTMI only, create a new clock for that ?) * - HSIC_P1_60 (HSIC only, create a new clock for that ?) * - HSIC_P1_480 (HSIC only, create a new clock for that ?) * - HSIC_P2_60 (HSIC only, create a new clock for that ?) * - HSIC_P2_480 (HSIC only, create a new clock for that ?) */ /* We need the CM_L3INIT_HSUSBHOST_CLKCTRL register in CM2 register set */ clk_mem_res = sc->sc_res; clksel_reg_off = L3INIT_CM2_OFFSET + 0x58; clksel = bus_read_4(clk_mem_res, clksel_reg_off); /* Enable the module and also enable the optional func clocks */ if (clkdev->id == USBHSHOST_CLK) { clksel &= ~CLKCTRL_MODULEMODE_MASK; clksel |= /*CLKCTRL_MODULEMODE_ENABLE*/2; clksel |= (0x1 << 15); /* USB-HOST clock control: FUNC48MCLK */ } else if (clkdev->id == USBP1_UTMI_CLK) clksel |= (0x1 << 8); /* UTMI_P1_CLK */ else if (clkdev->id == USBP2_UTMI_CLK) clksel |= (0x1 << 9); /* UTMI_P2_CLK */ else if (clkdev->id == USBP1_HSIC_CLK) clksel |= (0x5 << 11); /* HSIC60M_P1_CLK + HSIC480M_P1_CLK */ else if (clkdev->id == USBP2_HSIC_CLK) clksel |= (0x5 << 12); /* HSIC60M_P2_CLK + HSIC480M_P2_CLK */ break; default: return (EINVAL); } bus_write_4(clk_mem_res, clksel_reg_off, clksel); /* Try MAX_MODULE_ENABLE_WAIT number of times to check if enabled */ for (i = 0; i < MAX_MODULE_ENABLE_WAIT; i++) { clksel = bus_read_4(clk_mem_res, clksel_reg_off); if ((clksel & CLKCTRL_IDLEST_MASK) == CLKCTRL_IDLEST_ENABLED) break; } /* Check the enabled state */ if ((clksel & CLKCTRL_IDLEST_MASK) != CLKCTRL_IDLEST_ENABLED) { printf("Error: HERE failed to enable module with clock %d\n", clkdev->id); printf("Error: 0x%08x => 0x%08x\n", clksel_reg_off, clksel); return (ETIMEDOUT); } return (0); } /** * omap4_clk_generic_deactivate - checks if a module is accessible * @clkdev: pointer to the clock device structure. * @mem_res: array of memory resources allocated by the top level PRCM driver. * * * * LOCKING: * Inherits the locks from the omap_prcm driver, no internal locking. * * RETURNS: * Returns 0 on success or a positive error code on failure. */ static int omap4_clk_hsusbhost_deactivate(struct ti_clock_dev *clkdev) { struct omap4_prcm_softc *sc; struct resource* clk_mem_res; uint32_t clksel_reg_off; uint32_t clksel; sc = omap4_prcm_get_instance_softc(CM2_INSTANCE); if (sc == NULL) return ENXIO; switch (clkdev->id) { case USBTLL_CLK: /* We need the CM_L3INIT_HSUSBTLL_CLKCTRL register in CM2 register set */ clk_mem_res = sc->sc_res; clksel_reg_off = L3INIT_CM2_OFFSET + 0x68; clksel = bus_read_4(clk_mem_res, clksel_reg_off); clksel &= ~CLKCTRL_MODULEMODE_MASK; clksel |= CLKCTRL_MODULEMODE_DISABLE; break; case USBHSHOST_CLK: case USBP1_PHY_CLK: case USBP2_PHY_CLK: case USBP1_UTMI_CLK: case USBP2_UTMI_CLK: case USBP1_HSIC_CLK: case USBP2_HSIC_CLK: /* For the USB HS HOST module we need to enable the following clocks: * - INIT_L4_ICLK (will be enabled by bootloader) * - INIT_L3_ICLK (will be enabled by bootloader) * - INIT_48MC_FCLK * - UTMI_ROOT_GFCLK (UTMI only, create a new clock for that ?) * - UTMI_P1_FCLK (UTMI only, create a new clock for that ?) * - UTMI_P2_FCLK (UTMI only, create a new clock for that ?) * - HSIC_P1_60 (HSIC only, create a new clock for that ?) * - HSIC_P1_480 (HSIC only, create a new clock for that ?) * - HSIC_P2_60 (HSIC only, create a new clock for that ?) * - HSIC_P2_480 (HSIC only, create a new clock for that ?) */ /* We need the CM_L3INIT_HSUSBHOST_CLKCTRL register in CM2 register set */ clk_mem_res = sc->sc_res; clksel_reg_off = L3INIT_CM2_OFFSET + 0x58; clksel = bus_read_4(clk_mem_res, clksel_reg_off); /* Enable the module and also enable the optional func clocks */ if (clkdev->id == USBHSHOST_CLK) { clksel &= ~CLKCTRL_MODULEMODE_MASK; clksel |= CLKCTRL_MODULEMODE_DISABLE; clksel &= ~(0x1 << 15); /* USB-HOST clock control: FUNC48MCLK */ } else if (clkdev->id == USBP1_UTMI_CLK) clksel &= ~(0x1 << 8); /* UTMI_P1_CLK */ else if (clkdev->id == USBP2_UTMI_CLK) clksel &= ~(0x1 << 9); /* UTMI_P2_CLK */ else if (clkdev->id == USBP1_HSIC_CLK) clksel &= ~(0x5 << 11); /* HSIC60M_P1_CLK + HSIC480M_P1_CLK */ else if (clkdev->id == USBP2_HSIC_CLK) clksel &= ~(0x5 << 12); /* HSIC60M_P2_CLK + HSIC480M_P2_CLK */ break; default: return (EINVAL); } bus_write_4(clk_mem_res, clksel_reg_off, clksel); return (0); } /** * omap4_clk_hsusbhost_accessible - checks if a module is accessible * @clkdev: pointer to the clock device structure. * @mem_res: array of memory resources allocated by the top level PRCM driver. * * * * LOCKING: * Inherits the locks from the omap_prcm driver, no internal locking. * * RETURNS: * Returns 0 if module is not enable, 1 if module is enabled or a negative * error code on failure. */ static int omap4_clk_hsusbhost_accessible(struct ti_clock_dev *clkdev) { struct omap4_prcm_softc *sc; struct resource* clk_mem_res; uint32_t clksel_reg_off; uint32_t clksel; sc = omap4_prcm_get_instance_softc(CM2_INSTANCE); if (sc == NULL) return ENXIO; if (clkdev->id == USBTLL_CLK) { /* We need the CM_L3INIT_HSUSBTLL_CLKCTRL register in CM2 register set */ clk_mem_res = sc->sc_res; clksel_reg_off = L3INIT_CM2_OFFSET + 0x68; } else if (clkdev->id == USBHSHOST_CLK) { /* We need the CM_L3INIT_HSUSBHOST_CLKCTRL register in CM2 register set */ clk_mem_res = sc->sc_res; clksel_reg_off = L3INIT_CM2_OFFSET + 0x58; } else { return (EINVAL); } clksel = bus_read_4(clk_mem_res, clksel_reg_off); /* Check the enabled state */ if ((clksel & CLKCTRL_IDLEST_MASK) != CLKCTRL_IDLEST_ENABLED) return (0); return (1); } /** * omap4_clk_hsusbhost_set_source - sets the source clocks * @clkdev: pointer to the clock device structure. * @clksrc: the clock source ID for the given clock. * @mem_res: array of memory resources allocated by the top level PRCM driver. * * * * LOCKING: * Inherits the locks from the omap_prcm driver, no internal locking. * * RETURNS: * Returns 0 if successful otherwise a negative error code on failure. */ static int omap4_clk_hsusbhost_set_source(struct ti_clock_dev *clkdev, clk_src_t clksrc) { struct omap4_prcm_softc *sc; struct resource* clk_mem_res; uint32_t clksel_reg_off; uint32_t clksel; unsigned int bit; sc = omap4_prcm_get_instance_softc(CM2_INSTANCE); if (sc == NULL) return ENXIO; if (clkdev->id == USBP1_PHY_CLK) bit = 24; else if (clkdev->id != USBP2_PHY_CLK) bit = 25; else return (EINVAL); /* We need the CM_L3INIT_HSUSBHOST_CLKCTRL register in CM2 register set */ clk_mem_res = sc->sc_res; clksel_reg_off = L3INIT_CM2_OFFSET + 0x58; clksel = bus_read_4(clk_mem_res, clksel_reg_off); /* Set the clock source to either external or internal */ if (clksrc == EXT_CLK) clksel |= (0x1 << bit); else clksel &= ~(0x1 << bit); bus_write_4(clk_mem_res, clksel_reg_off, clksel); return (0); } #define PRM_RSTCTRL 0x1b00 #define PRM_RSTCTRL_RESET 0x2 static void omap4_prcm_reset(void) { struct omap4_prcm_softc *sc; sc = omap4_prcm_get_instance_softc(PRM_INSTANCE); if (sc == NULL) return; bus_write_4(sc->sc_res, PRM_RSTCTRL, bus_read_4(sc->sc_res, PRM_RSTCTRL) | PRM_RSTCTRL_RESET); bus_read_4(sc->sc_res, PRM_RSTCTRL); } /** * omap4_prcm_probe - probe function for the driver * @dev: prcm device handle * * Simply sets the name of the driver module. * * LOCKING: * None * * RETURNS: * Always returns 0 */ static int omap4_prcm_probe(device_t dev) { const struct ofw_compat_data *ocd; if (!ofw_bus_status_okay(dev)) return (ENXIO); ocd = ofw_bus_search_compatible(dev, compat_data); if ((int)ocd->ocd_data == 0) return (ENXIO); switch ((int)ocd->ocd_data) { case PRM_INSTANCE: device_set_desc(dev, "TI OMAP Power, Reset and Clock Management (PRM)"); break; case CM1_INSTANCE: device_set_desc(dev, "TI OMAP Power, Reset and Clock Management (C1)"); break; case CM2_INSTANCE: device_set_desc(dev, "TI OMAP Power, Reset and Clock Management (C2)"); break; default: device_printf(dev, "unknown instance type: %d\n", (int)ocd->ocd_data); return (ENXIO); } return (BUS_PROBE_DEFAULT); } /** * omap_prcm_attach - attach function for the driver * @dev: prcm device handle * * Allocates and sets up the driver context, this simply entails creating a * bus mappings for the PRCM register set. * * LOCKING: * None * * RETURNS: * Always returns 0 */ extern uint32_t platform_arm_tmr_freq; static int omap4_prcm_attach(device_t dev) { struct omap4_prcm_softc *sc; unsigned int freq; const struct ofw_compat_data *ocd; sc = device_get_softc(dev); ocd = ofw_bus_search_compatible(dev, compat_data); sc->sc_instance = (int)ocd->ocd_data; sc->sc_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &sc->sc_rid, RF_ACTIVE); if (sc->sc_res == NULL) { device_printf(dev, "could not allocate resources\n"); return (ENXIO); } ti_cpu_reset = omap4_prcm_reset; /* * In order to determine ARM frequency we need both RPM and CM1 * instances up and running. So wait until all CRM devices are * initialized. Should be replaced with proper clock framework */ if (device_get_unit(dev) == 2) { omap4_clk_get_arm_fclk_freq(NULL, &freq); arm_tmr_change_frequency(freq / 2); } return (0); } static device_method_t omap4_prcm_methods[] = { DEVMETHOD(device_probe, omap4_prcm_probe), DEVMETHOD(device_attach, omap4_prcm_attach), {0, 0}, }; static driver_t omap4_prcm_driver = { "omap4_prcm", omap4_prcm_methods, sizeof(struct omap4_prcm_softc), }; static devclass_t omap4_prcm_devclass; EARLY_DRIVER_MODULE(omap4_prcm, simplebus, omap4_prcm_driver, omap4_prcm_devclass, 0, 0, BUS_PASS_TIMER + BUS_PASS_ORDER_EARLY); MODULE_VERSION(omap4_prcm, 1); Index: head/sys/arm64/arm64/identcpu.c =================================================================== --- head/sys/arm64/arm64/identcpu.c (revision 308456) +++ head/sys/arm64/arm64/identcpu.c (revision 308457) @@ -1,800 +1,800 @@ /*- * Copyright (c) 2014 Andrew Turner * Copyright (c) 2014 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Semihalf * under sponsorship of the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include static int ident_lock; char machine[] = "arm64"; SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, "Machine class"); /* * Per-CPU affinity as provided in MPIDR_EL1 * Indexed by CPU number in logical order selected by the system. * Relevant fields can be extracted using CPU_AFFn macros, * Aff3.Aff2.Aff1.Aff0 construct a unique CPU address in the system. * * Fields used by us: * Aff1 - Cluster number * Aff0 - CPU number in Aff1 cluster */ uint64_t __cpu_affinity[MAXCPU]; static u_int cpu_aff_levels; struct cpu_desc { u_int cpu_impl; u_int cpu_part_num; u_int cpu_variant; u_int cpu_revision; const char *cpu_impl_name; const char *cpu_part_name; uint64_t mpidr; uint64_t id_aa64afr0; uint64_t id_aa64afr1; uint64_t id_aa64dfr0; uint64_t id_aa64dfr1; uint64_t id_aa64isar0; uint64_t id_aa64isar1; uint64_t id_aa64mmfr0; uint64_t id_aa64mmfr1; uint64_t id_aa64pfr0; uint64_t id_aa64pfr1; }; struct cpu_desc cpu_desc[MAXCPU]; static u_int cpu_print_regs; #define PRINT_ID_AA64_AFR0 0x00000001 #define PRINT_ID_AA64_AFR1 0x00000002 #define PRINT_ID_AA64_DFR0 0x00000004 #define PRINT_ID_AA64_DFR1 0x00000008 #define PRINT_ID_AA64_ISAR0 0x00000010 #define PRINT_ID_AA64_ISAR1 0x00000020 #define PRINT_ID_AA64_MMFR0 0x00000040 #define PRINT_ID_AA64_MMFR1 0x00000080 #define PRINT_ID_AA64_PFR0 0x00000100 #define PRINT_ID_AA64_PFR1 0x00000200 struct cpu_parts { u_int part_id; const char *part_name; }; #define CPU_PART_NONE { 0, "Unknown Processor" } struct cpu_implementers { u_int impl_id; const char *impl_name; /* * Part number is implementation defined * so each vendor will have its own set of values and names. */ const struct cpu_parts *cpu_parts; }; #define CPU_IMPLEMENTER_NONE { 0, "Unknown Implementer", cpu_parts_none } /* * Per-implementer table of (PartNum, CPU Name) pairs. */ /* ARM Ltd. */ static const struct cpu_parts cpu_parts_arm[] = { { CPU_PART_FOUNDATION, "Foundation-Model" }, { CPU_PART_CORTEX_A53, "Cortex-A53" }, { CPU_PART_CORTEX_A57, "Cortex-A57" }, CPU_PART_NONE, }; /* Cavium */ static const struct cpu_parts cpu_parts_cavium[] = { { CPU_PART_THUNDER, "Thunder" }, CPU_PART_NONE, }; /* Unknown */ static const struct cpu_parts cpu_parts_none[] = { CPU_PART_NONE, }; /* * Implementers table. */ const struct cpu_implementers cpu_implementers[] = { { CPU_IMPL_ARM, "ARM", cpu_parts_arm }, { CPU_IMPL_BROADCOM, "Broadcom", cpu_parts_none }, { CPU_IMPL_CAVIUM, "Cavium", cpu_parts_cavium }, { CPU_IMPL_DEC, "DEC", cpu_parts_none }, { CPU_IMPL_INFINEON, "IFX", cpu_parts_none }, { CPU_IMPL_FREESCALE, "Freescale", cpu_parts_none }, { CPU_IMPL_NVIDIA, "NVIDIA", cpu_parts_none }, { CPU_IMPL_APM, "APM", cpu_parts_none }, { CPU_IMPL_QUALCOMM, "Qualcomm", cpu_parts_none }, { CPU_IMPL_MARVELL, "Marvell", cpu_parts_none }, { CPU_IMPL_INTEL, "Intel", cpu_parts_none }, CPU_IMPLEMENTER_NONE, }; static void identify_cpu_sysinit(void *dummy __unused) { int cpu; CPU_FOREACH(cpu) { print_cpu_features(cpu); } } SYSINIT(idenrity_cpu, SI_SUB_SMP, SI_ORDER_ANY, identify_cpu_sysinit, NULL); void print_cpu_features(u_int cpu) { int printed; printf("CPU%3d: %s %s r%dp%d", cpu, cpu_desc[cpu].cpu_impl_name, cpu_desc[cpu].cpu_part_name, cpu_desc[cpu].cpu_variant, cpu_desc[cpu].cpu_revision); printf(" affinity:"); switch(cpu_aff_levels) { default: case 4: printf(" %2d", CPU_AFF3(cpu_desc[cpu].mpidr)); /* FALLTHROUGH */ case 3: printf(" %2d", CPU_AFF2(cpu_desc[cpu].mpidr)); /* FALLTHROUGH */ case 2: printf(" %2d", CPU_AFF1(cpu_desc[cpu].mpidr)); /* FALLTHROUGH */ case 1: case 0: /* On UP this will be zero */ printf(" %2d", CPU_AFF0(cpu_desc[cpu].mpidr)); break; } printf("\n"); /* * There is a hardware errata where, if one CPU is performing a TLB * invalidation while another is performing a store-exclusive the * store-exclusive may return the wrong status. A workaround seems * to be to use an IPI to invalidate on each CPU, however given the * limited number of affected units (pass 1.1 is the evaluation * hardware revision), and the lack of information from Cavium * this has not been implemented. * * At the time of writing this the only information is from: * https://lkml.org/lkml/2016/8/4/722 */ /* - * XXX: CPU_MATCH_ERRATA_CAVIUM_THUNDER_1_1 on it's own also + * XXX: CPU_MATCH_ERRATA_CAVIUM_THUNDER_1_1 on its own also * triggers on pass 2.0+. */ if (cpu == 0 && CPU_VAR(PCPU_GET(midr)) == 0 && CPU_MATCH_ERRATA_CAVIUM_THUNDER_1_1) printf("WARNING: ThunderX Pass 1.1 detected.\nThis has known " "hardware bugs that may cause the incorrect operation of " "atomic operations.\n"); if (cpu != 0 && cpu_print_regs == 0) return; #define SEP_STR ((printed++) == 0) ? "" : "," /* AArch64 Instruction Set Attribute Register 0 */ if (cpu == 0 || (cpu_print_regs & PRINT_ID_AA64_ISAR0) != 0) { printed = 0; printf(" Instruction Set Attributes 0 = <"); switch (ID_AA64ISAR0_RDM(cpu_desc[cpu].id_aa64isar0)) { case ID_AA64ISAR0_RDM_NONE: break; case ID_AA64ISAR0_RDM_IMPL: printf("%sRDM", SEP_STR); break; default: printf("%sUnknown RDM", SEP_STR); } switch (ID_AA64ISAR0_ATOMIC(cpu_desc[cpu].id_aa64isar0)) { case ID_AA64ISAR0_ATOMIC_NONE: break; case ID_AA64ISAR0_ATOMIC_IMPL: printf("%sAtomic", SEP_STR); break; default: printf("%sUnknown Atomic", SEP_STR); } switch (ID_AA64ISAR0_AES(cpu_desc[cpu].id_aa64isar0)) { case ID_AA64ISAR0_AES_NONE: break; case ID_AA64ISAR0_AES_BASE: printf("%sAES", SEP_STR); break; case ID_AA64ISAR0_AES_PMULL: printf("%sAES+PMULL", SEP_STR); break; default: printf("%sUnknown AES", SEP_STR); break; } switch (ID_AA64ISAR0_SHA1(cpu_desc[cpu].id_aa64isar0)) { case ID_AA64ISAR0_SHA1_NONE: break; case ID_AA64ISAR0_SHA1_BASE: printf("%sSHA1", SEP_STR); break; default: printf("%sUnknown SHA1", SEP_STR); break; } switch (ID_AA64ISAR0_SHA2(cpu_desc[cpu].id_aa64isar0)) { case ID_AA64ISAR0_SHA2_NONE: break; case ID_AA64ISAR0_SHA2_BASE: printf("%sSHA2", SEP_STR); break; default: printf("%sUnknown SHA2", SEP_STR); break; } switch (ID_AA64ISAR0_CRC32(cpu_desc[cpu].id_aa64isar0)) { case ID_AA64ISAR0_CRC32_NONE: break; case ID_AA64ISAR0_CRC32_BASE: printf("%sCRC32", SEP_STR); break; default: printf("%sUnknown CRC32", SEP_STR); break; } if ((cpu_desc[cpu].id_aa64isar0 & ~ID_AA64ISAR0_MASK) != 0) printf("%s%#lx", SEP_STR, cpu_desc[cpu].id_aa64isar0 & ~ID_AA64ISAR0_MASK); printf(">\n"); } /* AArch64 Instruction Set Attribute Register 1 */ if (cpu == 0 || (cpu_print_regs & PRINT_ID_AA64_ISAR1) != 0) { printf(" Instruction Set Attributes 1 = <%#lx>\n", cpu_desc[cpu].id_aa64isar1); } /* AArch64 Processor Feature Register 0 */ if (cpu == 0 || (cpu_print_regs & PRINT_ID_AA64_PFR0) != 0) { printed = 0; printf(" Processor Features 0 = <"); switch (ID_AA64PFR0_GIC(cpu_desc[cpu].id_aa64pfr0)) { case ID_AA64PFR0_GIC_CPUIF_NONE: break; case ID_AA64PFR0_GIC_CPUIF_EN: printf("%sGIC", SEP_STR); break; default: printf("%sUnknown GIC interface", SEP_STR); break; } switch (ID_AA64PFR0_ADV_SIMD(cpu_desc[cpu].id_aa64pfr0)) { case ID_AA64PFR0_ADV_SIMD_NONE: break; case ID_AA64PFR0_ADV_SIMD_IMPL: printf("%sAdvSIMD", SEP_STR); break; default: printf("%sUnknown AdvSIMD", SEP_STR); break; } switch (ID_AA64PFR0_FP(cpu_desc[cpu].id_aa64pfr0)) { case ID_AA64PFR0_FP_NONE: break; case ID_AA64PFR0_FP_IMPL: printf("%sFloat", SEP_STR); break; default: printf("%sUnknown Float", SEP_STR); break; } switch (ID_AA64PFR0_EL3(cpu_desc[cpu].id_aa64pfr0)) { case ID_AA64PFR0_EL3_NONE: printf("%sNo EL3", SEP_STR); break; case ID_AA64PFR0_EL3_64: printf("%sEL3", SEP_STR); break; case ID_AA64PFR0_EL3_64_32: printf("%sEL3 32", SEP_STR); break; default: printf("%sUnknown EL3", SEP_STR); break; } switch (ID_AA64PFR0_EL2(cpu_desc[cpu].id_aa64pfr0)) { case ID_AA64PFR0_EL2_NONE: printf("%sNo EL2", SEP_STR); break; case ID_AA64PFR0_EL2_64: printf("%sEL2", SEP_STR); break; case ID_AA64PFR0_EL2_64_32: printf("%sEL2 32", SEP_STR); break; default: printf("%sUnknown EL2", SEP_STR); break; } switch (ID_AA64PFR0_EL1(cpu_desc[cpu].id_aa64pfr0)) { case ID_AA64PFR0_EL1_64: printf("%sEL1", SEP_STR); break; case ID_AA64PFR0_EL1_64_32: printf("%sEL1 32", SEP_STR); break; default: printf("%sUnknown EL1", SEP_STR); break; } switch (ID_AA64PFR0_EL0(cpu_desc[cpu].id_aa64pfr0)) { case ID_AA64PFR0_EL0_64: printf("%sEL0", SEP_STR); break; case ID_AA64PFR0_EL0_64_32: printf("%sEL0 32", SEP_STR); break; default: printf("%sUnknown EL0", SEP_STR); break; } if ((cpu_desc[cpu].id_aa64pfr0 & ~ID_AA64PFR0_MASK) != 0) printf("%s%#lx", SEP_STR, cpu_desc[cpu].id_aa64pfr0 & ~ID_AA64PFR0_MASK); printf(">\n"); } /* AArch64 Processor Feature Register 1 */ if (cpu == 0 || (cpu_print_regs & PRINT_ID_AA64_PFR1) != 0) { printf(" Processor Features 1 = <%#lx>\n", cpu_desc[cpu].id_aa64pfr1); } /* AArch64 Memory Model Feature Register 0 */ if (cpu == 0 || (cpu_print_regs & PRINT_ID_AA64_MMFR0) != 0) { printed = 0; printf(" Memory Model Features 0 = <"); switch (ID_AA64MMFR0_TGRAN4(cpu_desc[cpu].id_aa64mmfr0)) { case ID_AA64MMFR0_TGRAN4_NONE: break; case ID_AA64MMFR0_TGRAN4_IMPL: printf("%s4k Granule", SEP_STR); break; default: printf("%sUnknown 4k Granule", SEP_STR); break; } switch (ID_AA64MMFR0_TGRAN16(cpu_desc[cpu].id_aa64mmfr0)) { case ID_AA64MMFR0_TGRAN16_NONE: break; case ID_AA64MMFR0_TGRAN16_IMPL: printf("%s16k Granule", SEP_STR); break; default: printf("%sUnknown 16k Granule", SEP_STR); break; } switch (ID_AA64MMFR0_TGRAN64(cpu_desc[cpu].id_aa64mmfr0)) { case ID_AA64MMFR0_TGRAN64_NONE: break; case ID_AA64MMFR0_TGRAN64_IMPL: printf("%s64k Granule", SEP_STR); break; default: printf("%sUnknown 64k Granule", SEP_STR); break; } switch (ID_AA64MMFR0_BIGEND(cpu_desc[cpu].id_aa64mmfr0)) { case ID_AA64MMFR0_BIGEND_FIXED: break; case ID_AA64MMFR0_BIGEND_MIXED: printf("%sMixedEndian", SEP_STR); break; default: printf("%sUnknown Endian switching", SEP_STR); break; } switch (ID_AA64MMFR0_BIGEND_EL0(cpu_desc[cpu].id_aa64mmfr0)) { case ID_AA64MMFR0_BIGEND_EL0_FIXED: break; case ID_AA64MMFR0_BIGEND_EL0_MIXED: printf("%sEL0 MixEndian", SEP_STR); break; default: printf("%sUnknown EL0 Endian switching", SEP_STR); break; } switch (ID_AA64MMFR0_S_NS_MEM(cpu_desc[cpu].id_aa64mmfr0)) { case ID_AA64MMFR0_S_NS_MEM_NONE: break; case ID_AA64MMFR0_S_NS_MEM_DISTINCT: printf("%sS/NS Mem", SEP_STR); break; default: printf("%sUnknown S/NS Mem", SEP_STR); break; } switch (ID_AA64MMFR0_ASID_BITS(cpu_desc[cpu].id_aa64mmfr0)) { case ID_AA64MMFR0_ASID_BITS_8: printf("%s8bit ASID", SEP_STR); break; case ID_AA64MMFR0_ASID_BITS_16: printf("%s16bit ASID", SEP_STR); break; default: printf("%sUnknown ASID", SEP_STR); break; } switch (ID_AA64MMFR0_PA_RANGE(cpu_desc[cpu].id_aa64mmfr0)) { case ID_AA64MMFR0_PA_RANGE_4G: printf("%s4GB PA", SEP_STR); break; case ID_AA64MMFR0_PA_RANGE_64G: printf("%s64GB PA", SEP_STR); break; case ID_AA64MMFR0_PA_RANGE_1T: printf("%s1TB PA", SEP_STR); break; case ID_AA64MMFR0_PA_RANGE_4T: printf("%s4TB PA", SEP_STR); break; case ID_AA64MMFR0_PA_RANGE_16T: printf("%s16TB PA", SEP_STR); break; case ID_AA64MMFR0_PA_RANGE_256T: printf("%s256TB PA", SEP_STR); break; default: printf("%sUnknown PA Range", SEP_STR); break; } if ((cpu_desc[cpu].id_aa64mmfr0 & ~ID_AA64MMFR0_MASK) != 0) printf("%s%#lx", SEP_STR, cpu_desc[cpu].id_aa64mmfr0 & ~ID_AA64MMFR0_MASK); printf(">\n"); } /* AArch64 Memory Model Feature Register 1 */ if (cpu == 0 || (cpu_print_regs & PRINT_ID_AA64_MMFR1) != 0) { printed = 0; printf(" Memory Model Features 1 = <"); switch (ID_AA64MMFR1_PAN(cpu_desc[cpu].id_aa64mmfr1)) { case ID_AA64MMFR1_PAN_NONE: break; case ID_AA64MMFR1_PAN_IMPL: printf("%sPAN", SEP_STR); break; default: printf("%sUnknown PAN", SEP_STR); break; } switch (ID_AA64MMFR1_LO(cpu_desc[cpu].id_aa64mmfr1)) { case ID_AA64MMFR1_LO_NONE: break; case ID_AA64MMFR1_LO_IMPL: printf("%sLO", SEP_STR); break; default: printf("%sUnknown LO", SEP_STR); break; } switch (ID_AA64MMFR1_HPDS(cpu_desc[cpu].id_aa64mmfr1)) { case ID_AA64MMFR1_HPDS_NONE: break; case ID_AA64MMFR1_HPDS_IMPL: printf("%sHPDS", SEP_STR); break; default: printf("%sUnknown HPDS", SEP_STR); break; } switch (ID_AA64MMFR1_VH(cpu_desc[cpu].id_aa64mmfr1)) { case ID_AA64MMFR1_VH_NONE: break; case ID_AA64MMFR1_VH_IMPL: printf("%sVHE", SEP_STR); break; default: printf("%sUnknown VHE", SEP_STR); break; } switch (ID_AA64MMFR1_VMIDBITS(cpu_desc[cpu].id_aa64mmfr1)) { case ID_AA64MMFR1_VMIDBITS_8: break; case ID_AA64MMFR1_VMIDBITS_16: printf("%s16 VMID bits", SEP_STR); break; default: printf("%sUnknown VMID bits", SEP_STR); break; } switch (ID_AA64MMFR1_HAFDBS(cpu_desc[cpu].id_aa64mmfr1)) { case ID_AA64MMFR1_HAFDBS_NONE: break; case ID_AA64MMFR1_HAFDBS_AF: printf("%sAF", SEP_STR); break; case ID_AA64MMFR1_HAFDBS_AF_DBS: printf("%sAF+DBS", SEP_STR); break; default: printf("%sUnknown Hardware update AF/DBS", SEP_STR); break; } if ((cpu_desc[cpu].id_aa64mmfr1 & ~ID_AA64MMFR1_MASK) != 0) printf("%s%#lx", SEP_STR, cpu_desc[cpu].id_aa64mmfr1 & ~ID_AA64MMFR1_MASK); printf(">\n"); } /* AArch64 Debug Feature Register 0 */ if (cpu == 0 || (cpu_print_regs & PRINT_ID_AA64_DFR0) != 0) { printed = 0; printf(" Debug Features 0 = <"); printf("%s%lu CTX Breakpoints", SEP_STR, ID_AA64DFR0_CTX_CMPS(cpu_desc[cpu].id_aa64dfr0)); printf("%s%lu Watchpoints", SEP_STR, ID_AA64DFR0_WRPS(cpu_desc[cpu].id_aa64dfr0)); printf("%s%lu Breakpoints", SEP_STR, ID_AA64DFR0_BRPS(cpu_desc[cpu].id_aa64dfr0)); switch (ID_AA64DFR0_PMU_VER(cpu_desc[cpu].id_aa64dfr0)) { case ID_AA64DFR0_PMU_VER_NONE: break; case ID_AA64DFR0_PMU_VER_3: printf("%sPMUv3", SEP_STR); break; case ID_AA64DFR0_PMU_VER_3_1: printf("%sPMUv3+16 bit evtCount", SEP_STR); break; case ID_AA64DFR0_PMU_VER_IMPL: printf("%sImplementation defined PMU", SEP_STR); break; default: printf("%sUnknown PMU", SEP_STR); break; } switch (ID_AA64DFR0_TRACE_VER(cpu_desc[cpu].id_aa64dfr0)) { case ID_AA64DFR0_TRACE_VER_NONE: break; case ID_AA64DFR0_TRACE_VER_IMPL: printf("%sTrace", SEP_STR); break; default: printf("%sUnknown Trace", SEP_STR); break; } switch (ID_AA64DFR0_DEBUG_VER(cpu_desc[cpu].id_aa64dfr0)) { case ID_AA64DFR0_DEBUG_VER_8: printf("%sDebug v8", SEP_STR); break; case ID_AA64DFR0_DEBUG_VER_8_VHE: printf("%sDebug v8+VHE", SEP_STR); break; default: printf("%sUnknown Debug", SEP_STR); break; } if (cpu_desc[cpu].id_aa64dfr0 & ~ID_AA64DFR0_MASK) printf("%s%#lx", SEP_STR, cpu_desc[cpu].id_aa64dfr0 & ~ID_AA64DFR0_MASK); printf(">\n"); } /* AArch64 Memory Model Feature Register 1 */ if (cpu == 0 || (cpu_print_regs & PRINT_ID_AA64_DFR1) != 0) { printf(" Debug Features 1 = <%#lx>\n", cpu_desc[cpu].id_aa64dfr1); } /* AArch64 Auxiliary Feature Register 0 */ if (cpu == 0 || (cpu_print_regs & PRINT_ID_AA64_AFR0) != 0) { printf(" Auxiliary Features 0 = <%#lx>\n", cpu_desc[cpu].id_aa64afr0); } /* AArch64 Auxiliary Feature Register 1 */ if (cpu == 0 || (cpu_print_regs & PRINT_ID_AA64_AFR1) != 0) { printf(" Auxiliary Features 1 = <%#lx>\n", cpu_desc[cpu].id_aa64afr1); } #undef SEP_STR } void identify_cpu(void) { u_int midr; u_int impl_id; u_int part_id; u_int cpu; size_t i; const struct cpu_parts *cpu_partsp = NULL; cpu = PCPU_GET(cpuid); midr = get_midr(); /* * Store midr to pcpu to allow fast reading * from EL0, EL1 and assembly code. */ PCPU_SET(midr, midr); impl_id = CPU_IMPL(midr); for (i = 0; i < nitems(cpu_implementers); i++) { if (impl_id == cpu_implementers[i].impl_id || cpu_implementers[i].impl_id == 0) { cpu_desc[cpu].cpu_impl = impl_id; cpu_desc[cpu].cpu_impl_name = cpu_implementers[i].impl_name; cpu_partsp = cpu_implementers[i].cpu_parts; break; } } part_id = CPU_PART(midr); for (i = 0; &cpu_partsp[i] != NULL; i++) { if (part_id == cpu_partsp[i].part_id || cpu_partsp[i].part_id == 0) { cpu_desc[cpu].cpu_part_num = part_id; cpu_desc[cpu].cpu_part_name = cpu_partsp[i].part_name; break; } } cpu_desc[cpu].cpu_revision = CPU_REV(midr); cpu_desc[cpu].cpu_variant = CPU_VAR(midr); /* Save affinity for current CPU */ cpu_desc[cpu].mpidr = get_mpidr(); CPU_AFFINITY(cpu) = cpu_desc[cpu].mpidr & CPU_AFF_MASK; cpu_desc[cpu].id_aa64dfr0 = READ_SPECIALREG(id_aa64dfr0_el1); cpu_desc[cpu].id_aa64dfr1 = READ_SPECIALREG(id_aa64dfr1_el1); cpu_desc[cpu].id_aa64isar0 = READ_SPECIALREG(id_aa64isar0_el1); cpu_desc[cpu].id_aa64isar1 = READ_SPECIALREG(id_aa64isar1_el1); cpu_desc[cpu].id_aa64mmfr0 = READ_SPECIALREG(id_aa64mmfr0_el1); cpu_desc[cpu].id_aa64mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1); cpu_desc[cpu].id_aa64pfr0 = READ_SPECIALREG(id_aa64pfr0_el1); cpu_desc[cpu].id_aa64pfr1 = READ_SPECIALREG(id_aa64pfr1_el1); if (cpu != 0) { /* * This code must run on one cpu at a time, but we are * not scheduling on the current core so implement a * simple spinlock. */ while (atomic_cmpset_acq_int(&ident_lock, 0, 1) == 0) __asm __volatile("wfe" ::: "memory"); switch (cpu_aff_levels) { case 0: if (CPU_AFF0(cpu_desc[cpu].mpidr) != CPU_AFF0(cpu_desc[0].mpidr)) cpu_aff_levels = 1; /* FALLTHROUGH */ case 1: if (CPU_AFF1(cpu_desc[cpu].mpidr) != CPU_AFF1(cpu_desc[0].mpidr)) cpu_aff_levels = 2; /* FALLTHROUGH */ case 2: if (CPU_AFF2(cpu_desc[cpu].mpidr) != CPU_AFF2(cpu_desc[0].mpidr)) cpu_aff_levels = 3; /* FALLTHROUGH */ case 3: if (CPU_AFF3(cpu_desc[cpu].mpidr) != CPU_AFF3(cpu_desc[0].mpidr)) cpu_aff_levels = 4; break; } if (cpu_desc[cpu].id_aa64afr0 != cpu_desc[0].id_aa64afr0) cpu_print_regs |= PRINT_ID_AA64_AFR0; if (cpu_desc[cpu].id_aa64afr1 != cpu_desc[0].id_aa64afr1) cpu_print_regs |= PRINT_ID_AA64_AFR1; if (cpu_desc[cpu].id_aa64dfr0 != cpu_desc[0].id_aa64dfr0) cpu_print_regs |= PRINT_ID_AA64_DFR0; if (cpu_desc[cpu].id_aa64dfr1 != cpu_desc[0].id_aa64dfr1) cpu_print_regs |= PRINT_ID_AA64_DFR1; if (cpu_desc[cpu].id_aa64isar0 != cpu_desc[0].id_aa64isar0) cpu_print_regs |= PRINT_ID_AA64_ISAR0; if (cpu_desc[cpu].id_aa64isar1 != cpu_desc[0].id_aa64isar1) cpu_print_regs |= PRINT_ID_AA64_ISAR1; if (cpu_desc[cpu].id_aa64mmfr0 != cpu_desc[0].id_aa64mmfr0) cpu_print_regs |= PRINT_ID_AA64_MMFR0; if (cpu_desc[cpu].id_aa64mmfr1 != cpu_desc[0].id_aa64mmfr1) cpu_print_regs |= PRINT_ID_AA64_MMFR1; if (cpu_desc[cpu].id_aa64pfr0 != cpu_desc[0].id_aa64pfr0) cpu_print_regs |= PRINT_ID_AA64_PFR0; if (cpu_desc[cpu].id_aa64pfr1 != cpu_desc[0].id_aa64pfr1) cpu_print_regs |= PRINT_ID_AA64_PFR1; /* Wake up the other CPUs */ atomic_store_rel_int(&ident_lock, 0); __asm __volatile("sev" ::: "memory"); } } Index: head/sys/cam/cam.h =================================================================== --- head/sys/cam/cam.h (revision 308456) +++ head/sys/cam/cam.h (revision 308457) @@ -1,413 +1,413 @@ /*- * Data structures and definitions for the CAM system. * * Copyright (c) 1997 Justin T. Gibbs. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _CAM_CAM_H #define _CAM_CAM_H 1 #ifdef _KERNEL #include #endif #include typedef u_int path_id_t; typedef u_int target_id_t; typedef u_int64_t lun_id_t; #define CAM_XPT_PATH_ID ((path_id_t)~0) #define CAM_BUS_WILDCARD ((path_id_t)~0) #define CAM_TARGET_WILDCARD ((target_id_t)~0) #define CAM_LUN_WILDCARD (~(u_int)0) #define CAM_EXTLUN_BYTE_SWIZZLE(lun) ( \ ((((u_int64_t)lun) & 0xffff000000000000L) >> 48) | \ ((((u_int64_t)lun) & 0x0000ffff00000000L) >> 16) | \ ((((u_int64_t)lun) & 0x00000000ffff0000L) << 16) | \ ((((u_int64_t)lun) & 0x000000000000ffffL) << 48)) /* * Maximum length for a CAM CDB. */ #define CAM_MAX_CDBLEN 16 /* * Definition of a CAM peripheral driver entry. Peripheral drivers instantiate * one of these for each device they wish to communicate with and pass it into * the xpt layer when they wish to schedule work on that device via the * xpt_schedule API. */ struct cam_periph; /* * Priority information for a CAM structure. */ typedef enum { CAM_RL_HOST, CAM_RL_BUS, CAM_RL_XPT, CAM_RL_DEV, CAM_RL_NORMAL, CAM_RL_VALUES } cam_rl; /* * The generation number is incremented every time a new entry is entered into * the queue giving round robin per priority level scheduling. */ typedef struct { u_int32_t priority; #define CAM_PRIORITY_HOST ((CAM_RL_HOST << 8) + 0x80) #define CAM_PRIORITY_BUS ((CAM_RL_BUS << 8) + 0x80) #define CAM_PRIORITY_XPT ((CAM_RL_XPT << 8) + 0x80) #define CAM_PRIORITY_DEV ((CAM_RL_DEV << 8) + 0x80) #define CAM_PRIORITY_OOB (CAM_RL_DEV << 8) #define CAM_PRIORITY_NORMAL ((CAM_RL_NORMAL << 8) + 0x80) #define CAM_PRIORITY_NONE (u_int32_t)-1 u_int32_t generation; int index; #define CAM_UNQUEUED_INDEX -1 #define CAM_ACTIVE_INDEX -2 #define CAM_DONEQ_INDEX -3 #define CAM_EXTRAQ_INDEX INT_MAX } cam_pinfo; /* * Macro to compare two generation numbers. It is used like this: * * if (GENERATIONCMP(a, >=, b)) * ...; * * GERERATIONCMP uses modular arithmetic to guard against wraps * wraps in the generation number. */ #define GENERATIONCMP(x, op, y) ((int32_t)((x) - (y)) op 0) /* CAM flags XXX Move to cam_periph.h ??? */ typedef enum { CAM_FLAG_NONE = 0x00, CAM_EXPECT_INQ_CHANGE = 0x01, CAM_RETRY_SELTO = 0x02 /* Retry Selection Timeouts */ } cam_flags; enum { SF_RETRY_UA = 0x01, /* Retry UNIT ATTENTION conditions. */ SF_NO_PRINT = 0x02, /* Never print error status. */ SF_QUIET_IR = 0x04, /* Be quiet about Illegal Request responses */ SF_PRINT_ALWAYS = 0x08, /* Always print error status. */ SF_NO_RECOVERY = 0x10, /* Don't do active error recovery. */ SF_NO_RETRY = 0x20, /* Don't do any retries. */ SF_RETRY_BUSY = 0x40 /* Retry BUSY status. */ }; /* CAM Status field values */ typedef enum { /* CCB request is in progress */ CAM_REQ_INPROG = 0x00, /* CCB request completed without error */ CAM_REQ_CMP = 0x01, /* CCB request aborted by the host */ CAM_REQ_ABORTED = 0x02, /* Unable to abort CCB request */ CAM_UA_ABORT = 0x03, /* CCB request completed with an error */ CAM_REQ_CMP_ERR = 0x04, /* CAM subsystem is busy */ CAM_BUSY = 0x05, /* CCB request was invalid */ CAM_REQ_INVALID = 0x06, /* Supplied Path ID is invalid */ CAM_PATH_INVALID = 0x07, /* SCSI Device Not Installed/there */ CAM_DEV_NOT_THERE = 0x08, /* Unable to terminate I/O CCB request */ CAM_UA_TERMIO = 0x09, /* Target Selection Timeout */ CAM_SEL_TIMEOUT = 0x0a, /* Command timeout */ CAM_CMD_TIMEOUT = 0x0b, /* SCSI error, look at error code in CCB */ CAM_SCSI_STATUS_ERROR = 0x0c, /* Message Reject Received */ CAM_MSG_REJECT_REC = 0x0d, /* SCSI Bus Reset Sent/Received */ CAM_SCSI_BUS_RESET = 0x0e, /* Uncorrectable parity error occurred */ CAM_UNCOR_PARITY = 0x0f, /* Autosense: request sense cmd fail */ CAM_AUTOSENSE_FAIL = 0x10, /* No HBA Detected error */ CAM_NO_HBA = 0x11, /* Data Overrun error */ CAM_DATA_RUN_ERR = 0x12, /* Unexpected Bus Free */ CAM_UNEXP_BUSFREE = 0x13, /* Target Bus Phase Sequence Failure */ CAM_SEQUENCE_FAIL = 0x14, /* CCB length supplied is inadequate */ CAM_CCB_LEN_ERR = 0x15, /* Unable to provide requested capability*/ CAM_PROVIDE_FAIL = 0x16, /* A SCSI BDR msg was sent to target */ CAM_BDR_SENT = 0x17, /* CCB request terminated by the host */ CAM_REQ_TERMIO = 0x18, /* Unrecoverable Host Bus Adapter Error */ CAM_UNREC_HBA_ERROR = 0x19, /* Request was too large for this host */ CAM_REQ_TOO_BIG = 0x1a, /* * This request should be requeued to preserve * transaction ordering. This typically occurs * when the SIM recognizes an error that should * freeze the queue and must place additional * requests for the target at the sim level * back into the XPT queue. */ CAM_REQUEUE_REQ = 0x1b, /* ATA error, look at error code in CCB */ CAM_ATA_STATUS_ERROR = 0x1c, /* Initiator/Target Nexus lost. */ CAM_SCSI_IT_NEXUS_LOST = 0x1d, /* SMP error, look at error code in CCB */ CAM_SMP_STATUS_ERROR = 0x1e, /* * Command completed without error but exceeded the soft * timeout threshold. */ CAM_REQ_SOFTTIMEOUT = 0x1f, /* * 0x20 - 0x32 are unassigned */ /* Initiator Detected Error */ CAM_IDE = 0x33, /* Resource Unavailable */ CAM_RESRC_UNAVAIL = 0x34, /* Unacknowledged Event by Host */ CAM_UNACKED_EVENT = 0x35, /* Message Received in Host Target Mode */ CAM_MESSAGE_RECV = 0x36, /* Invalid CDB received in Host Target Mode */ CAM_INVALID_CDB = 0x37, /* Lun supplied is invalid */ CAM_LUN_INVALID = 0x38, /* Target ID supplied is invalid */ CAM_TID_INVALID = 0x39, /* The requested function is not available */ CAM_FUNC_NOTAVAIL = 0x3a, /* Nexus is not established */ CAM_NO_NEXUS = 0x3b, /* The initiator ID is invalid */ CAM_IID_INVALID = 0x3c, /* The SCSI CDB has been received */ CAM_CDB_RECVD = 0x3d, /* The LUN is already enabled for target mode */ CAM_LUN_ALRDY_ENA = 0x3e, /* SCSI Bus Busy */ CAM_SCSI_BUSY = 0x3f, /* * Flags */ /* The DEV queue is frozen w/this err */ CAM_DEV_QFRZN = 0x40, /* Autosense data valid for target */ CAM_AUTOSNS_VALID = 0x80, /* SIM ready to take more commands */ CAM_RELEASE_SIMQ = 0x100, - /* SIM has this command in it's queue */ + /* SIM has this command in its queue */ CAM_SIM_QUEUED = 0x200, /* Quality of service data is valid */ CAM_QOS_VALID = 0x400, /* Mask bits for just the status # */ CAM_STATUS_MASK = 0x3F, /* * Target Specific Adjunct Status */ /* sent sense with status */ CAM_SENT_SENSE = 0x40000000 } cam_status; typedef enum { CAM_ESF_NONE = 0x00, CAM_ESF_COMMAND = 0x01, CAM_ESF_CAM_STATUS = 0x02, CAM_ESF_PROTO_STATUS = 0x04, CAM_ESF_ALL = 0xff } cam_error_string_flags; typedef enum { CAM_EPF_NONE = 0x00, CAM_EPF_MINIMAL = 0x01, CAM_EPF_NORMAL = 0x02, CAM_EPF_ALL = 0x03, CAM_EPF_LEVEL_MASK = 0x0f /* All bits above bit 3 are protocol-specific */ } cam_error_proto_flags; typedef enum { CAM_ESF_PRINT_NONE = 0x00, CAM_ESF_PRINT_STATUS = 0x10, CAM_ESF_PRINT_SENSE = 0x20 } cam_error_scsi_flags; typedef enum { CAM_ESMF_PRINT_NONE = 0x00, CAM_ESMF_PRINT_STATUS = 0x10, CAM_ESMF_PRINT_FULL_CMD = 0x20, } cam_error_smp_flags; typedef enum { CAM_EAF_PRINT_NONE = 0x00, CAM_EAF_PRINT_STATUS = 0x10, CAM_EAF_PRINT_RESULT = 0x20 } cam_error_ata_flags; typedef enum { CAM_STRVIS_FLAG_NONE = 0x00, CAM_STRVIS_FLAG_NONASCII_MASK = 0x03, CAM_STRVIS_FLAG_NONASCII_TRIM = 0x00, CAM_STRVIS_FLAG_NONASCII_RAW = 0x01, CAM_STRVIS_FLAG_NONASCII_SPC = 0x02, CAM_STRVIS_FLAG_NONASCII_ESC = 0x03 } cam_strvis_flags; struct cam_status_entry { cam_status status_code; const char *status_text; }; extern const struct cam_status_entry cam_status_table[]; extern const int num_cam_status_entries; #ifdef _KERNEL extern int cam_sort_io_queues; #endif union ccb; struct sbuf; #ifdef SYSCTL_DECL /* from sysctl.h */ SYSCTL_DECL(_kern_cam); #endif __BEGIN_DECLS typedef int (cam_quirkmatch_t)(caddr_t, caddr_t); caddr_t cam_quirkmatch(caddr_t target, caddr_t quirk_table, int num_entries, int entry_size, cam_quirkmatch_t *comp_func); void cam_strvis(u_int8_t *dst, const u_int8_t *src, int srclen, int dstlen); void cam_strvis_sbuf(struct sbuf *sb, const u_int8_t *src, int srclen, uint32_t flags); int cam_strmatch(const u_int8_t *str, const u_int8_t *pattern, int str_len); const struct cam_status_entry* cam_fetch_status_entry(cam_status status); #ifdef _KERNEL char * cam_error_string(union ccb *ccb, char *str, int str_len, cam_error_string_flags flags, cam_error_proto_flags proto_flags); void cam_error_print(union ccb *ccb, cam_error_string_flags flags, cam_error_proto_flags proto_flags); #else /* _KERNEL */ struct cam_device; char * cam_error_string(struct cam_device *device, union ccb *ccb, char *str, int str_len, cam_error_string_flags flags, cam_error_proto_flags proto_flags); void cam_error_print(struct cam_device *device, union ccb *ccb, cam_error_string_flags flags, cam_error_proto_flags proto_flags, FILE *ofile); #endif /* _KERNEL */ __END_DECLS #ifdef _KERNEL static __inline void cam_init_pinfo(cam_pinfo *pinfo); static __inline void cam_init_pinfo(cam_pinfo *pinfo) { pinfo->priority = CAM_PRIORITY_NONE; pinfo->index = CAM_UNQUEUED_INDEX; } #endif #endif /* _CAM_CAM_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (revision 308456) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c (revision 308457) @@ -1,7425 +1,7425 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ /* * DVA-based Adjustable Replacement Cache * * While much of the theory of operation used here is * based on the self-tuning, low overhead replacement cache * presented by Megiddo and Modha at FAST 2003, there are some * significant differences: * * 1. The Megiddo and Modha model assumes any page is evictable. * Pages in its cache cannot be "locked" into memory. This makes * the eviction algorithm simple: evict the last page in the list. * This also make the performance characteristics easy to reason * about. Our cache is not so simple. At any given moment, some * subset of the blocks in the cache are un-evictable because we * have handed out a reference to them. Blocks are only evictable * when there are no external references active. This makes * eviction far more problematic: we choose to evict the evictable * blocks that are the "lowest" in the list. * * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we * implement a "cache throttle" that slows the flow of new data * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * * 3. The Megiddo and Modha model assumes a fixed page size. All * elements of the cache are therefore exactly the same size. So * when adjusting the cache size following a cache miss, its simply * a matter of choosing a single page to evict. In our model, we * have variable sized cache blocks (rangeing from 512 bytes to * 128K bytes). We therefore choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. * * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" * by N. Megiddo & D. Modha, FAST 2003 */ /* * The locking model: * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal arc algorithms for * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the * arc list locks. * * Buffers do not have their own mutexes, rather they rely on the * hash table mutexes for the bulk of their protection (i.e. most * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns * NULL for the mutex if the buffer was not in the table. * * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * * Each arc state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to * obtain a hash table lock while holding an arc list lock you * must use: mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * * Arc buffers may have an associated eviction callback function. * This function will be invoked prior to removing the buffer (e.g. * in arc_do_user_evicts()). Note however that the data associated * with the buffer may be evicted prior to the callback. The callback * must be made with *no locks held* (to prevent deadlock). Additionally, * the users of callbacks must ensure that their private data is * protected from simultaneous callbacks from arc_clear_callback() * and arc_do_user_evicts(). * * Note that the majority of the performance stats are manipulated * with atomic operations. * * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction * - L2ARC write completion, which walks L2ARC buflists * - ARC header destruction, as it removes from L2ARC buflists * - ARC header release, as it removes from L2ARC buflists */ /* * ARC operation: * * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. * This structure can point either to a block that is still in the cache or to * one that is only accessible in an L2 ARC device, or it can provide * information about a block that was recently evicted. If a block is * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough * information to retrieve it from the L2ARC device. This information is * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block * that is in this state cannot access the data directly. * * Blocks that are actively being referenced or have not been evicted * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within * the arc_buf_hdr_t that will point to the data block in memory. A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC * caches data in two ways -- in a list of arc buffers (arc_buf_t) and * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata). * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC * consumer, and always contains uncompressed data. The ARC will provide * references to this data and will keep it cached until it is no longer in * use. Typically, the arc will try to cache only the L1ARC's physical data * block and will aggressively evict any arc_buf_t that is no longer referenced. * The amount of memory consumed by the arc_buf_t's can be seen via the * "overhead_size" kstat. * * * arc_buf_hdr_t * +-----------+ * | | * | | * | | * +-----------+ * l2arc_buf_hdr_t| | * | | * +-----------+ * l1arc_buf_hdr_t| | * | | arc_buf_t * | b_buf +------------>+---------+ arc_buf_t * | | |b_next +---->+---------+ * | b_pdata +-+ |---------| |b_next +-->NULL * +-----------+ | | | +---------+ * | |b_data +-+ | | * | +---------+ | |b_data +-+ * +->+------+ | +---------+ | * (potentially) | | | | * compressed | | | | * data +------+ | v * +->+------+ +------+ * uncompressed | | | | * data | | | | * +------+ +------+ * * The L1ARC's data pointer, however, may or may not be uncompressed. The * ARC has the ability to store the physical data (b_pdata) associated with * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk * physical block, it will match its on-disk compression characteristics. * If the block on-disk is compressed, then the physical data block * in the cache will also be compressed and vice-versa. This behavior * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the * compressed ARC functionality is disabled, the b_pdata will point to an * uncompressed version of the on-disk data. * * When a consumer reads a block, the ARC must first look to see if the * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t, * then an additional arc_buf_t is allocated and the uncompressed data is * bcopied from the existing arc_buf_t. If the hdr is cached but does not * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's * b_pdata is not compressed, then the block is shared with the newly * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t * in the arc buffer chain. Sharing the block reduces the memory overhead * required when the hdr is caching uncompressed blocks or the compressed * arc functionality has been disabled via 'zfs_compressed_arc_enabled'. * * The diagram below shows an example of an uncompressed ARC hdr that is * sharing its data with an arc_buf_t: * * arc_buf_hdr_t * +-----------+ * | | * | | * | | * +-----------+ * l2arc_buf_hdr_t| | * | | * +-----------+ * l1arc_buf_hdr_t| | * | | arc_buf_t (shared) * | b_buf +------------>+---------+ arc_buf_t * | | |b_next +---->+---------+ * | b_pdata +-+ |---------| |b_next +-->NULL * +-----------+ | | | +---------+ * | |b_data +-+ | | * | +---------+ | |b_data +-+ * +->+------+ | +---------+ | * | | | | * uncompressed | | | | * data +------+ | | * ^ +->+------+ | * | uncompressed | | | * | data | | | * | +------+ | * +---------------------------------+ * * Writing to the arc requires that the ARC first discard the b_pdata * since the physical block is about to be rewritten. The new data contents * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline * performs the write, it may compress the data before writing it to disk. * The ARC will be called with the transformed data and will bcopy the * transformed on-disk block into a newly allocated b_pdata. * * When the L2ARC is in use, it will also take advantage of the b_pdata. The * L2ARC will always write the contents of b_pdata to the L2ARC. This means * that when compressed arc is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the * L2ARC to determine if the contents are valid. However, if the compressed * arc is disabled, then the L2ARC's block must be transformed to look * like the physical block in the main data pool before comparing the * checksum and determining its validity. */ #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif #include #include #include #include #include #include #ifdef illumos #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ boolean_t arc_watch = B_FALSE; int arc_procfd; #endif #endif /* illumos */ static kmutex_t arc_reclaim_lock; static kcondvar_t arc_reclaim_thread_cv; static boolean_t arc_reclaim_thread_exit; static kcondvar_t arc_reclaim_waiters_cv; static kmutex_t arc_dnlc_evicts_lock; static kcondvar_t arc_dnlc_evicts_cv; static boolean_t arc_dnlc_evicts_thread_exit; uint_t arc_reduce_dnlc_percent = 3; /* * The number of headers to evict in arc_evict_state_impl() before * dropping the sublist lock and evicting from another sublist. A lower * value means we're more likely to evict the "correct" header (i.e. the * oldest header in the arc state), but comes with higher overhead * (i.e. more invocations of arc_evict_state_impl()). */ int zfs_arc_evict_batch_limit = 10; /* * The number of sublists used for each of the arc state lists. If this * is not set to a suitable value by the user, it will be configured to * the number of CPUs on the system in arc_init(). */ int zfs_arc_num_sublists_per_state = 0; /* number of seconds before growing cache again */ static int arc_grow_retry = 60; /* shift of arc_c for calculating overflow limit in arc_get_data_buf */ int zfs_arc_overflow_shift = 8; /* shift of arc_c for calculating both min and max arc_p */ static int arc_p_min_shift = 4; /* log2(fraction of arc to reclaim) */ static int arc_shrink_shift = 7; /* * log2(fraction of ARC which must be free to allow growing). * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, * when reading a new block into the ARC, we will evict an equal-sized block * from the ARC. * * This must be less than arc_shrink_shift, so that when we shrink the ARC, * we will still not allow it to grow. */ int arc_no_grow_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ static int arc_min_prefetch_lifespan; /* * If this percent of memory is free, don't throttle. */ int arc_lotsfree_percent = 10; static int arc_dead; extern boolean_t zfs_prefetch_disable; /* * The arc has filled available memory and has now warmed up. */ static boolean_t arc_warm; /* * These tunables are for performance analysis. */ uint64_t zfs_arc_max; uint64_t zfs_arc_min; uint64_t zfs_arc_meta_limit = 0; uint64_t zfs_arc_meta_min = 0; int zfs_arc_grow_retry = 0; int zfs_arc_shrink_shift = 0; int zfs_arc_p_min_shift = 0; uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ u_int zfs_arc_free_target = 0; /* Absolute min for arc min / max is 16MB. */ static uint64_t arc_abs_min = 16 << 20; boolean_t zfs_compressed_arc_enabled = B_TRUE; static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS); static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS); static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS); static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS); #if defined(__FreeBSD__) && defined(_KERNEL) static void arc_free_target_init(void *unused __unused) { zfs_arc_free_target = vm_pageout_wakeup_thresh; } SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, arc_free_target_init, NULL); TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min); TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift); SYSCTL_DECL(_vfs_zfs); SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN, 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size"); SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN, 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN, &zfs_arc_average_blocksize, 0, "ARC average blocksize"); SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW, &arc_shrink_shift, 0, "log2(fraction of arc to reclaim)"); SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN, &zfs_compressed_arc_enabled, 0, "Enable compressed ARC"); /* * We don't have a tunable for arc_free_target due to the dependency on * pagedaemon initialisation. */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int), sysctl_vfs_zfs_arc_free_target, "IU", "Desired number of free pages below which ARC triggers reclaim"); static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS) { u_int val; int err; val = zfs_arc_free_target; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < minfree) return (EINVAL); if (val > vm_cnt.v_page_count) return (EINVAL); zfs_arc_free_target = val; return (0); } /* * Must be declared here, before the definition of corresponding kstat * macro which uses the same names will confuse the compiler. */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_meta_limit, "QU", "ARC metadata limit"); #endif /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) * ARC_mru - recently used, currently cached * ARC_mru_ghost - recentely used, no longer in cache * ARC_mfu - frequently used, currently cached * ARC_mfu_ghost - frequently used, no longer in cache * ARC_l2c_only - exists in L2ARC but not other states * When there are no active references to the buffer, they are * are linked onto a list in one of these arc states. These are * the only buffers that can be evicted or deleted. Within each * state there are multiple lists, one for meta-data and one for * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, * etc.) is tracked separately so that it can be managed more * explicitly: favored over data, limited explicitly. * * Anonymous buffers are buffers that are not associated with * a DVA. These are buffers that hold dirty block copies * before they are written to stable storage. By definition, * they are "ref'd" and are considered part of arc_mru * that cannot be freed. Generally, they will aquire a DVA * as they are written and migrate onto the arc_mru list. * * The ARC_l2c_only state is for buffers that are in the second * level ARC but no longer in any of the ARC_m* lists. The second * level ARC itself may also contain buffers that are in any of * the ARC_m* states - meaning that a buffer can exist in two * places. The reason for the ARC_l2c_only state is to keep the * buffer header in the hash table, so that reads that hit the * second level ARC benefit from these fast lookups. */ typedef struct arc_state { /* * list of evictable buffers */ multilist_t arcs_list[ARC_BUFC_NUMTYPES]; /* * total amount of evictable data in this state */ refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; /* * total amount of data in this state; this includes: evictable, * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. */ refcount_t arcs_size; } arc_state_t; /* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; static arc_state_t ARC_mru_ghost; static arc_state_t ARC_mfu; static arc_state_t ARC_mfu_ghost; static arc_state_t ARC_l2c_only; typedef struct arc_stats { kstat_named_t arcstat_hits; kstat_named_t arcstat_misses; kstat_named_t arcstat_demand_data_hits; kstat_named_t arcstat_demand_data_misses; kstat_named_t arcstat_demand_metadata_hits; kstat_named_t arcstat_demand_metadata_misses; kstat_named_t arcstat_prefetch_data_hits; kstat_named_t arcstat_prefetch_data_misses; kstat_named_t arcstat_prefetch_metadata_hits; kstat_named_t arcstat_prefetch_metadata_misses; kstat_named_t arcstat_mru_hits; kstat_named_t arcstat_mru_ghost_hits; kstat_named_t arcstat_mfu_hits; kstat_named_t arcstat_mfu_ghost_hits; kstat_named_t arcstat_allocated; kstat_named_t arcstat_deleted; /* * Number of buffers that could not be evicted because the hash lock * was held by another thread. The lock may not necessarily be held * by something using the same buffer, since hash locks are shared * by multiple buffers. */ kstat_named_t arcstat_mutex_miss; /* * Number of buffers skipped because they have I/O in progress, are * indrect prefetch buffers that have not lived long enough, or are * not from the spa we're trying to evict from. */ kstat_named_t arcstat_evict_skip; /* * Number of times arc_evict_state() was unable to evict enough * buffers to reach it's target amount. */ kstat_named_t arcstat_evict_not_enough; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; kstat_named_t arcstat_evict_l2_ineligible; kstat_named_t arcstat_evict_l2_skip; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; kstat_named_t arcstat_hash_chains; kstat_named_t arcstat_hash_chain_max; kstat_named_t arcstat_p; kstat_named_t arcstat_c; kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; /* * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata. * Note that the compressed bytes may match the uncompressed bytes * if the block is either not compressed or compressed arc is disabled. */ kstat_named_t arcstat_compressed_size; /* * Uncompressed size of the data stored in b_pdata. If compressed * arc is disabled then this value will be identical to the stat * above. */ kstat_named_t arcstat_uncompressed_size; /* * Number of bytes stored in all the arc_buf_t's. This is classified * as "overhead" since this data is typically short-lived and will * be evicted from the arc when it becomes unreferenced unless the * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level * values have been set (see comment in dbuf.c for more information). */ kstat_named_t arcstat_overhead_size; /* * Number of bytes consumed by internal ARC structures necessary * for tracking purposes; these structures are not actually * backed by ARC buffers. This includes arc_buf_hdr_t structures * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only * caches), and arc_buf_t structures (allocated via arc_buf_t * cache). */ kstat_named_t arcstat_hdr_size; /* * Number of bytes consumed by ARC buffers of type equal to * ARC_BUFC_DATA. This is generally consumed by buffers backing * on disk user data (e.g. plain file contents). */ kstat_named_t arcstat_data_size; /* * Number of bytes consumed by ARC buffers of type equal to * ARC_BUFC_METADATA. This is generally consumed by buffers * backing on disk data that is used for internal ZFS * structures (e.g. ZAP, dnode, indirect blocks, etc). */ kstat_named_t arcstat_metadata_size; /* * Number of bytes consumed by various buffers and structures * not actually backed with ARC buffers. This includes bonus * buffers (allocated directly via zio_buf_* functions), * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t * cache), and dnode_t structures (allocated via dnode_t cache). */ kstat_named_t arcstat_other_size; /* * Total number of bytes consumed by ARC buffers residing in the * arc_anon state. This includes *all* buffers in the arc_anon * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_anon_size; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, * residing in the arc_anon state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_anon_evictable_data; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_METADATA, * residing in the arc_anon state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_anon_evictable_metadata; /* * Total number of bytes consumed by ARC buffers residing in the * arc_mru state. This includes *all* buffers in the arc_mru * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_mru_size; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_DATA, * residing in the arc_mru state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_mru_evictable_data; /* * Number of bytes consumed by ARC buffers that meet the * following criteria: backing buffers of type ARC_BUFC_METADATA, * residing in the arc_mru state, and are eligible for eviction * (e.g. have no outstanding holds on the buffer). */ kstat_named_t arcstat_mru_evictable_metadata; /* * Total number of bytes that *would have been* consumed by ARC * buffers in the arc_mru_ghost state. The key thing to note * here, is the fact that this size doesn't actually indicate * RAM consumption. The ghost lists only consist of headers and * don't actually have ARC buffers linked off of these headers. * Thus, *if* the headers had associated ARC buffers, these * buffers *would have* consumed this number of bytes. */ kstat_named_t arcstat_mru_ghost_size; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mru_ghost_evictable_data; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mru_ghost_evictable_metadata; /* * Total number of bytes consumed by ARC buffers residing in the * arc_mfu state. This includes *all* buffers in the arc_mfu * state; e.g. data, metadata, evictable, and unevictable buffers * are all included in this value. */ kstat_named_t arcstat_mfu_size; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu * state. */ kstat_named_t arcstat_mfu_evictable_data; /* * Number of bytes consumed by ARC buffers that are eligible for * eviction, of type ARC_BUFC_METADATA, and reside in the * arc_mfu state. */ kstat_named_t arcstat_mfu_evictable_metadata; /* * Total number of bytes that *would have been* consumed by ARC * buffers in the arc_mfu_ghost state. See the comment above * arcstat_mru_ghost_size for more details. */ kstat_named_t arcstat_mfu_ghost_size; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. */ kstat_named_t arcstat_mfu_ghost_evictable_data; /* * Number of bytes that *would have been* consumed by ARC * buffers that are eligible for eviction, of type * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. */ kstat_named_t arcstat_mfu_ghost_evictable_metadata; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; kstat_named_t arcstat_l2_feeds; kstat_named_t arcstat_l2_rw_clash; kstat_named_t arcstat_l2_read_bytes; kstat_named_t arcstat_l2_write_bytes; kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; kstat_named_t arcstat_l2_writes_lock_retry; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; kstat_named_t arcstat_l2_io_error; kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_asize; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_l2_padding_needed; kstat_named_t arcstat_l2_write_trylock_fail; kstat_named_t arcstat_l2_write_passed_headroom; kstat_named_t arcstat_l2_write_spa_mismatch; kstat_named_t arcstat_l2_write_in_l2; kstat_named_t arcstat_l2_write_hdr_io_in_progress; kstat_named_t arcstat_l2_write_not_cacheable; kstat_named_t arcstat_l2_write_full; kstat_named_t arcstat_l2_write_buffer_iter; kstat_named_t arcstat_l2_write_pios; kstat_named_t arcstat_l2_write_buffer_bytes_scanned; kstat_named_t arcstat_l2_write_buffer_list_iter; kstat_named_t arcstat_l2_write_buffer_list_null_iter; kstat_named_t arcstat_memory_throttle_count; kstat_named_t arcstat_meta_used; kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_meta_max; kstat_named_t arcstat_meta_min; kstat_named_t arcstat_sync_wait_for_async; kstat_named_t arcstat_demand_hit_predictive_prefetch; } arc_stats_t; static arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "allocated", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "p", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "compressed_size", KSTAT_DATA_UINT64 }, { "uncompressed_size", KSTAT_DATA_UINT64 }, { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, { "other_size", KSTAT_DATA_UINT64 }, { "anon_size", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_size", KSTAT_DATA_UINT64 }, { "mru_evictable_data", KSTAT_DATA_UINT64 }, { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_size", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_size", KSTAT_DATA_UINT64 }, { "mfu_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_size", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_padding_needed", KSTAT_DATA_UINT64 }, { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, { "l2_write_in_l2", KSTAT_DATA_UINT64 }, { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, { "l2_write_full", KSTAT_DATA_UINT64 }, { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, { "l2_write_pios", KSTAT_DATA_UINT64 }, { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, { "arc_meta_min", KSTAT_DATA_UINT64 }, { "sync_wait_for_async", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) #define ARCSTAT_INCR(stat, val) \ atomic_add_64(&arc_stats.stat.value.ui64, (val)) #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && \ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ continue; \ } #define ARCSTAT_MAXSTAT(stat) \ ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) /* * We define a macro to allow ARC hits/misses to be easily broken down by * two separate conditions, giving a total of four different subtypes for * each of hits and misses (so eight statistics total). */ #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ if (cond1) { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ } \ } else { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ } \ } kstat_t *arc_ksp; static arc_state_t *arc_anon; static arc_state_t *arc_mru; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu; static arc_state_t *arc_mfu_ghost; static arc_state_t *arc_l2c_only; /* * There are several ARC variables that are critical to export as kstats -- * but we don't want to have to grovel around in the kstat whenever we wish to * manipulate them. For these variables, we therefore define them to be in * terms of the statistic variable. This assures that we are not introducing * the possibility of inconsistency by having shadow copies of the variables, * while still allowing the code to be readable. */ #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ #define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ /* compressed size of entire arc */ #define arc_compressed_size ARCSTAT(arcstat_compressed_size) /* uncompressed size of entire arc */ #define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) /* number of bytes in the arc from arc_buf_t's */ #define arc_overhead_size ARCSTAT(arcstat_overhead_size) static int arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; static uint64_t arc_loaned_bytes; typedef struct arc_callback arc_callback_t; struct arc_callback { void *acb_private; arc_done_func_t *acb_done; arc_buf_t *acb_buf; zio_t *acb_zio_dummy; arc_callback_t *acb_next; }; typedef struct arc_write_callback arc_write_callback_t; struct arc_write_callback { void *awcb_private; arc_done_func_t *awcb_ready; arc_done_func_t *awcb_children_ready; arc_done_func_t *awcb_physdone; arc_done_func_t *awcb_done; arc_buf_t *awcb_buf; }; /* * ARC buffers are separated into multiple structs as a memory saving measure: * - Common fields struct, always defined, and embedded within it: * - L2-only fields, always allocated but undefined when not in L2ARC * - L1-only fields, only allocated when in L1ARC * * Buffer in L1 Buffer only in L2 * +------------------------+ +------------------------+ * | arc_buf_hdr_t | | arc_buf_hdr_t | * | | | | * | | | | * | | | | * +------------------------+ +------------------------+ * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | * | (undefined if L1-only) | | | * +------------------------+ +------------------------+ * | l1arc_buf_hdr_t | * | | * | | * | | * | | * +------------------------+ * * Because it's possible for the L2ARC to become extremely large, we can wind * up eating a lot of memory in L2ARC buffer headers, so the size of a header * is minimized by only allocating the fields necessary for an L1-cached buffer * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple * words in pointers. arc_hdr_realloc() is used to switch a header between * these two allocation states. */ typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; zio_cksum_t *b_freeze_cksum; #ifdef ZFS_DEBUG /* * used for debugging wtih kmem_flags - by allocating and freeing * b_thawed when the buffer is thawed, we get a record of the stack * trace that thawed it. */ void *b_thawed; #endif arc_buf_t *b_buf; uint32_t b_bufcnt; /* for waiting on writes to complete */ kcondvar_t b_cv; uint8_t b_byteswap; /* protected by arc state mutex */ arc_state_t *b_state; multilist_node_t b_arc_node; /* updated atomically */ clock_t b_arc_access; /* self protecting */ refcount_t b_refcnt; arc_callback_t *b_acb; void *b_pdata; } l1arc_buf_hdr_t; typedef struct l2arc_dev l2arc_dev_t; typedef struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ uint64_t b_daddr; /* disk address, offset byte */ list_node_t b_l2node; } l2arc_buf_hdr_t; struct arc_buf_hdr { /* protected by hash lock */ dva_t b_dva; uint64_t b_birth; arc_buf_contents_t b_type; arc_buf_hdr_t *b_hash_next; arc_flags_t b_flags; /* * This field stores the size of the data buffer after * compression, and is set in the arc's zio completion handlers. * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). * * While the block pointers can store up to 32MB in their psize * field, we can only store up to 32MB minus 512B. This is due * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. * a field of zeros represents 512B in the bp). We can't use a * bias of 1 since we need to reserve a psize of zero, here, to * represent holes and embedded blocks. * * This isn't a problem in practice, since the maximum size of a * buffer is limited to 16MB, so we never need to store 32MB in * this field. Even in the upstream illumos code base, the * maximum size of a buffer is limited to 16MB. */ uint16_t b_psize; /* * This field stores the size of the data buffer before * compression, and cannot change once set. It is in units * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) */ uint16_t b_lsize; /* immutable */ uint64_t b_spa; /* immutable */ /* L2ARC fields. Undefined when not in L2ARC. */ l2arc_buf_hdr_t b_l2hdr; /* L1ARC fields. Undefined when in l2arc_only state */ l1arc_buf_hdr_t b_l1hdr; }; #if defined(__FreeBSD__) && defined(_KERNEL) static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; val = arc_meta_limit; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val <= 0 || val > arc_c_max) return (EINVAL); arc_meta_limit = val; return (0); } static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; val = zfs_arc_max; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (zfs_arc_max == 0) { /* Loader tunable so blindly set */ zfs_arc_max = val; return (0); } if (val < arc_abs_min || val > kmem_size()) return (EINVAL); if (val < arc_c_min) return (EINVAL); if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit) return (EINVAL); arc_c_max = val; arc_c = arc_c_max; arc_p = (arc_c >> 1); if (zfs_arc_meta_limit == 0) { /* limit meta-data to 1/4 of the arc capacity */ arc_meta_limit = arc_c_max / 4; } /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; zfs_arc_max = arc_c; return (0); } static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; val = zfs_arc_min; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (zfs_arc_min == 0) { /* Loader tunable so blindly set */ zfs_arc_min = val; return (0); } if (val < arc_abs_min || val > arc_c_max) return (EINVAL); arc_c_min = val; if (zfs_arc_meta_min == 0) arc_meta_min = arc_c_min / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; zfs_arc_min = arc_c_min; return (0); } #endif #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) #define HDR_COMPRESSION_ENABLED(hdr) \ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) #define HDR_L2_READING(hdr) \ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) #define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) /* For storing compression mode in b_flags */ #define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) #define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) /* * Other sizes */ #define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines */ #define HT_LOCK_PAD CACHE_LINE_SIZE struct ht_lock { kmutex_t ht_lock; #ifdef _KERNEL unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; #endif }; #define BUF_LOCKS 256 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; struct ht_lock ht_locks[BUF_LOCKS] __aligned(CACHE_LINE_SIZE); } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; /* * Level 2 ARC */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ #define L2ARC_HEADROOM 2 /* num of writes */ /* * If we discover during ARC scan any buffers to be compressed, we boost * our headroom for the next scanning cycle by this percentage multiple. */ #define L2ARC_HEADROOM_BOOST 200 #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) /* L2ARC Performance Tunables */ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, &l2arc_write_max, 0, "max write size"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, &l2arc_write_boost, 0, "extra write during warmup"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, &l2arc_headroom, 0, "number of dev writes"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, &l2arc_feed_secs, 0, "interval seconds"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, &l2arc_feed_min_ms, 0, "min interval milliseconds"); SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, &l2arc_noprefetch, 0, "don't cache prefetch bufs"); SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, &l2arc_feed_again, 0, "turbo warmup"); SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, &l2arc_norw, 0, "no reads during writes"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of anonymous state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of metadata in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of data in mru state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of metadata in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of data in mru ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of metadata in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of data in mfu state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, "size of metadata in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, "size of data in mfu ghost state"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state"); /* * L2ARC Internals */ struct l2arc_dev { vdev_t *l2ad_vdev; /* vdev */ spa_t *l2ad_spa; /* spa */ uint64_t l2ad_hand; /* next write location */ uint64_t l2ad_start; /* first addr on device */ uint64_t l2ad_end; /* last addr on device */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ kmutex_t l2ad_mtx; /* lock for buffer list */ list_t l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ refcount_t l2ad_alloc; /* allocated bytes */ }; static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { arc_buf_hdr_t *l2rcb_hdr; /* read buffer */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ void *l2rcb_data; /* temporary buffer */ } l2arc_read_callback_t; typedef struct l2arc_write_callback { l2arc_dev_t *l2wcb_dev; /* device info */ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ } l2arc_write_callback_t; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ void *l2df_data; size_t l2df_size; arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr); static void arc_hdr_alloc_pdata(arc_buf_hdr_t *); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(); static void arc_buf_watch(arc_buf_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); static void l2arc_trim(const arc_buf_hdr_t *hdr) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; ASSERT(HDR_HAS_L2HDR(hdr)); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); if (HDR_GET_PSIZE(hdr) != 0) { trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, HDR_GET_PSIZE(hdr), 0); } } static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { uint8_t *vdva = (uint8_t *)dva; uint64_t crc = -1ULL; int i; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); for (i = 0; i < sizeof (dva_t); i++) crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; crc ^= (spa>>8) ^ birth; return (crc); } #define HDR_EMPTY(hdr) \ ((hdr)->b_dva.dva_word[0] == 0 && \ (hdr)->b_dva.dva_word[1] == 0) #define HDR_EQUAL(spa, dva, birth, hdr) \ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) { hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; } static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); uint64_t birth = BP_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *hdr; mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } } mutex_exit(hash_lock); *lockp = NULL; return (NULL); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. * If lockp == NULL, the caller is assumed to already hold the hash lock. */ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *fhdr; uint32_t i; ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); ASSERT(hdr->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (lockp != NULL) { *lockp = hash_lock; mutex_enter(hash_lock); } else { ASSERT(MUTEX_HELD(hash_lock)); } for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next = buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { ARCSTAT_BUMP(arcstat_hash_collisions); if (i == 1) ARCSTAT_BUMP(arcstat_hash_chains); ARCSTAT_MAX(arcstat_hash_chain_max, i); } ARCSTAT_BUMP(arcstat_hash_elements); ARCSTAT_MAXSTAT(arcstat_hash_elements); return (NULL); } static void buf_hash_remove(arc_buf_hdr_t *hdr) { arc_buf_hdr_t *fhdr, **hdrp; uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { ASSERT3P(fhdr, !=, NULL); hdrp = &fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); if (buf_hash_table.ht_table[idx] && buf_hash_table.ht_table[idx]->b_hash_next == NULL) ARCSTAT_BUMPDOWN(arcstat_hash_chains); } /* * Global data structures and functions for the buf kmem cache. */ static kmem_cache_t *hdr_full_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void buf_fini(void) { int i; kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); kmem_cache_destroy(hdr_full_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } /* * Constructor callback - called when the cache is empty * and a new buf is requested. */ /* ARGSUSED */ static int hdr_full_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; bzero(hdr, HDR_FULL_SIZE); cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); refcount_create(&hdr->b_l1hdr.b_refcnt); mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); multilist_link_init(&hdr->b_l1hdr.b_arc_node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); } /* ARGSUSED */ static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; bzero(hdr, HDR_L2ONLY_SIZE); arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } /* ARGSUSED */ static int buf_cons(void *vbuf, void *unused, int kmflag) { arc_buf_t *buf = vbuf; bzero(buf, sizeof (arc_buf_t)); mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); } /* * Destructor callback - called when a cached buf is * no longer required. */ /* ARGSUSED */ static void hdr_full_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); cv_destroy(&hdr->b_l1hdr.b_cv); refcount_destroy(&hdr->b_l1hdr.b_refcnt); mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } /* ARGSUSED */ static void hdr_l2only_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } /* ARGSUSED */ static void buf_dest(void *vbuf, void *unused) { arc_buf_t *buf = vbuf; mutex_destroy(&buf->b_evict_lock); arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } /* * Reclaim callback -- invoked when memory is low. */ /* ARGSUSED */ static void hdr_recl(void *unused) { dprintf("hdr_recl called\n"); /* * umem calls the reclaim func when we destroy the buf cache, * which is after we do arc_fini(). */ if (!arc_dead) cv_signal(&arc_reclaim_thread_cv); } static void buf_init(void) { uint64_t *ct; uint64_t hsize = 1ULL << 12; int i, j; /* * The hash table is big enough to fill all of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); if (buf_hash_table.ht_table == NULL) { ASSERT(hsize > (1ULL << 8)); hsize >>= 1; goto retry; } hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); for (i = 0; i < BUF_LOCKS; i++) { mutex_init(&buf_hash_table.ht_locks[i].ht_lock, NULL, MUTEX_DEFAULT, NULL); } } #define ARC_MINTIME (hz>>4) /* 62 ms */ static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { boolean_t shared = (buf->b_data != NULL && buf->b_data == buf->b_hdr->b_l1hdr.b_pdata); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); return (shared); } static inline void arc_cksum_free(arc_buf_hdr_t *hdr) { ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_l1hdr.b_freeze_cksum = NULL; } mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } static void arc_cksum_verify(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, &zc); if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } static boolean_t arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); boolean_t valid_cksum; ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); /* * We rely on the blkptr's checksum to determine if the block * is valid or not. When compressed arc is enabled, the l2arc * writes the block to the l2arc just as it appears in the pool. * This allows us to use the blkptr's checksum to validate the * data that we just read off of the l2arc without having to store * a separate checksum in the arc_buf_hdr_t. However, if compressed * arc is disabled, then the data written to the l2arc is always * uncompressed and won't match the block as it exists in the main * pool. When this is the case, we must first compress it if it is * compressed on the main pool before we can validate the checksum. */ if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t csize; void *cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr)); csize = zio_compress_data(compress, zio->io_data, cbuf, lsize); ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); if (csize < HDR_GET_PSIZE(hdr)) { /* * Compressed blocks are always a multiple of the * smallest ashift in the pool. Ideally, we would * like to round up the csize to the next * spa_min_ashift but that value may have changed * since the block was last written. Instead, * we rely on the fact that the hdr's psize * was set to the psize of the block when it was * last written. We set the csize to that value * and zero out any part that should not contain * data. */ bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize); csize = HDR_GET_PSIZE(hdr); } zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL); } /* * Block pointers always store the checksum for the logical data. * If the block pointer has the gang bit set, then the checksum * it represents is for the reconstituted data and not for an * individual gang member. The zio pipeline, however, must be able to * determine the checksum of each of the gang constituents so it * treats the checksum comparison differently than what we need * for l2arc blocks. This prevents us from using the * zio_checksum_error() interface directly. Instead we must call the * zio_checksum_error_impl() so that we can ensure the checksum is * generated using the correct checksum algorithm and accounts for the * logical I/O size and not just a gang fragment. */ valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size, zio->io_offset, NULL) == 0); zio_pop_transforms(zio); return (valid_cksum); } static void arc_cksum_compute(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, hdr->b_l1hdr.b_freeze_cksum); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #ifdef illumos arc_buf_watch(buf); #endif } #ifdef illumos #ifndef _KERNEL typedef struct procctl { long cmd; prwatch_t prwatch; } procctl_t; #endif /* ARGSUSED */ static void arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { int result; procctl_t ctl; ctl.cmd = PCWATCH; ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; ctl.prwatch.pr_size = 0; ctl.prwatch.pr_wflags = 0; result = write(arc_procfd, &ctl, sizeof (ctl)); ASSERT3U(result, ==, sizeof (ctl)); } #endif } /* ARGSUSED */ static void arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { int result; procctl_t ctl; ctl.cmd = PCWATCH; ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; ctl.prwatch.pr_size = HDR_GET_LSIZE(buf->b_hdr); ctl.prwatch.pr_wflags = WA_WRITE; result = write(arc_procfd, &ctl, sizeof (ctl)); ASSERT3U(result, ==, sizeof (ctl)); } #endif } #endif /* illumos */ static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { arc_buf_contents_t type; if (HDR_ISTYPE_METADATA(hdr)) { type = ARC_BUFC_METADATA; } else { type = ARC_BUFC_DATA; } VERIFY3U(hdr->b_type, ==, type); return (type); } static uint32_t arc_bufc_to_flags(arc_buf_contents_t type) { switch (type) { case ARC_BUFC_DATA: /* metadata field is 0 if buffer contains normal data */ return (0); case ARC_BUFC_METADATA: return (ARC_FLAG_BUFC_METADATA); default: break; } panic("undefined ARC buffer type!"); return ((uint32_t)-1); } void arc_buf_thaw(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; if (zfs_flags & ZFS_DEBUG_MODIFY) { if (hdr->b_l1hdr.b_state != arc_anon) panic("modifying non-anon buffer!"); if (HDR_IO_IN_PROGRESS(hdr)) panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } ASSERT(HDR_HAS_L1HDR(hdr)); arc_cksum_free(hdr); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); #ifdef ZFS_DEBUG if (zfs_flags & ZFS_DEBUG_MODIFY) { if (hdr->b_l1hdr.b_thawed != NULL) kmem_free(hdr->b_l1hdr.b_thawed, 1); hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); } #endif mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #ifdef illumos arc_buf_unwatch(buf); #endif } void arc_buf_freeze(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL || hdr->b_l1hdr.b_state == arc_anon); arc_cksum_compute(buf); mutex_exit(hash_lock); } /* * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, * the following functions should be used to ensure that the flags are * updated in a thread-safe way. When manipulating the flags either * the hash_lock must be held or the hdr must be undiscoverable. This * ensures that we're not racing with any other threads when updating * the flags. */ static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); hdr->b_flags |= flags; } static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); hdr->b_flags &= ~flags; } /* * Setting the compression bits in the arc_buf_hdr_t's b_flags is * done in a special way since we have to clear and set bits * at the same time. Consumers that wish to set the compression bits * must use this function to ensure that the flags are updated in * thread-safe manner. */ static void arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* * Holes and embedded blocks will always have a psize = 0 so * we ignore the compression of the blkptr and set the * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. * Holes and embedded blocks remain anonymous so we don't * want to uncompress them. Mark them as uncompressed. */ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); } else { arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); HDR_SET_COMPRESS(hdr, cmp); ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); ASSERT(HDR_COMPRESSION_ENABLED(hdr)); } } static int arc_decompress(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; int error; if (arc_buf_is_shared(buf)) { ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); } else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) { /* * The arc_buf_hdr_t is either not compressed or is * associated with an embedded block or a hole in which * case they remain anonymous. */ IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 || HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr)); } else { ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); error = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); if (error != 0) { zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d", hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); return (SET_ERROR(EIO)); } } if (bswap != DMU_BSWAP_NUMFUNCS) { ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); } arc_cksum_compute(buf); return (0); } /* * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t. */ static uint64_t arc_hdr_size(arc_buf_hdr_t *hdr) { uint64_t size; if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && HDR_GET_PSIZE(hdr) > 0) { size = HDR_GET_PSIZE(hdr); } else { ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); size = HDR_GET_LSIZE(hdr); } return (size); } /* * Increment the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); (void) refcount_add_many(&state->arcs_esize[type], lsize, hdr); return; } ASSERT(!GHOST_STATE(state)); if (hdr->b_l1hdr.b_pdata != NULL) { (void) refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) { ASSERT(ARC_BUF_LAST(buf)); continue; } (void) refcount_add_many(&state->arcs_esize[type], lsize, buf); } } /* * Decrement the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); (void) refcount_remove_many(&state->arcs_esize[type], lsize, hdr); return; } ASSERT(!GHOST_STATE(state)); if (hdr->b_l1hdr.b_pdata != NULL) { (void) refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) { ASSERT(ARC_BUF_LAST(buf)); continue; } (void) refcount_remove_many(&state->arcs_esize[type], lsize, buf); } } /* * Add a reference to this hdr indicating that someone is actively * referencing that memory. When the refcount transitions from 0 to 1, * we remove it from the respective arc_state_t list to indicate that * it is not evictable. */ static void add_reference(arc_buf_hdr_t *hdr, void *tag) { ASSERT(HDR_HAS_L1HDR(hdr)); if (!MUTEX_HELD(HDR_LOCK(hdr))) { ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); } arc_state_t *state = hdr->b_l1hdr.b_state; if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && (state != arc_anon)) { /* We don't use the L2-only state list. */ if (state != arc_l2c_only) { multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); arc_evitable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); } } /* * Remove a reference from this hdr. When the reference transitions from * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's * list making it eligible for eviction. */ static int remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { int cnt; arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); /* * arc_l2c_only counts as a ghost state so we don't need to explicitly * check to prevent usage of the arc_l2c_only list. */ if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); arc_evictable_space_increment(hdr, state); } return (cnt); } /* * Move the supplied buffer to the indicated state. The hash lock * for the buffer must be held by the caller. */ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { arc_state_t *old_state; int64_t refcnt; uint32_t bufcnt; boolean_t update_old, update_new; arc_buf_contents_t buftype = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() * in arc_read() when bringing a buffer out of the L2ARC. However, the * L1 hdr doesn't always exist when we change state to arc_anon before * destroying a header, in which case reallocating to add the L1 hdr is * pointless. */ if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); bufcnt = hdr->b_l1hdr.b_bufcnt; update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL); } else { old_state = arc_l2c_only; refcnt = 0; bufcnt = 0; update_old = B_FALSE; } update_new = update_old; ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); ASSERT(old_state != arc_anon || bufcnt <= 1); /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); multilist_remove(&old_state->arcs_list[buftype], hdr); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); update_old = B_TRUE; } arc_evitable_space_decrement(hdr, old_state); } if (new_state != arc_anon && new_state != arc_l2c_only) { /* * An L1 header always exists here, since if we're * moving to some L1-cached state (i.e. not l2c_only or * anonymous), we realloc the header to add an L1hdr * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(&new_state->arcs_list[buftype], hdr); if (GHOST_STATE(new_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); update_new = B_TRUE; } arc_evictable_space_increment(hdr, new_state); } } ASSERT(!HDR_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { ASSERT0(bufcnt); /* * When moving a header to a ghost state, we first * remove all arc buffers. Thus, we'll have a * bufcnt of zero, and no arc buffer to use for * the reference. As a result, we use the arc * header pointer for the reference. */ (void) refcount_add_many(&new_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); } else { uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { ASSERT3U(bufcnt, !=, 0); buffers++; /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (arc_buf_is_shared(buf)) { ASSERT(ARC_BUF_LAST(buf)); continue; } (void) refcount_add_many(&new_state->arcs_size, HDR_GET_LSIZE(hdr), buf); } ASSERT3U(bufcnt, ==, buffers); if (hdr->b_l1hdr.b_pdata != NULL) { (void) refcount_add_many(&new_state->arcs_size, arc_hdr_size(hdr), hdr); } else { ASSERT(GHOST_STATE(old_state)); } } } if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); /* * When moving a header off of a ghost state, * the header will not contain any arc buffers. * We use the arc header pointer for the reference * which is exactly what we did when we put the * header on the ghost state. */ (void) refcount_remove_many(&old_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); } else { uint32_t buffers = 0; /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { ASSERT3P(bufcnt, !=, 0); buffers++; /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (arc_buf_is_shared(buf)) { ASSERT(ARC_BUF_LAST(buf)); continue; } (void) refcount_remove_many( &old_state->arcs_size, HDR_GET_LSIZE(hdr), buf); } ASSERT3U(bufcnt, ==, buffers); ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); (void) refcount_remove_many( &old_state->arcs_size, arc_hdr_size(hdr), hdr); } } if (HDR_HAS_L1HDR(hdr)) hdr->b_l1hdr.b_state = new_state; /* * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. */ ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); } void arc_space_consume(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, space); break; case ARC_SPACE_OTHER: ARCSTAT_INCR(arcstat_other_size, space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, space); break; case ARC_SPACE_L2HDRS: ARCSTAT_INCR(arcstat_l2_hdr_size, space); break; } if (type != ARC_SPACE_DATA) ARCSTAT_INCR(arcstat_meta_used, space); atomic_add_64(&arc_size, space); } void arc_space_return(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, -space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, -space); break; case ARC_SPACE_OTHER: ARCSTAT_INCR(arcstat_other_size, -space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, -space); break; case ARC_SPACE_L2HDRS: ARCSTAT_INCR(arcstat_l2_hdr_size, -space); break; } if (type != ARC_SPACE_DATA) { ASSERT(arc_meta_used >= space); if (arc_meta_max < arc_meta_used) arc_meta_max = arc_meta_used; ARCSTAT_INCR(arcstat_meta_used, -space); } ASSERT(arc_size >= space); atomic_add_64(&arc_size, -space); } /* * Allocate an initial buffer for this hdr, subsequent buffers will * use arc_buf_clone(). */ static arc_buf_t * arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag) { arc_buf_t *buf; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); VERIFY(hdr->b_type == ARC_BUFC_DATA || hdr->b_type == ARC_BUFC_METADATA); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(hdr->b_l1hdr.b_bufcnt); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_next = NULL; add_reference(hdr, tag); /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* * If the hdr's data can be shared (no byteswapping, hdr is * uncompressed, hdr's data is not currently being written to the * L2ARC write) then we share the data buffer and set the appropriate * bit in the hdr's b_flags to indicate the hdr is sharing it's * b_pdata with the arc_buf_t. Otherwise, we allocate a new buffer to * store the buf's data. */ if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && !HDR_L2_WRITING(hdr)) { buf->b_data = hdr->b_l1hdr.b_pdata; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; hdr->b_l1hdr.b_bufcnt += 1; return (buf); } /* * Used when allocating additional buffers. */ static arc_buf_t * arc_buf_clone(arc_buf_t *from) { arc_buf_t *buf; arc_buf_hdr_t *hdr = from->b_hdr; uint64_t size = HDR_GET_LSIZE(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(hdr->b_l1hdr.b_state != arc_anon); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_next = hdr->b_l1hdr.b_buf; hdr->b_l1hdr.b_buf = buf; buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); bcopy(from->b_data, buf->b_data, size); hdr->b_l1hdr.b_bufcnt += 1; ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); return (buf); } static char *arc_onloan_tag = "onloan"; /* * Loan out an anonymous arc buffer. Loaned buffers are not counted as in * flight data by arc_tempreserve_space() until they are "returned". Loaned * buffers must be returned to the arc before they can be used by the DMU or * freed. */ arc_buf_t * arc_loan_buf(spa_t *spa, int size) { arc_buf_t *buf; buf = arc_alloc_buf(spa, size, arc_onloan_tag, ARC_BUFC_DATA); atomic_add_64(&arc_loaned_bytes, size); return (buf); } /* * Return a loaned arc buffer to the arc. */ void arc_return_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); atomic_add_64(&arc_loaned_bytes, -HDR_GET_LSIZE(hdr)); } /* Detach an arc_buf from a dbuf (tag) */ void arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); atomic_add_64(&arc_loaned_bytes, HDR_GET_LSIZE(hdr)); } static void l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type) { l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_data = data; df->l2df_size = size; df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); } static void arc_hdr_free_on_write(arc_buf_hdr_t *hdr) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); uint64_t size = arc_hdr_size(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) refcount_remove_many(&state->arcs_esize[type], size, hdr); } (void) refcount_remove_many(&state->arcs_size, size, hdr); l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type); } /* * Share the arc_buf_t's data with the hdr. Whenever we are sharing the * data buffer, we transfer the refcount ownership to the hdr and update * the appropriate kstats. */ static void arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* * Start sharing the data buffer. We transfer the * refcount ownership to the hdr since it always owns * the refcount whenever an arc_buf_t is shared. */ refcount_transfer_ownership(&state->arcs_size, buf, hdr); hdr->b_l1hdr.b_pdata = buf->b_data; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); /* * Since we've transferred ownership to the hdr we need * to increment its compressed and uncompressed kstats and * decrement the overhead size. */ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, -HDR_GET_LSIZE(hdr)); } static void arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_SHARED_DATA(hdr)); ASSERT(arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* * We are no longer sharing this buffer so we need * to transfer its ownership to the rightful owner. */ refcount_transfer_ownership(&state->arcs_size, hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); hdr->b_l1hdr.b_pdata = NULL; /* * Since the buffer is no longer shared between * the arc buf and the hdr, count it as overhead. */ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } /* * Free up buf->b_data and if 'remove' is set, then pull the * arc_buf_t off of the the arc_buf_hdr_t's list and free it. */ static void arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove) { arc_buf_t **bufp; arc_buf_hdr_t *hdr = buf->b_hdr; uint64_t size = HDR_GET_LSIZE(hdr); boolean_t destroyed_buf_is_shared = arc_buf_is_shared(buf); /* * Free up the data associated with the buf but only * if we're not sharing this with the hdr. If we are sharing * it with the hdr, then hdr will have performed the allocation * so allow it to do the free. */ if (buf->b_data != NULL) { /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); arc_cksum_verify(buf); #ifdef illumos arc_buf_unwatch(buf); #endif if (destroyed_buf_is_shared) { ASSERT(ARC_BUF_LAST(buf)); ASSERT(HDR_SHARED_DATA(hdr)); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); } buf->b_data = NULL; ASSERT(hdr->b_l1hdr.b_bufcnt > 0); hdr->b_l1hdr.b_bufcnt -= 1; } /* only remove the buf if requested */ if (!remove) return; /* remove the buf from the hdr list */ arc_buf_t *lastbuf = NULL; bufp = &hdr->b_l1hdr.b_buf; while (*bufp != NULL) { if (*bufp == buf) *bufp = buf->b_next; /* * If we've removed a buffer in the middle of * the list then update the lastbuf and update * bufp. */ if (*bufp != NULL) { lastbuf = *bufp; bufp = &(*bufp)->b_next; } } buf->b_next = NULL; ASSERT3P(lastbuf, !=, buf); /* * If the current arc_buf_t is sharing its data * buffer with the hdr, then reassign the hdr's * b_pdata to share it with the new buffer at the end * of the list. The shared buffer is always the last one * on the hdr's buffer list. */ if (destroyed_buf_is_shared && lastbuf != NULL) { ASSERT(ARC_BUF_LAST(buf)); ASSERT(ARC_BUF_LAST(lastbuf)); VERIFY(!arc_buf_is_shared(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); arc_hdr_free_pdata(hdr); /* * We must setup a new shared block between the * last buffer and the hdr. The data would have * been allocated by the arc buf so we need to transfer * ownership to the hdr since it's now being shared. */ arc_share_buf(hdr, lastbuf); } else if (HDR_SHARED_DATA(hdr)) { ASSERT(arc_buf_is_shared(lastbuf)); } if (hdr->b_l1hdr.b_bufcnt == 0) arc_cksum_free(hdr); /* clean up the buf */ buf->b_hdr = NULL; kmem_cache_free(buf_cache, buf); } static void arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr) { ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr) { ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); /* * If the hdr is currently being written to the l2arc then * we defer freeing the data by adding it to the l2arc_free_on_write * list. The l2arc will free the data once it's finished * writing it to the l2arc device. */ if (HDR_L2_WRITING(hdr)) { arc_hdr_free_on_write(hdr); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else { arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata, arc_hdr_size(hdr), hdr); } hdr->b_l1hdr.b_pdata = NULL; hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); } static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, enum zio_compress compress, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; ASSERT3U(lsize, >, 0); VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(HDR_EMPTY(hdr)); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL); HDR_SET_PSIZE(hdr, psize); HDR_SET_LSIZE(hdr, lsize); hdr->b_spa = spa; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); arc_hdr_set_compress(hdr, compress); hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_bufcnt = 0; hdr->b_l1hdr.b_buf = NULL; /* * Allocate the hdr's buffer. This will contain either * the compressed or uncompressed data depending on the block * it references and compressed arc enablement. */ arc_hdr_alloc_pdata(hdr); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); } /* * Transition between the two allocation states for the arc_buf_hdr struct. * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller * version is used when a cache buffer is only in the L2ARC in order to reduce * memory usage. */ static arc_buf_hdr_t * arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) { ASSERT(HDR_HAS_L2HDR(hdr)); arc_buf_hdr_t *nhdr; l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); buf_hash_remove(hdr); bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); if (new == hdr_full_cache) { arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* * arc_access and arc_change_state need to be aware that a * header has just come out of L2ARC, so we set its state to * l2c_only even though it's about to change. */ nhdr->b_l1hdr.b_state = arc_l2c_only; /* Verify previous threads set to NULL before freeing */ ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); /* * If we've reached here, We must have been called from * arc_evict_hdr(), as such we should have already been * removed from any ghost list we were previously on * (which protects us from racing with arc_evict_state), * thus no locking is needed during this check. */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); /* * A buffer must not be moved into the arc_l2c_only * state if it's not finished being written out to the * l2arc device. Otherwise, the b_l1hdr.b_pdata field * might try to be accessed, even though it was removed. */ VERIFY(!HDR_L2_WRITING(hdr)); VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL); #ifdef ZFS_DEBUG if (hdr->b_l1hdr.b_thawed != NULL) { kmem_free(hdr->b_l1hdr.b_thawed, 1); hdr->b_l1hdr.b_thawed = NULL; } #endif arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); } /* * The header has been reallocated so we need to re-insert it into any * lists it was on. */ (void) buf_hash_insert(nhdr, NULL); ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); mutex_enter(&dev->l2ad_mtx); /* * We must place the realloc'ed header back into the list at * the same spot. Otherwise, if it's placed earlier in the list, * l2arc_write_buffers() could find it during the function's * write phase, and try to write it out to the l2arc. */ list_insert_after(&dev->l2ad_buflist, hdr, nhdr); list_remove(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* * Since we're using the pointer address as the tag when * incrementing and decrementing the l2ad_alloc refcount, we * must remove the old pointer (that we're about to destroy) and * add the new pointer to the refcount. Otherwise we'd remove * the wrong pointer address when calling arc_hdr_destroy() later. */ (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); (void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); buf_discard_identity(hdr); kmem_cache_free(old, hdr); return (nhdr); } /* * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. * The buf is returned thawed since we expect the consumer to modify it. */ arc_buf_t * arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, ZIO_COMPRESS_OFF, type); ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); arc_buf_t *buf = arc_buf_alloc_impl(hdr, tag); arc_buf_thaw(buf); return (buf); } static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; uint64_t asize = arc_hdr_size(hdr); ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); ASSERT(HDR_HAS_L2HDR(hdr)); list_remove(&dev->l2ad_buflist, hdr); ARCSTAT_INCR(arcstat_l2_asize, -asize); ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); (void) refcount_remove_many(&dev->l2ad_alloc, asize, hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); } static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_buf == NULL || hdr->b_l1hdr.b_bufcnt > 0); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (!HDR_EMPTY(hdr)) buf_discard_identity(hdr); if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); if (!buflist_held) mutex_enter(&dev->l2ad_mtx); /* * Even though we checked this conditional above, we * need to check this again now that we have the * l2ad_mtx. This is because we could be racing with * another thread calling l2arc_evict() which might have * destroyed this header's L2 portion as we were waiting * to acquire the l2ad_mtx. If that happens, we don't * want to re-destroy the header's L2 portion. */ if (HDR_HAS_L2HDR(hdr)) { l2arc_trim(hdr); arc_hdr_l2hdr_destroy(hdr); } if (!buflist_held) mutex_exit(&dev->l2ad_mtx); } if (HDR_HAS_L1HDR(hdr)) { arc_cksum_free(hdr); while (hdr->b_l1hdr.b_buf != NULL) arc_buf_destroy_impl(hdr->b_l1hdr.b_buf, B_TRUE); #ifdef ZFS_DEBUG if (hdr->b_l1hdr.b_thawed != NULL) { kmem_free(hdr->b_l1hdr.b_thawed, 1); hdr->b_l1hdr.b_thawed = NULL; } #endif if (hdr->b_l1hdr.b_pdata != NULL) { arc_hdr_free_pdata(hdr); } } ASSERT3P(hdr->b_hash_next, ==, NULL); if (HDR_HAS_L1HDR(hdr)) { ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); kmem_cache_free(hdr_full_cache, hdr); } else { kmem_cache_free(hdr_l2only_cache, hdr); } } void arc_buf_destroy(arc_buf_t *buf, void* tag) { arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock = HDR_LOCK(hdr); if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); VERIFY0(remove_reference(hdr, NULL, tag)); arc_hdr_destroy(hdr); return; } mutex_enter(hash_lock); ASSERT3P(hdr, ==, buf->b_hdr); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); (void) remove_reference(hdr, hash_lock, tag); arc_buf_destroy_impl(buf, B_TRUE); mutex_exit(hash_lock); } int32_t arc_buf_size(arc_buf_t *buf) { return (HDR_GET_LSIZE(buf->b_hdr)); } /* * Evict the arc_buf_hdr that is provided as a parameter. The resultant - * state of the header is dependent on it's state prior to entering this + * state of the header is dependent on its state prior to entering this * function. The following transitions are possible: * * - arc_mru -> arc_mru_ghost * - arc_mfu -> arc_mfu_ghost * - arc_mru_ghost -> arc_l2c_only * - arc_mru_ghost -> deleted * - arc_mfu_ghost -> arc_l2c_only * - arc_mfu_ghost -> deleted */ static int64_t arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* * l2arc_write_buffers() relies on a header's L1 portion * (i.e. its b_pdata field) during its write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing it's L1 piece) until the header is * done being written to the l2arc. */ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { ARCSTAT_BUMP(arcstat_evict_l2_skip); return (bytes_evicted); } ARCSTAT_BUMP(arcstat_deleted); bytes_evicted += HDR_GET_LSIZE(hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); if (HDR_HAS_L2HDR(hdr)) { ASSERT(hdr->b_l1hdr.b_pdata == NULL); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ arc_change_state(arc_l2c_only, hdr, hash_lock); /* * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header. */ hdr = arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); } else { ASSERT(hdr->b_l1hdr.b_pdata == NULL); arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } return (bytes_evicted); } ASSERT(state == arc_mru || state == arc_mfu); evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(hdr) || ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < arc_min_prefetch_lifespan)) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); while (hdr->b_l1hdr.b_buf) { arc_buf_t *buf = hdr->b_l1hdr.b_buf; if (!mutex_tryenter(&buf->b_evict_lock)) { ARCSTAT_BUMP(arcstat_mutex_miss); break; } if (buf->b_data != NULL) bytes_evicted += HDR_GET_LSIZE(hdr); mutex_exit(&buf->b_evict_lock); arc_buf_destroy_impl(buf, B_TRUE); } if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, HDR_GET_LSIZE(hdr)); } else { ARCSTAT_INCR(arcstat_evict_l2_ineligible, HDR_GET_LSIZE(hdr)); } } if (hdr->b_l1hdr.b_bufcnt == 0) { arc_cksum_free(hdr); bytes_evicted += arc_hdr_size(hdr); /* * If this hdr is being evicted and has a compressed * buffer then we discard it here before we change states. * This ensures that the accounting is updated correctly * in arc_free_data_buf(). */ arc_hdr_free_pdata(hdr); arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); } return (bytes_evicted); } static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, uint64_t spa, int64_t bytes) { multilist_sublist_t *mls; uint64_t bytes_evicted = 0; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; int evict_count = 0; ASSERT3P(marker, !=, NULL); IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); mls = multilist_sublist_lock(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; hdr = multilist_sublist_prev(mls, marker)) { if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || (evict_count >= zfs_arc_evict_batch_limit)) break; /* * To keep our iteration location, move the marker * forward. Since we're not holding hdr's hash lock, we * must be very careful and not remove 'hdr' from the * sublist. Otherwise, other consumers might mistake the * 'hdr' as not being on a sublist when they call the * multilist_link_active() function (they all rely on * the hash lock protecting concurrent insertions and * removals). multilist_sublist_move_forward() was * specifically implemented to ensure this is the case * (only 'marker' will be removed and re-inserted). */ multilist_sublist_move_forward(mls, marker); /* * The only case where the b_spa field should ever be * zero, is the marker headers inserted by * arc_evict_state(). It's possible for multiple threads * to be calling arc_evict_state() concurrently (e.g. * dsl_pool_close() and zio_inject_fault()), so we must * skip any markers we see from these other threads. */ if (hdr->b_spa == 0) continue; /* we're only interested in evicting buffers of a certain spa */ if (spa != 0 && hdr->b_spa != spa) { ARCSTAT_BUMP(arcstat_evict_skip); continue; } hash_lock = HDR_LOCK(hdr); /* * We aren't calling this function from any code path * that would already be holding a hash lock, so we're * asserting on this assumption to be defensive in case * this ever changes. Without this check, it would be * possible to incorrectly increment arcstat_mutex_miss * below (e.g. if the code changed such that we called * this function with a hash lock held). */ ASSERT(!MUTEX_HELD(hash_lock)); if (mutex_tryenter(hash_lock)) { uint64_t evicted = arc_evict_hdr(hdr, hash_lock); mutex_exit(hash_lock); bytes_evicted += evicted; /* * If evicted is zero, arc_evict_hdr() must have * decided to skip this header, don't increment * evict_count in this case. */ if (evicted != 0) evict_count++; /* * If arc_size isn't overflowing, signal any * threads that might happen to be waiting. * * For each header evicted, we wake up a single * thread. If we used cv_broadcast, we could * wake up "too many" threads causing arc_size * to significantly overflow arc_c; since * arc_get_data_buf() doesn't check for overflow * when it's woken up (it doesn't because it's * possible for the ARC to be overflowing while * full of un-evictable buffers, and the * function should proceed in this case). * * If threads are left sleeping, due to not * using cv_broadcast, they will be woken up * just before arc_reclaim_thread() sleeps. */ mutex_enter(&arc_reclaim_lock); if (!arc_is_overflowing()) cv_signal(&arc_reclaim_waiters_cv); mutex_exit(&arc_reclaim_lock); } else { ARCSTAT_BUMP(arcstat_mutex_miss); } } multilist_sublist_unlock(mls); return (bytes_evicted); } /* * Evict buffers from the given arc state, until we've removed the * specified number of bytes. Move the removed buffers to the * appropriate evict state. * * This function makes a "best effort". It skips over any buffers * it can't get a hash_lock on, and so, may not catch all candidates. * It may also return without evicting as much space as requested. * * If bytes is specified using the special value ARC_EVICT_ALL, this * will evict all available (i.e. unlocked and evictable) buffers from * the given arc state; which is used by arc_flush(). */ static uint64_t arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type) { uint64_t total_evicted = 0; multilist_t *ml = &state->arcs_list[type]; int num_sublists; arc_buf_hdr_t **markers; IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); num_sublists = multilist_get_num_sublists(ml); /* * If we've tried to evict from each sublist, made some * progress, but still have not hit the target number of bytes * to evict, we want to keep trying. The markers allow us to * pick up where we left off for each individual sublist, rather * than starting from the tail each time. */ markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); for (int i = 0; i < num_sublists; i++) { markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); /* * A b_spa of 0 is used to indicate that this header is * a marker. This fact is used in arc_adjust_type() and * arc_evict_state_impl(). */ markers[i]->b_spa = 0; multilist_sublist_t *mls = multilist_sublist_lock(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } /* * While we haven't hit our target number of bytes to evict, or * we're evicting all available buffers. */ while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { /* * Start eviction using a randomly selected sublist, * this is to try and evenly balance eviction across all * sublists. Always starting at the same sublist * (e.g. index 0) would cause evictions to favor certain * sublists over others. */ int sublist_idx = multilist_get_random_index(ml); uint64_t scan_evicted = 0; for (int i = 0; i < num_sublists; i++) { uint64_t bytes_remaining; uint64_t bytes_evicted; if (bytes == ARC_EVICT_ALL) bytes_remaining = ARC_EVICT_ALL; else if (total_evicted < bytes) bytes_remaining = bytes - total_evicted; else break; bytes_evicted = arc_evict_state_impl(ml, sublist_idx, markers[sublist_idx], spa, bytes_remaining); scan_evicted += bytes_evicted; total_evicted += bytes_evicted; /* we've reached the end, wrap to the beginning */ if (++sublist_idx >= num_sublists) sublist_idx = 0; } /* * If we didn't evict anything during this scan, we have * no reason to believe we'll evict more during another * scan, so break the loop. */ if (scan_evicted == 0) { /* This isn't possible, let's make that obvious */ ASSERT3S(bytes, !=, 0); /* * When bytes is ARC_EVICT_ALL, the only way to * break the loop is when scan_evicted is zero. * In that case, we actually have evicted enough, * so we don't want to increment the kstat. */ if (bytes != ARC_EVICT_ALL) { ASSERT3S(total_evicted, <, bytes); ARCSTAT_BUMP(arcstat_evict_not_enough); } break; } } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls = multilist_sublist_lock(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); kmem_cache_free(hdr_full_cache, markers[i]); } kmem_free(markers, sizeof (*markers) * num_sublists); return (total_evicted); } /* * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * * When 'retry' is set to B_FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. * * When 'retry' is set to B_TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. if the ARC isn't shutting down), this function might * wind up in an infinite loop, continually trying to evict buffers. */ static uint64_t arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, boolean_t retry) { uint64_t evicted = 0; while (refcount_count(&state->arcs_esize[type]) != 0) { evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); if (!retry) break; } return (evicted); } /* * Evict the specified number of bytes from the state specified, * restricting eviction to the spa and type given. This function * prevents us from trying to evict more from a state's list than * is "evictable", and to skip evicting altogether when passed a * negative value for "bytes". In contrast, arc_evict_state() will * evict everything it can, when passed a negative value for "bytes". */ static uint64_t arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, arc_buf_contents_t type) { int64_t delta; if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(refcount_count(&state->arcs_esize[type]), bytes); return (arc_evict_state(state, spa, delta, type)); } return (0); } /* * Evict metadata buffers from the cache, such that arc_meta_used is * capped by the arc_meta_limit tunable. */ static uint64_t arc_adjust_meta(void) { uint64_t total_evicted = 0; int64_t target; /* * If we're over the meta limit, we want to evict enough * metadata to get back under the meta limit. We don't want to * evict so much that we drop the MRU below arc_p, though. If * we're over the meta limit more than we're over arc_p, we * evict some from the MRU here, and some from the MFU below. */ target = MIN((int64_t)(arc_meta_used - arc_meta_limit), (int64_t)(refcount_count(&arc_anon->arcs_size) + refcount_count(&arc_mru->arcs_size) - arc_p)); total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); /* * Similar to the above, we want to evict enough bytes to get us * below the meta limit, but not so much as to drop us below the * space alloted to the MFU (which is defined as arc_c - arc_p). */ target = MIN((int64_t)(arc_meta_used - arc_meta_limit), (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); return (total_evicted); } /* * Return the type of the oldest buffer in the given arc state * * This function will select a random sublist of type ARC_BUFC_DATA and * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist * is compared, and the type which contains the "older" buffer will be * returned. */ static arc_buf_contents_t arc_adjust_type(arc_state_t *state) { multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; int data_idx = multilist_get_random_index(data_ml); int meta_idx = multilist_get_random_index(meta_ml); multilist_sublist_t *data_mls; multilist_sublist_t *meta_mls; arc_buf_contents_t type; arc_buf_hdr_t *data_hdr; arc_buf_hdr_t *meta_hdr; /* * We keep the sublist lock until we're finished, to prevent * the headers from being destroyed via arc_evict_state(). */ data_mls = multilist_sublist_lock(data_ml, data_idx); meta_mls = multilist_sublist_lock(meta_ml, meta_idx); /* * These two loops are to ensure we skip any markers that * might be at the tail of the lists due to arc_evict_state(). */ for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { if (data_hdr->b_spa != 0) break; } for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { if (meta_hdr->b_spa != 0) break; } if (data_hdr == NULL && meta_hdr == NULL) { type = ARC_BUFC_DATA; } else if (data_hdr == NULL) { ASSERT3P(meta_hdr, !=, NULL); type = ARC_BUFC_METADATA; } else if (meta_hdr == NULL) { ASSERT3P(data_hdr, !=, NULL); type = ARC_BUFC_DATA; } else { ASSERT3P(data_hdr, !=, NULL); ASSERT3P(meta_hdr, !=, NULL); /* The headers can't be on the sublist without an L1 header */ ASSERT(HDR_HAS_L1HDR(data_hdr)); ASSERT(HDR_HAS_L1HDR(meta_hdr)); if (data_hdr->b_l1hdr.b_arc_access < meta_hdr->b_l1hdr.b_arc_access) { type = ARC_BUFC_DATA; } else { type = ARC_BUFC_METADATA; } } multilist_sublist_unlock(meta_mls); multilist_sublist_unlock(data_mls); return (type); } /* * Evict buffers from the cache, such that arc_size is capped by arc_c. */ static uint64_t arc_adjust(void) { uint64_t total_evicted = 0; uint64_t bytes; int64_t target; /* * If we're over arc_meta_limit, we want to correct that before * potentially evicting data buffers below. */ total_evicted += arc_adjust_meta(); /* * Adjust MRU size * * If we're over the target cache size, we want to evict enough * from the list to get back to our target size. We don't want * to evict too much from the MRU, such that it drops below * arc_p. So, if we're over our target cache size more than * the MRU is over arc_p, we'll evict enough to get back to * arc_p here, and then evict more from the MFU below. */ target = MIN((int64_t)(arc_size - arc_c), (int64_t)(refcount_count(&arc_anon->arcs_size) + refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p)); /* * If we're below arc_meta_min, always prefer to evict data. * Otherwise, try to satisfy the requested number of bytes to * evict from the type which contains older buffers; in an * effort to keep newer buffers in the cache regardless of their * type. If we cannot satisfy the number of bytes from this * type, spill over into the next type. */ if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && arc_meta_used > arc_meta_min) { bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * metadata, we try to get the rest from data. */ target -= bytes; total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); } else { bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * data, we try to get the rest from metadata. */ target -= bytes; total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); } /* * Adjust MFU size * * Now that we've tried to evict enough from the MRU to get its * size back to arc_p, if we're still above the target cache * size, we evict the rest from the MFU. */ target = arc_size - arc_c; if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && arc_meta_used > arc_meta_min) { bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * metadata, we try to get the rest from data. */ target -= bytes; total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); } else { bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); total_evicted += bytes; /* * If we couldn't evict our target number of bytes from * data, we try to get the rest from data. */ target -= bytes; total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); } /* * Adjust ghost lists * * In addition to the above, the ARC also defines target values * for the ghost lists. The sum of the mru list and mru ghost * list should never exceed the target size of the cache, and * the sum of the mru list, mfu list, mru ghost list, and mfu * ghost list should never exceed twice the target size of the * cache. The following logic enforces these limits on the ghost * caches, and evicts from them as needed. */ target = refcount_count(&arc_mru->arcs_size) + refcount_count(&arc_mru_ghost->arcs_size) - arc_c; bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); total_evicted += bytes; target -= bytes; total_evicted += arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); /* * We assume the sum of the mru list and mfu list is less than * or equal to arc_c (we enforced this above), which means we * can use the simpler of the two equations below: * * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c * mru ghost + mfu ghost <= arc_c */ target = refcount_count(&arc_mru_ghost->arcs_size) + refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); total_evicted += bytes; target -= bytes; total_evicted += arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); return (total_evicted); } void arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; /* * If retry is B_TRUE, a spa must not be specified since we have * no good way to determine if all of a spa's buffers have been * evicted from an arc state. */ ASSERT(!retry || spa == 0); if (spa != NULL) guid = spa_load_guid(spa); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); } void arc_shrink(int64_t to_free) { if (arc_c > arc_c_min) { DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t, arc_c_min, uint64_t, arc_p, uint64_t, to_free); if (arc_c > arc_c_min + to_free) atomic_add_64(&arc_c, -to_free); else arc_c = arc_c_min; atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); if (arc_c > arc_size) arc_c = MAX(arc_size, arc_c_min); if (arc_p > arc_c) arc_p = (arc_c >> 1); DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t, arc_p); ASSERT(arc_c >= arc_c_min); ASSERT((int64_t)arc_p >= 0); } if (arc_size > arc_c) { DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size, uint64_t, arc_c); (void) arc_adjust(); } } static long needfree = 0; typedef enum free_memory_reason_t { FMR_UNKNOWN, FMR_NEEDFREE, FMR_LOTSFREE, FMR_SWAPFS_MINFREE, FMR_PAGES_PP_MAXIMUM, FMR_HEAP_ARENA, FMR_ZIO_ARENA, FMR_ZIO_FRAG, } free_memory_reason_t; int64_t last_free_memory; free_memory_reason_t last_free_reason; /* * Additional reserve of pages for pp_reserve. */ int64_t arc_pages_pp_reserve = 64; /* * Additional reserve of pages for swapfs. */ int64_t arc_swapfs_reserve = 64; /* * Return the amount of memory that can be consumed before reclaim will be * needed. Positive if there is sufficient free memory, negative indicates * the amount of memory that needs to be freed up. */ static int64_t arc_available_memory(void) { int64_t lowest = INT64_MAX; int64_t n; free_memory_reason_t r = FMR_UNKNOWN; #ifdef _KERNEL if (needfree > 0) { n = PAGESIZE * (-needfree); if (n < lowest) { lowest = n; r = FMR_NEEDFREE; } } /* * Cooperate with pagedaemon when it's time for it to scan * and reclaim some pages. */ n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); if (n < lowest) { lowest = n; r = FMR_LOTSFREE; } #ifdef illumos /* * check that we're out of range of the pageout scanner. It starts to * schedule paging if freemem is less than lotsfree and needfree. * lotsfree is the high-water mark for pageout, and needfree is the * number of needed free pages. We add extra pages here to make sure * the scanner doesn't start up while we're freeing memory. */ n = PAGESIZE * (freemem - lotsfree - needfree - desfree); if (n < lowest) { lowest = n; r = FMR_LOTSFREE; } /* * check to make sure that swapfs has enough space so that anon * reservations can still succeed. anon_resvmem() checks that the * availrmem is greater than swapfs_minfree, and the number of reserved * swap pages. We also add a bit of extra here just to prevent * circumstances from getting really dire. */ n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - desfree - arc_swapfs_reserve); if (n < lowest) { lowest = n; r = FMR_SWAPFS_MINFREE; } /* * Check that we have enough availrmem that memory locking (e.g., via * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum * stores the number of pages that cannot be locked; when availrmem * drops below pages_pp_maximum, page locking mechanisms such as * page_pp_lock() will fail.) */ n = PAGESIZE * (availrmem - pages_pp_maximum - arc_pages_pp_reserve); if (n < lowest) { lowest = n; r = FMR_PAGES_PP_MAXIMUM; } #endif /* illumos */ #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) /* * If we're on an i386 platform, it's possible that we'll exhaust the * kernel heap space before we ever run out of available physical * memory. Most checks of the size of the heap_area compare against * tune.t_minarmem, which is the minimum available real memory that we * can have in the system. However, this is generally fixed at 25 pages * which is so low that it's useless. In this comparison, we seek to * calculate the total heap-size, and reclaim if more than 3/4ths of the * heap is allocated. (Or, in the calculation, if less than 1/4th is * free) */ n = (int64_t)vmem_size(heap_arena, VMEM_FREE) - (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); if (n < lowest) { lowest = n; r = FMR_HEAP_ARENA; } #define zio_arena NULL #else #define zio_arena heap_arena #endif /* * If zio data pages are being allocated out of a separate heap segment, * then enforce that the size of available vmem for this arena remains * above about 1/16th free. * * Note: The 1/16th arena free requirement was put in place * to aggressively evict memory from the arc in order to avoid * memory fragmentation issues. */ if (zio_arena != NULL) { n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - (vmem_size(zio_arena, VMEM_ALLOC) >> 4); if (n < lowest) { lowest = n; r = FMR_ZIO_ARENA; } } /* * Above limits know nothing about real level of KVA fragmentation. * Start aggressive reclamation if too little sequential KVA left. */ if (lowest > 0) { n = (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) ? -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) : INT64_MAX; if (n < lowest) { lowest = n; r = FMR_ZIO_FRAG; } } #else /* _KERNEL */ /* Every 100 calls, free a small amount */ if (spa_get_random(100) == 0) lowest = -1024; #endif /* _KERNEL */ last_free_memory = lowest; last_free_reason = r; DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); return (lowest); } /* * Determine if the system is under memory pressure and is asking * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. */ static boolean_t arc_reclaim_needed(void) { return (arc_available_memory() < 0); } extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; extern kmem_cache_t *range_seg_cache; static __noinline void arc_kmem_reap_now(void) { size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; DTRACE_PROBE(arc__kmem_reap_start); #ifdef _KERNEL if (arc_meta_used >= arc_meta_limit) { /* * We are exceeding our meta-data cache limit. * Purge some DNLC entries to release holds on meta-data. */ dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); } #if defined(__i386) /* * Reclaim unused memory from all kmem caches. */ kmem_reap(); #endif #endif for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; kmem_cache_reap_now(zio_buf_cache[i]); } if (zio_data_buf_cache[i] != prev_data_cache) { prev_data_cache = zio_data_buf_cache[i]; kmem_cache_reap_now(zio_data_buf_cache[i]); } } kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); kmem_cache_reap_now(range_seg_cache); #ifdef illumos if (zio_arena != NULL) { /* * Ask the vmem arena to reclaim unused memory from its * quantum caches. */ vmem_qcache_reap(zio_arena); } #endif DTRACE_PROBE(arc__kmem_reap_end); } /* * Threads can block in arc_get_data_buf() waiting for this thread to evict * enough data and signal them to proceed. When this happens, the threads in * arc_get_data_buf() are sleeping while holding the hash lock for their * particular arc header. Thus, we must be careful to never sleep on a * hash lock in this thread. This is to prevent the following deadlock: * * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L", * waiting for the reclaim thread to signal it. * * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, * fails, and goes to sleep forever. * * This possible deadlock is avoided by always acquiring a hash lock * using mutex_tryenter() from arc_reclaim_thread(). */ static void arc_reclaim_thread(void *dummy __unused) { hrtime_t growtime = 0; callb_cpr_t cpr; CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); mutex_enter(&arc_reclaim_lock); while (!arc_reclaim_thread_exit) { int64_t free_memory = arc_available_memory(); uint64_t evicted = 0; /* * This is necessary in order for the mdb ::arc dcmd to * show up to date information. Since the ::arc command * does not call the kstat's update function, without * this call, the command may show stale stats for the * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even * with this change, the data might be up to 1 second * out of date; but that should suffice. The arc_state_t * structures can be queried directly if more accurate * information is needed. */ if (arc_ksp != NULL) arc_ksp->ks_update(arc_ksp, KSTAT_READ); mutex_exit(&arc_reclaim_lock); if (free_memory < 0) { arc_no_grow = B_TRUE; arc_warm = B_TRUE; /* * Wait at least zfs_grow_retry (default 60) seconds * before considering growing. */ growtime = gethrtime() + SEC2NSEC(arc_grow_retry); arc_kmem_reap_now(); /* * If we are still low on memory, shrink the ARC * so that we have arc_shrink_min free space. */ free_memory = arc_available_memory(); int64_t to_free = (arc_c >> arc_shrink_shift) - free_memory; if (to_free > 0) { #ifdef _KERNEL to_free = MAX(to_free, ptob(needfree)); #endif arc_shrink(to_free); } } else if (free_memory < arc_c >> arc_no_grow_shift) { arc_no_grow = B_TRUE; } else if (gethrtime() >= growtime) { arc_no_grow = B_FALSE; } evicted = arc_adjust(); mutex_enter(&arc_reclaim_lock); /* * If evicted is zero, we couldn't evict anything via * arc_adjust(). This could be due to hash lock * collisions, but more likely due to the majority of * arc buffers being unevictable. Therefore, even if * arc_size is above arc_c, another pass is unlikely to * be helpful and could potentially cause us to enter an * infinite loop. */ if (arc_size <= arc_c || evicted == 0) { #ifdef _KERNEL needfree = 0; #endif /* * We're either no longer overflowing, or we * can't evict anything more, so we should wake * up any threads before we go to sleep. */ cv_broadcast(&arc_reclaim_waiters_cv); /* * Block until signaled, or after one second (we * might need to perform arc_kmem_reap_now() * even if we aren't being signalled) */ CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_hires(&arc_reclaim_thread_cv, &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); } } arc_reclaim_thread_exit = B_FALSE; cv_broadcast(&arc_reclaim_thread_cv); CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ thread_exit(); } static u_int arc_dnlc_evicts_arg; extern struct vfsops zfs_vfsops; static void arc_dnlc_evicts_thread(void *dummy __unused) { callb_cpr_t cpr; u_int percent; CALLB_CPR_INIT(&cpr, &arc_dnlc_evicts_lock, callb_generic_cpr, FTAG); mutex_enter(&arc_dnlc_evicts_lock); while (!arc_dnlc_evicts_thread_exit) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); CALLB_CPR_SAFE_END(&cpr, &arc_dnlc_evicts_lock); if (arc_dnlc_evicts_arg != 0) { percent = arc_dnlc_evicts_arg; mutex_exit(&arc_dnlc_evicts_lock); #ifdef _KERNEL vnlru_free(desiredvnodes * percent / 100, &zfs_vfsops); #endif mutex_enter(&arc_dnlc_evicts_lock); /* * Clear our token only after vnlru_free() * pass is done, to avoid false queueing of * the requests. */ arc_dnlc_evicts_arg = 0; } } arc_dnlc_evicts_thread_exit = FALSE; cv_broadcast(&arc_dnlc_evicts_cv); CALLB_CPR_EXIT(&cpr); thread_exit(); } void dnlc_reduce_cache(void *arg) { u_int percent; percent = (u_int)(uintptr_t)arg; mutex_enter(&arc_dnlc_evicts_lock); if (arc_dnlc_evicts_arg == 0) { arc_dnlc_evicts_arg = percent; cv_broadcast(&arc_dnlc_evicts_cv); } mutex_exit(&arc_dnlc_evicts_lock); } /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are comming from. This function is only called * when we are adding new content to the cache. */ static void arc_adapt(int bytes, arc_state_t *state) { int mult; uint64_t arc_p_min = (arc_c >> arc_p_min_shift); int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size); int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size); if (state == arc_l2c_only) return; ASSERT(bytes > 0); /* * Adapt the target size of the MRU list: * - if we just hit in the MRU ghost list, then increase * the target size of the MRU list. * - if we just hit in the MFU ghost list, then increase * the target size of the MFU list by decreasing the * target size of the MRU list. */ if (state == arc_mru_ghost) { mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { uint64_t delta; mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); mult = MIN(mult, 10); delta = MIN(bytes * mult, arc_p); arc_p = MAX(arc_p_min, arc_p - delta); } ASSERT((int64_t)arc_p >= 0); if (arc_reclaim_needed()) { cv_signal(&arc_reclaim_thread_cv); return; } if (arc_no_grow) return; if (arc_c >= arc_c_max) return; /* * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { DTRACE_PROBE1(arc__inc_adapt, int, bytes); atomic_add_64(&arc_c, (int64_t)bytes); if (arc_c > arc_c_max) arc_c = arc_c_max; else if (state == arc_anon) atomic_add_64(&arc_p, (int64_t)bytes); if (arc_p > arc_c) arc_p = arc_c; } ASSERT((int64_t)arc_p >= 0); } /* * Check if arc_size has grown past our upper threshold, determined by * zfs_arc_overflow_shift. */ static boolean_t arc_is_overflowing(void) { /* Always allow at least one block of overflow */ uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, arc_c >> zfs_arc_overflow_shift); return (arc_size >= arc_c + overflow); } /* * Allocate a block and return it to the caller. If we are hitting the * hard limit for the cache size, we must sleep, waiting for the eviction * thread to catch up. If we're past the target size but below the hard * limit, we'll only signal the reclaim thread and continue on. */ static void * arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) { void *datap = NULL; arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); arc_adapt(size, state); /* * If arc_size is currently overflowing, and has grown past our * upper limit, we must be adding data faster than the evict * thread can evict. Thus, to ensure we don't compound the * problem by adding more data and forcing arc_size to grow even * further past it's target size, we halt and wait for the * eviction thread to catch up. * * It's also possible that the reclaim thread is unable to evict * enough buffers to get arc_size below the overflow limit (e.g. * due to buffers being un-evictable, or hash lock collisions). * In this case, we want to proceed regardless if we're * overflowing; thus we don't use a while loop here. */ if (arc_is_overflowing()) { mutex_enter(&arc_reclaim_lock); /* * Now that we've acquired the lock, we may no longer be * over the overflow limit, lets check. * * We're ignoring the case of spurious wake ups. If that * were to happen, it'd let this thread consume an ARC * buffer before it should have (i.e. before we're under * the overflow limit and were signalled by the reclaim * thread). As long as that is a rare occurrence, it * shouldn't cause any harm. */ if (arc_is_overflowing()) { cv_signal(&arc_reclaim_thread_cv); cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); } mutex_exit(&arc_reclaim_lock); } VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { datap = zio_buf_alloc(size); arc_space_consume(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); datap = zio_data_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); } /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ if (!GHOST_STATE(state)) { (void) refcount_add_many(&state->arcs_size, size, tag); /* * If this is reached via arc_read, the link is * protected by the hash lock. If reached via * arc_buf_alloc, the header should not be accessed by * any other thread. And, if reached via arc_read_done, * the hash lock will protect it if it's found in the * hash table; otherwise no other thread should be * trying to [add|remove]_reference it. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); (void) refcount_add_many(&state->arcs_esize[type], size, tag); } /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && (refcount_count(&arc_anon->arcs_size) + refcount_count(&arc_mru->arcs_size) > arc_p)) arc_p = MIN(arc_c, arc_p + size); } ARCSTAT_BUMP(arcstat_allocated); return (datap); } /* * Free the arc data buffer. */ static void arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) refcount_remove_many(&state->arcs_esize[type], size, tag); } (void) refcount_remove_many(&state->arcs_size, size, tag); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { zio_buf_free(data, size); arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); zio_data_buf_free(data, size); arc_space_return(size, ARC_SPACE_DATA); } } /* * This routine is called whenever a buffer is accessed. * NOTE: the hash lock is dropped in this function. */ static void arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { clock_t now; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); if (hdr->b_l1hdr.b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. */ ASSERT0(hdr->b_l1hdr.b_arc_access); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); arc_change_state(arc_mru, hdr, hash_lock); } else if (hdr->b_l1hdr.b_state == arc_mru) { now = ddi_get_lbolt(); /* * If this buffer is here because of a prefetch, then either: * - clear the flag if this is a "referencing" read * (any subsequent access will bump this into the MFU state). * or * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). */ if (HDR_PREFETCH(hdr)) { if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { /* link protected by hash lock */ ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); ARCSTAT_BUMP(arcstat_mru_hits); } hdr->b_l1hdr.b_arc_access = now; return; } /* * This buffer has been "accessed" only once so far, * but it is still in the cache. Move it to the MFU * state. */ if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. */ hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } ARCSTAT_BUMP(arcstat_mru_hits); } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been "accessed" recently, but * was evicted from the cache. Move it to the * MFU state. */ if (HDR_PREFETCH(hdr)) { new_state = arc_mru; if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, hdr, hash_lock); ARCSTAT_BUMP(arcstat_mru_ghost_hits); } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* * This buffer has been accessed more than once and is * still in the cache. Keep it in the MFU state. * * NOTE: an add_reference() that occurred when we did * the arc_read() will have kicked this off the list. * If it was a prefetch, we will explicitly move it to * the head of the list now. */ if ((HDR_PREFETCH(hdr)) != 0) { ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); /* link protected by hash_lock */ ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); } ARCSTAT_BUMP(arcstat_mfu_hits); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* * This buffer has been accessed more than once but has * been evicted from the cache. Move it back to the * MFU state. */ if (HDR_PREFETCH(hdr)) { /* * This is a prefetch access... * move this block back to the MRU state. */ ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); new_state = arc_mru; } hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(new_state, hdr, hash_lock); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC. */ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr, hash_lock); } else { ASSERT(!"invalid arc state"); } } /* a generic arc_done_func_t which you can use */ /* ARGSUSED */ void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) bcopy(buf->b_data, arg, HDR_GET_LSIZE(buf->b_hdr)); arc_buf_destroy(buf, arg); } /* a generic arc_done_func_t */ void arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; if (zio && zio->io_error) { arc_buf_destroy(buf, arg); *bufp = NULL; } else { *bufp = buf; ASSERT(buf->b_data); } } static void arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); } else { if (HDR_COMPRESSION_ENABLED(hdr)) { ASSERT3U(HDR_GET_COMPRESS(hdr), ==, BP_GET_COMPRESS(bp)); } ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); } } static void arc_read_done(zio_t *zio) { arc_buf_hdr_t *hdr = zio->io_private; arc_buf_t *abuf = NULL; /* buffer we're assigning to callback */ kmutex_t *hash_lock = NULL; arc_callback_t *callback_list, *acb; int freeable = B_FALSE; /* * The hdr was inserted into hash-table and removed from lists * prior to starting I/O. We should find this header, since * it's in the hash table, and it should be legit since it's * not possible to evict it during the I/O. The only possible * reason for it not to be found is if we were freed during the * read. */ if (HDR_IN_HASH_TABLE(hdr)) { ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, BP_IDENTITY(zio->io_bp)->dva_word[1]); arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); ASSERT((found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); ASSERT3P(hash_lock, !=, NULL); } if (zio->io_error == 0) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } } arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); if (l2arc_noprefetch && HDR_PREFETCH(hdr)) arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); if (hash_lock && zio->io_error == 0 && hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. This is because * if we've issued an I/O for an evicted buffer, we've already * called arc_access (to prevent any simultaneous readers from * getting confused). */ arc_access(hdr, hash_lock); } /* create copies of the data buffer for the callers */ for (acb = callback_list; acb; acb = acb->acb_next) { if (acb->acb_done != NULL) { /* * If we're here, then this must be a demand read * since prefetch requests don't have callbacks. * If a read request has a callback (i.e. acb_done is * not NULL), then we decompress the data for the * first request and clone the rest. This avoids * having to waste cpu resources decompressing data * that nobody is explicitly waiting to read. */ if (abuf == NULL) { acb->acb_buf = arc_buf_alloc_impl(hdr, acb->acb_private); if (zio->io_error == 0) { zio->io_error = arc_decompress(acb->acb_buf); } abuf = acb->acb_buf; } else { add_reference(hdr, acb->acb_private); acb->acb_buf = arc_buf_clone(abuf); } } } hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (abuf == NULL) { /* * This buffer didn't have a callback so it must * be a prefetch. */ ASSERT(HDR_PREFETCH(hdr)); ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); } ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* * Broadcast before we drop the hash_lock to avoid the possibility * that the hdr (and hence the cv) might be freed before we get to * the cv_broadcast(). */ cv_broadcast(&hdr->b_l1hdr.b_cv); if (hash_lock != NULL) { mutex_exit(hash_lock); } else { /* * This block was freed while we waited for the read to * complete. It has been removed from the hash table and * moved to the anonymous state (so that it won't show up * in the cache). */ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { if (acb->acb_done) acb->acb_done(zio, acb->acb_buf, acb->acb_private); if (acb->acb_zio_dummy != NULL) { acb->acb_zio_dummy->io_error = zio->io_error; zio_nowait(acb->acb_zio_dummy); } callback_list = acb->acb_next; kmem_free(acb, sizeof (arc_callback_t)); } if (freeable) arc_hdr_destroy(hdr); } /* * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was * required. If the block is not in the cache pass the read request * on to the spa with a substitute callback function, so that the * requested block will be added to the cache. * * If a read request arrives for a block that has a read in-progress, * either wait for the in-progress read to complete (and return the * results); or, if this is a read with a "done" func, add a record * to the read to invoke the "done" func when the read completes, * and return; or just return. * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. */ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); ASSERT(!BP_IS_EMBEDDED(bp) || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); top: if (!BP_IS_EMBEDDED(bp)) { /* * Embedded BP's have no DVA and require no I/O to "read". * Create an anonymous arc buf to back it. */ hdr = buf_hash_find(guid, bp, &hash_lock); } if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) { arc_buf_t *buf = NULL; *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { /* * This sync read must wait for an * in-progress async read (e.g. a predictive * prefetch). Async reads are queued * separately at the vdev_queue layer, so * this is a form of priority inversion. * Ideally, we would "inherit" the demand * i/o's priority by moving the i/o from * the async queue to the synchronous queue, * but there is currently no mechanism to do * so. Track this so that we can evaluate * the magnitude of this potential performance * problem. * * Note that if the prefetch i/o is already * active (has been issued to the device), * the prefetch improved performance, because * we issued it sooner than we would have * without the prefetch. */ DTRACE_PROBE1(arc__sync__wait__for__async, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_sync_wait_for_async); } if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { arc_hdr_clear_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); } if (*arc_flags & ARC_FLAG_WAIT) { cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); mutex_exit(hash_lock); goto top; } ASSERT(*arc_flags & ARC_FLAG_NOWAIT); if (done) { arc_callback_t *acb = NULL; acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); ASSERT3P(acb->acb_done, !=, NULL); acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; mutex_exit(hash_lock); return (0); } mutex_exit(hash_lock); return (0); } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); if (done) { if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { /* * This is a demand read which does not have to * wait for i/o because we did a predictive * prefetch i/o for it, which has completed. */ DTRACE_PROBE1( arc__demand__hit__predictive__prefetch, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP( arcstat_demand_hit_predictive_prefetch); arc_hdr_clear_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); } ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); /* * If this block is already in use, create a new * copy of the data so that we will be guaranteed * that arc_release() will always succeed. */ buf = hdr->b_l1hdr.b_buf; if (buf == NULL) { ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); buf = arc_buf_alloc_impl(hdr, private); VERIFY0(arc_decompress(buf)); } else { add_reference(hdr, private); buf = arc_buf_clone(buf); } ASSERT3P(buf->b_data, !=, NULL); } else if (*arc_flags & ARC_FLAG_PREFETCH && refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); if (done) done(NULL, buf, private); } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; uint64_t size; if (hdr == NULL) { /* this block is not in the cache */ arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, BP_GET_COMPRESS(bp), type); if (!BP_IS_EMBEDDED(bp)) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } } else { /* * This block is in the ghost cache. If it was L2-only * (and thus didn't have an L1 hdr), we realloc the * header to add an L1 hdr. */ if (!HDR_HAS_L1HDR(hdr)) { hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* * This is a delicate dance that we play here. * This hdr is in the ghost list so we access it * to move it out of the ghost list before we * initiate the read. If it's a prefetch then * it won't have a callback so we'll remove the * reference that arc_buf_alloc_impl() created. We * do this after we've called arc_access() to * avoid hitting an assert in remove_reference(). */ arc_access(hdr, hash_lock); arc_hdr_alloc_pdata(hdr); } ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); size = arc_hdr_size(hdr); /* * If compression is enabled on the hdr, then will do * RAW I/O and will store the compressed data in the hdr's * data block. Otherwise, the hdr's data block will contain * the uncompressed data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { zio_flags |= ZIO_FLAG_RAW; } if (*arc_flags & ARC_FLAG_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; /* * Lock out device removal. */ if (vdev_is_dead(vd) || !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) vd = NULL; } if (priority == ZIO_PRIORITY_ASYNC_READ) arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); else arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); if (hash_lock != NULL) mutex_exit(hash_lock); /* * At this point, we have a level 1 cache miss. Try again in * L2ARC if possible. */ ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); #ifdef _KERNEL #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_force(curproc, RACCT_READBPS, size); racct_add_force(curproc, RACCT_READIOPS, 1); PROC_UNLOCK(curproc); } #endif /* RACCT */ curthread->td_ru.ru_inblock++; #endif if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. * 2. This buffer still has L2ARC metadata. * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. * 5. This isn't prefetch and l2arc_noprefetch is set. */ if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; void* b_data; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; uint64_t asize = vdev_psize_to_asize(vd, size); if (asize != size) { b_data = zio_data_buf_alloc(asize); cb->l2rcb_data = b_data; } else { b_data = hdr->b_l1hdr.b_pdata; } ASSERT(addr >= VDEV_LABEL_START_SIZE && addr + asize < vd->vdev_psize - VDEV_LABEL_END_SIZE); /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). * Issue a null zio if the underlying buffer * was squashed to zero size by compression. */ ASSERT3U(HDR_GET_COMPRESS(hdr), !=, ZIO_COMPRESS_EMPTY); rzio = zio_read_phys(pio, vd, addr, asize, b_data, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, size); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); return (0); } ASSERT(*arc_flags & ARC_FLAG_WAIT); if (zio_wait(rzio) == 0) return (0); /* l2arc read error; goto zio_read() */ } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); if (HDR_L2_WRITING(hdr)) ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } } else { if (vd != NULL) spa_config_exit(spa, SCL_L2ARC, vd); if (l2arc_ndev != 0) { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); } } rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size, arc_read_done, hdr, priority, zio_flags, zb); if (*arc_flags & ARC_FLAG_WAIT) return (zio_wait(rzio)); ASSERT(*arc_flags & ARC_FLAG_NOWAIT); zio_nowait(rzio); } return (0); } /* * Notify the arc that a block was freed, and thus will never be used again. */ void arc_freed(spa_t *spa, const blkptr_t *bp) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint64_t guid = spa_load_guid(spa); ASSERT(!BP_IS_EMBEDDED(bp)); hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; /* * We might be trying to free a block that is still doing I/O * (i.e. prefetch) or has a reference (i.e. a dedup-ed, * dmu_sync-ed block). If this block is being prefetched, then it * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr * until the I/O completes. A block may also have a reference if it is * part of a dedup-ed, dmu_synced write. The dmu_sync() function would * have written the new block to its final resting place on disk but * without the dedup flag set. This would have left the hdr in the MRU * state and discoverable. When the txg finally syncs it detects that * the block was overridden in open context and issues an override I/O. * Since this is a dedup block, the override I/O will determine if the * block is already in the DDT. If so, then it will replace the io_bp * with the bp from the DDT and allow the I/O to finish. When the I/O * reaches the done callback, dbuf_write_override_done, it will * check to see if the io_bp and io_bp_override are identical. * If they are not, then it indicates that the bp was replaced with * the bp in the DDT and the override bp is freed. This allows * us to arrive here with a reference on a block that is being * freed. So if we have an I/O in progress, or a reference to * this hdr, then we don't destroy the hdr. */ if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); mutex_exit(hash_lock); } else { mutex_exit(hash_lock); } } /* * Release this buffer from the cache, making it an anonymous buffer. This * must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. */ void arc_release(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * It would be nice to assert that if it's DMU metadata (level > * 0 || it's the dnode file), then it must be syncing context. * But we don't know that information at this level. */ mutex_enter(&buf->b_evict_lock); ASSERT(HDR_HAS_L1HDR(hdr)); /* * We don't grab the hash lock prior to this check, because if * the buffer's header is in the arc_anon state, it won't be * linked into the hash table. */ if (hdr->b_l1hdr.b_state == arc_anon) { mutex_exit(&buf->b_evict_lock); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); ASSERT(HDR_EMPTY(hdr)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); hdr->b_l1hdr.b_arc_access = 0; /* * If the buf is being overridden then it may already * have a hdr that is not empty. */ buf_discard_identity(hdr); arc_buf_thaw(buf); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); /* * This assignment is only valid as long as the hash_lock is * held, we must be careful not to reference state or the * b_state field after dropping the lock. */ arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(state, !=, arc_anon); /* this buffer is not on any list */ ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); if (HDR_HAS_L2HDR(hdr)) { mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); /* * We have to recheck this conditional again now that * we're holding the l2ad_mtx to prevent a race with * another thread which might be concurrently calling * l2arc_evict(). In that case, l2arc_evict() might have * destroyed the header's L2 portion as we were waiting * to acquire the l2ad_mtx. */ if (HDR_HAS_L2HDR(hdr)) { l2arc_trim(hdr); arc_hdr_l2hdr_destroy(hdr); } mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); } /* * Do we have more than one buf? */ if (hdr->b_l1hdr.b_bufcnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); enum zio_compress compress = HDR_GET_COMPRESS(hdr); arc_buf_contents_t type = arc_buf_type(hdr); VERIFY3U(hdr->b_type, ==, type); ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); (void) remove_reference(hdr, hash_lock, tag); if (arc_buf_is_shared(buf)) { ASSERT(HDR_SHARED_DATA(hdr)); ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(buf)); } /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. Also find the last buffer * in the hdr's buffer list. */ arc_buf_t *lastbuf = NULL; bufp = &hdr->b_l1hdr.b_buf; while (*bufp != NULL) { if (*bufp == buf) { *bufp = buf->b_next; } /* * If we've removed a buffer in the middle of * the list then update the lastbuf and update * bufp. */ if (*bufp != NULL) { lastbuf = *bufp; bufp = &(*bufp)->b_next; } } buf->b_next = NULL; ASSERT3P(lastbuf, !=, buf); ASSERT3P(lastbuf, !=, NULL); /* * If the current arc_buf_t and the hdr are sharing their data * buffer, then we must stop sharing that block, transfer * ownership and setup sharing with a new arc_buf_t at the end * of the hdr's b_buf list. */ if (arc_buf_is_shared(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(lastbuf)); VERIFY(!arc_buf_is_shared(lastbuf)); /* * First, sever the block sharing relationship between * buf and the arc_buf_hdr_t. Then, setup a new * block sharing relationship with the last buffer * on the arc_buf_t list. */ arc_unshare_buf(hdr, buf); arc_share_buf(hdr, lastbuf); VERIFY3P(lastbuf->b_data, !=, NULL); } else if (HDR_SHARED_DATA(hdr)) { ASSERT(arc_buf_is_shared(lastbuf)); } ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3P(state, !=, arc_l2c_only); (void) refcount_remove_many(&state->arcs_size, HDR_GET_LSIZE(hdr), buf); if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { ASSERT3P(state, !=, arc_l2c_only); (void) refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), buf); } hdr->b_l1hdr.b_bufcnt -= 1; arc_cksum_verify(buf); #ifdef illumos arc_buf_unwatch(buf); #endif mutex_exit(hash_lock); /* * Allocate a new hdr. The new hdr will contain a b_pdata * buffer which will be freed in arc_write(). */ nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt)); VERIFY3U(nhdr->b_type, ==, type); ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; nhdr->b_l1hdr.b_bufcnt = 1; (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); (void) refcount_add_many(&arc_anon->arcs_size, HDR_GET_LSIZE(nhdr), buf); } else { mutex_exit(&buf->b_evict_lock); ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); /* protected by hash lock, or hdr is on arc_anon */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_change_state(arc_anon, hdr, hash_lock); hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } } int arc_released(arc_buf_t *buf) { int released; mutex_enter(&buf->b_evict_lock); released = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_state == arc_anon); mutex_exit(&buf->b_evict_lock); return (released); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { int referenced; mutex_enter(&buf->b_evict_lock); referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); mutex_exit(&buf->b_evict_lock); return (referenced); } #endif static void arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); ASSERT(hdr->b_l1hdr.b_bufcnt > 0); /* * If we're reexecuting this zio because the pool suspended, then * cleanup any state that was previously set the first time the * callback as invoked. */ if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); #ifdef illumos arc_buf_unwatch(buf); #endif if (hdr->b_l1hdr.b_pdata != NULL) { if (arc_buf_is_shared(buf)) { ASSERT(HDR_SHARED_DATA(hdr)); arc_unshare_buf(hdr, buf); } else { arc_hdr_free_pdata(hdr); } } } ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); callback->awcb_ready(zio, buf, callback->awcb_private); if (HDR_IO_IN_PROGRESS(hdr)) ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); arc_cksum_compute(buf); arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); enum zio_compress compress; if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { compress = ZIO_COMPRESS_OFF; } else { ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); compress = BP_GET_COMPRESS(zio->io_bp); } HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); /* * If the hdr is compressed, then copy the compressed * zio contents into arc_buf_hdr_t. Otherwise, copy the original * data buf into the hdr. Ideally, we would like to always copy the * io_data into b_pdata but the user may have disabled compressed * arc thus the on-disk block may or may not match what we maintain * in the hdr's b_pdata field. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { ASSERT(BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF); ASSERT3U(psize, >, 0); arc_hdr_alloc_pdata(hdr); bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize); } else { ASSERT3P(buf->b_data, ==, zio->io_orig_data); ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr)); ASSERT3U(hdr->b_l1hdr.b_byteswap, ==, DMU_BSWAP_NUMFUNCS); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); /* * This hdr is not compressed so we're able to share * the arc_buf_t data buffer with the hdr. */ arc_share_buf(hdr, buf); VERIFY0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata, HDR_GET_LSIZE(hdr))); } arc_hdr_verify(hdr, zio->io_bp); } static void arc_write_children_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; callback->awcb_children_ready(zio, buf, callback->awcb_private); } /* * The SPA calls this callback for each physical write that happens on behalf * of a logical write. See the comment in dbuf_write_physdone() for details. */ static void arc_write_physdone(zio_t *zio) { arc_write_callback_t *cb = zio->io_private; if (cb->awcb_physdone != NULL) cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); } static void arc_write_done(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); } } else { ASSERT(HDR_EMPTY(hdr)); } /* * If the block to be written was all-zero or compressed enough to be * embedded in the BP, no write was performed so there will be no * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). */ if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; ASSERT(zio->io_error == 0); arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); if (exists != NULL) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). */ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); ASSERT(refcount_is_zero( &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists, hash_lock); mutex_exit(hash_lock); arc_hdr_destroy(exists); exists = buf_hash_insert(hdr, &hash_lock); ASSERT3P(exists, ==, NULL); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { /* nopwrite */ ASSERT(zio->io_prop.zp_nopwrite); if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad nopwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); } else { /* Dedup */ ASSERT(hdr->b_l1hdr.b_bufcnt == 1); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); } ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); kmem_free(callback, sizeof (arc_write_callback_t)); } zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *children_ready, arc_done_func_t *physdone, arc_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; ASSERT3P(ready, !=, NULL); ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; callback->awcb_physdone = physdone; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; /* * The hdr's b_pdata is now stale, free it now. A new data block * will be allocated when the zio pipeline calls arc_write_ready(). */ if (hdr->b_l1hdr.b_pdata != NULL) { /* * If the buf is currently sharing the data block with * the hdr then we need to break that relationship here. * The hdr will remain with a NULL data pointer and the * buf will take sole ownership of the block. */ if (arc_buf_is_shared(buf)) { ASSERT(ARC_BUF_LAST(buf)); arc_unshare_buf(hdr, buf); } else { arc_hdr_free_pdata(hdr); } VERIFY3P(buf->b_data, !=, NULL); arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); } ASSERT(!arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL); zio = zio_write(pio, spa, txg, bp, buf->b_data, HDR_GET_LSIZE(hdr), zp, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, arc_write_physdone, arc_write_done, callback, priority, zio_flags, zb); return (zio); } static int arc_memory_throttle(uint64_t reserve, uint64_t txg) { #ifdef _KERNEL uint64_t available_memory = ptob(freemem); static uint64_t page_load = 0; static uint64_t last_txg = 0; #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) available_memory = MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE))); #endif if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100) return (0); if (txg > last_txg) { last_txg = txg; page_load = 0; } /* * If we are in pageout, we know that memory is already tight, * the arc is already going to be evicting, so we just want to * continue to let page writes occur as quickly as possible. */ if (curproc == pageproc) { if (page_load > MAX(ptob(minfree), available_memory) / 4) return (SET_ERROR(ERESTART)); /* Note: reserve is inflated, so we deflate */ page_load += reserve / 8; return (0); } else if (page_load > 0 && arc_reclaim_needed()) { /* memory is low, delay before restarting */ ARCSTAT_INCR(arcstat_memory_throttle_count, 1); return (SET_ERROR(EAGAIN)); } page_load = 0; #endif return (0); } void arc_tempreserve_clear(uint64_t reserve) { atomic_add_64(&arc_tempreserve, -reserve); ASSERT((int64_t)arc_tempreserve >= 0); } int arc_tempreserve_space(uint64_t reserve, uint64_t txg) { int error; uint64_t anon_size; if (reserve > arc_c/4 && !arc_no_grow) { arc_c = MIN(arc_c_max, reserve * 4); DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c); } if (reserve > arc_c) return (SET_ERROR(ENOMEM)); /* * Don't count loaned bufs as in flight dirty data to prevent long * network delays from blocking transactions that are ready to be * assigned to a txg. */ anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) - arc_loaned_bytes), 0); /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. */ error = arc_memory_throttle(reserve, txg); if (error != 0) return (error); /* * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ if (reserve + arc_tempreserve + anon_size > arc_c / 2 && anon_size > arc_c / 4) { uint64_t meta_esize = refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); uint64_t data_esize = refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", arc_tempreserve >> 10, meta_esize >> 10, data_esize >> 10, reserve >> 10, arc_c >> 10); return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); return (0); } static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { size->value.ui64 = refcount_count(&state->arcs_size); evict_data->value.ui64 = refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); evict_metadata->value.ui64 = refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int arc_kstat_update(kstat_t *ksp, int rw) { arc_stats_t *as = ksp->ks_data; if (rw == KSTAT_WRITE) { return (EACCES); } else { arc_kstat_update_state(arc_anon, &as->arcstat_anon_size, &as->arcstat_anon_evictable_data, &as->arcstat_anon_evictable_metadata); arc_kstat_update_state(arc_mru, &as->arcstat_mru_size, &as->arcstat_mru_evictable_data, &as->arcstat_mru_evictable_metadata); arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size, &as->arcstat_mru_ghost_evictable_data, &as->arcstat_mru_ghost_evictable_metadata); arc_kstat_update_state(arc_mfu, &as->arcstat_mfu_size, &as->arcstat_mfu_evictable_data, &as->arcstat_mfu_evictable_metadata); arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size, &as->arcstat_mfu_ghost_evictable_data, &as->arcstat_mfu_ghost_evictable_metadata); } return (0); } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the ARC eviction * code is laid out; arc_evict_state() assumes ARC buffers are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ unsigned int arc_state_multilist_index_func(multilist_t *ml, void *obj) { arc_buf_hdr_t *hdr = obj; /* * We rely on b_dva to generate evenly distributed index * numbers using buf_hash below. So, as an added precaution, * let's make sure we never add empty buffers to the arc lists. */ ASSERT(!HDR_EMPTY(hdr)); /* * The assumption here, is the hash value for a given * arc_buf_hdr_t will remain constant throughout it's lifetime * (i.e. it's b_spa, b_dva, and b_birth fields don't change). * Thus, we don't need to store the header's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. */ return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % multilist_get_num_sublists(ml)); } #ifdef _KERNEL static eventhandler_tag arc_event_lowmem = NULL; static void arc_lowmem(void *arg __unused, int howto __unused) { mutex_enter(&arc_reclaim_lock); /* XXX: Memory deficit should be passed as argument. */ needfree = btoc(arc_c >> arc_shrink_shift); DTRACE_PROBE(arc__needfree); cv_signal(&arc_reclaim_thread_cv); /* * It is unsafe to block here in arbitrary threads, because we can come * here from ARC itself and may hold ARC locks and thus risk a deadlock * with ARC reclaim thread. */ if (curproc == pageproc) (void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); mutex_exit(&arc_reclaim_lock); } #endif static void arc_state_init(void) { arc_anon = &ARC_anon; arc_mru = &ARC_mru; arc_mru_ghost = &ARC_mru_ghost; arc_mfu = &ARC_mfu; arc_mfu_ghost = &ARC_mfu_ghost; arc_l2c_only = &ARC_l2c_only; multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); refcount_create(&arc_anon->arcs_size); refcount_create(&arc_mru->arcs_size); refcount_create(&arc_mru_ghost->arcs_size); refcount_create(&arc_mfu->arcs_size); refcount_create(&arc_mfu_ghost->arcs_size); refcount_create(&arc_l2c_only->arcs_size); } static void arc_state_fini(void) { refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); refcount_destroy(&arc_anon->arcs_size); refcount_destroy(&arc_mru->arcs_size); refcount_destroy(&arc_mru_ghost->arcs_size); refcount_destroy(&arc_mfu->arcs_size); refcount_destroy(&arc_mfu_ghost->arcs_size); refcount_destroy(&arc_l2c_only->arcs_size); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); } uint64_t arc_max_bytes(void) { return (arc_c_max); } void arc_init(void) { int i, prefetch_tunable_set = 0; mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL); /* Convert seconds to clock ticks */ arc_min_prefetch_lifespan = 1 * hz; /* Start out with 1/8 of all memory */ arc_c = kmem_size() / 8; #ifdef illumos #ifdef _KERNEL /* * On architectures where the physical memory can be larger * than the addressable space (intel in 32-bit mode), we may * need to limit the cache to 1/8 of VM size. */ arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); #endif #endif /* illumos */ /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */ arc_c_min = MAX(arc_c / 4, arc_abs_min); /* set max to 1/2 of all memory, or all but 1GB, whichever is more */ if (arc_c * 8 >= 1 << 30) arc_c_max = (arc_c * 8) - (1 << 30); else arc_c_max = arc_c_min; arc_c_max = MAX(arc_c * 5, arc_c_max); /* * In userland, there's only the memory pressure that we artificially * create (see arc_available_memory()). Don't let arc_c get too * small, because it can cause transactions to be larger than * arc_c, causing arc_tempreserve_space() to fail. */ #ifndef _KERNEL arc_c_min = arc_c_max / 2; #endif #ifdef _KERNEL /* * Allow the tunables to override our calculations if they are * reasonable. */ if (zfs_arc_max > arc_abs_min && zfs_arc_max < kmem_size()) { arc_c_max = zfs_arc_max; arc_c_min = MIN(arc_c_min, arc_c_max); } if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max) arc_c_min = zfs_arc_min; #endif arc_c = arc_c_max; arc_p = (arc_c >> 1); arc_size = 0; /* limit meta-data to 1/4 of the arc capacity */ arc_meta_limit = arc_c_max / 4; /* Allow the tunable to override if it is reasonable */ if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) arc_meta_limit = zfs_arc_meta_limit; if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) arc_c_min = arc_meta_limit / 2; if (zfs_arc_meta_min > 0) { arc_meta_min = zfs_arc_meta_min; } else { arc_meta_min = arc_c_min / 2; } if (zfs_arc_grow_retry > 0) arc_grow_retry = zfs_arc_grow_retry; if (zfs_arc_shrink_shift > 0) arc_shrink_shift = zfs_arc_shrink_shift; /* * Ensure that arc_no_grow_shift is less than arc_shrink_shift. */ if (arc_no_grow_shift >= arc_shrink_shift) arc_no_grow_shift = arc_shrink_shift - 1; if (zfs_arc_p_min_shift > 0) arc_p_min_shift = zfs_arc_p_min_shift; if (zfs_arc_num_sublists_per_state < 1) zfs_arc_num_sublists_per_state = MAX(max_ncpus, 1); /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; zfs_arc_min = arc_c_min; zfs_arc_max = arc_c_max; arc_state_init(); buf_init(); arc_reclaim_thread_exit = B_FALSE; arc_dnlc_evicts_thread_exit = FALSE; arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (arc_ksp != NULL) { arc_ksp->ks_data = &arc_stats; arc_ksp->ks_update = arc_kstat_update; kstat_install(arc_ksp); } (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, TS_RUN, minclsyspri); #ifdef _KERNEL arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL, EVENTHANDLER_PRI_FIRST); #endif (void) thread_create(NULL, 0, arc_dnlc_evicts_thread, NULL, 0, &p0, TS_RUN, minclsyspri); arc_dead = B_FALSE; arc_warm = B_FALSE; /* * Calculate maximum amount of dirty data per pool. * * If it has been set by /etc/system, take that. * Otherwise, use a percentage of physical memory defined by * zfs_dirty_data_max_percent (default 10%) with a cap at * zfs_dirty_data_max_max (default 4GB). */ if (zfs_dirty_data_max == 0) { zfs_dirty_data_max = ptob(physmem) * zfs_dirty_data_max_percent / 100; zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } #ifdef _KERNEL if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable)) prefetch_tunable_set = 1; #ifdef __i386__ if (prefetch_tunable_set == 0) { printf("ZFS NOTICE: Prefetch is disabled by default on i386 " "-- to enable,\n"); printf(" add \"vfs.zfs.prefetch_disable=0\" " "to /boot/loader.conf.\n"); zfs_prefetch_disable = 1; } #else if ((((uint64_t)physmem * PAGESIZE) < (1ULL << 32)) && prefetch_tunable_set == 0) { printf("ZFS NOTICE: Prefetch is disabled by default if less " "than 4GB of RAM is present;\n" " to enable, add \"vfs.zfs.prefetch_disable=0\" " "to /boot/loader.conf.\n"); zfs_prefetch_disable = 1; } #endif /* Warn about ZFS memory and address space requirements. */ if (((uint64_t)physmem * PAGESIZE) < (256 + 128 + 64) * (1 << 20)) { printf("ZFS WARNING: Recommended minimum RAM size is 512MB; " "expect unstable behavior.\n"); } if (kmem_size() < 512 * (1 << 20)) { printf("ZFS WARNING: Recommended minimum kmem_size is 512MB; " "expect unstable behavior.\n"); printf(" Consider tuning vm.kmem_size and " "vm.kmem_size_max\n"); printf(" in /boot/loader.conf.\n"); } #endif } void arc_fini(void) { mutex_enter(&arc_reclaim_lock); arc_reclaim_thread_exit = B_TRUE; /* * The reclaim thread will set arc_reclaim_thread_exit back to * B_FALSE when it is finished exiting; we're waiting for that. */ while (arc_reclaim_thread_exit) { cv_signal(&arc_reclaim_thread_cv); cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); } mutex_exit(&arc_reclaim_lock); /* Use B_TRUE to ensure *all* buffers are evicted */ arc_flush(NULL, B_TRUE); mutex_enter(&arc_dnlc_evicts_lock); arc_dnlc_evicts_thread_exit = TRUE; /* * The user evicts thread will set arc_user_evicts_thread_exit * to FALSE when it is finished exiting; we're waiting for that. */ while (arc_dnlc_evicts_thread_exit) { cv_signal(&arc_dnlc_evicts_cv); cv_wait(&arc_dnlc_evicts_cv, &arc_dnlc_evicts_lock); } mutex_exit(&arc_dnlc_evicts_lock); arc_dead = B_TRUE; if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; } mutex_destroy(&arc_reclaim_lock); cv_destroy(&arc_reclaim_thread_cv); cv_destroy(&arc_reclaim_waiters_cv); mutex_destroy(&arc_dnlc_evicts_lock); cv_destroy(&arc_dnlc_evicts_cv); arc_state_fini(); buf_fini(); ASSERT0(arc_loaned_bytes); #ifdef _KERNEL if (arc_event_lowmem != NULL) EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); #endif } /* * Level 2 ARC * * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. * It uses dedicated storage devices to hold cached data, which are populated * using large infrequent writes. The main role of this cache is to boost * the performance of random read workloads. The intended L2ARC devices * include short-stroked disks, solid state disks, and other media with * substantially faster read latency than disk. * * +-----------------------+ * | ARC | * +-----------------------+ * | ^ ^ * | | | * l2arc_feed_thread() arc_read() * | | | * | l2arc read | * V | | * +---------------+ | * | L2ARC | | * +---------------+ | * | ^ | * l2arc_write() | | * | | | * V | | * +-------+ +-------+ * | vdev | | vdev | * | cache | | cache | * +-------+ +-------+ * +=========+ .-----. * : L2ARC : |-_____-| * : devices : | Disks | * +=========+ `-_____-' * * Read requests are satisfied from the following sources, in order: * * 1) ARC * 2) vdev cache of L2ARC devices * 3) L2ARC devices * 4) vdev cache of disks * 5) disks * * Some L2ARC device types exhibit extremely slow write performance. * To accommodate for this there are some significant differences between * the L2ARC and traditional cache design: * * 1. There is no eviction path from the ARC to the L2ARC. Evictions from * the ARC behave as usual, freeing buffers and placing headers on ghost * lists. The ARC does not send buffers to the L2ARC during eviction as * this would add inflated write latencies for all ARC memory pressure. * * 2. The L2ARC attempts to cache data from the ARC before it is evicted. * It does this by periodically scanning buffers from the eviction-end of * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are * not already there. It scans until a headroom of buffers is satisfied, * which itself is a buffer for ARC eviction. If a compressible buffer is * found during scanning and selected for writing to an L2ARC device, we * temporarily boost scanning headroom during the next scan cycle to make * sure we adapt to compression effects (which might significantly reduce * the data volume we write to L2ARC). The thread that does this is * l2arc_feed_thread(), illustrated below; example sizes are included to * provide a better sense of ratio than this diagram: * * head --> tail * +---------------------+----------+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC * +---------------------+----------+ | o L2ARC eligible * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer * +---------------------+----------+ | * 15.9 Gbytes ^ 32 Mbytes | * headroom | * l2arc_feed_thread() * | * l2arc write hand <--[oooo]--' * | 8 Mbyte * | write max * V * +==============================+ * L2ARC dev |####|#|###|###| |####| ... | * +==============================+ * 32 Gbytes * * 3. If an ARC buffer is copied to the L2ARC but then hit instead of * evicted, then the L2ARC has cached a buffer much sooner than it probably * needed to, potentially wasting L2ARC device bandwidth and storage. It is * safe to say that this is an uncommon case, since buffers at the end of * the ARC lists have moved there due to inactivity. * * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, * then the L2ARC simply misses copying some buffers. This serves as a * pressure valve to prevent heavy read workloads from both stalling the ARC * with waits and clogging the L2ARC with writes. This also helps prevent * the potential for the L2ARC to churn if it attempts to cache content too * quickly, such as during backups of the entire pool. * * 5. After system boot and before the ARC has filled main memory, there are * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru * lists can remain mostly static. Instead of searching from tail of these * lists as pictured, the l2arc_feed_thread() will search from the list heads * for eligible buffers, greatly increasing its chance of finding them. * * The L2ARC device write speed is also boosted during this time so that * the L2ARC warms up faster. Since there have been no ARC evictions yet, * there are no L2ARC reads, and no fear of degrading read performance * through increased writes. * * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that * the vdev queue can aggregate them into larger and fewer writes. Each * device is written to in a rotor fashion, sweeping writes through * available space then repeating. * * 7. The L2ARC does not store dirty content. It never needs to flush * write buffers back to disk based storage. * * 8. If an ARC buffer is written (and dirtied) which also exists in the * L2ARC, the now stale L2ARC buffer is immediately dropped. * * The performance of the L2ARC can be tweaked by a number of tunables, which * may be necessary for different workloads: * * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers * l2arc_headroom number of max device writes to precache * l2arc_headroom_boost when we find compressed buffers during ARC * scanning, we multiply headroom by this * percentage factor for the next scan cycle, * since more compressed buffers are likely to * be present * l2arc_feed_secs seconds between L2ARC writing * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. * * There are three key functions that control how the L2ARC warms up: * * l2arc_write_eligible() check if a buffer is eligible to cache * l2arc_write_size() calculate how much to write * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. */ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: * 1. belongs to a different spa. * 2. is already cached on the L2ARC. * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ if (hdr->b_spa != spa_guid) { ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); return (B_FALSE); } if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_BUMP(arcstat_l2_write_in_l2); return (B_FALSE); } if (HDR_IO_IN_PROGRESS(hdr)) { ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); return (B_FALSE); } if (!HDR_L2CACHE(hdr)) { ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); return (B_FALSE); } return (B_TRUE); } static uint64_t l2arc_write_size(void) { uint64_t size; /* * Make sure our globals have meaningful values in case the user * altered them. */ size = l2arc_write_max; if (size == 0) { cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " "be greater than zero, resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } if (arc_warm == B_FALSE) size += l2arc_write_boost; return (size); } static clock_t l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) { clock_t interval, next, now; /* * If the ARC lists are busy, increase our write rate; if the * lists are stale, idle back. This is achieved by checking * how much we previously wrote - if it was more than half of * what we wanted, schedule the next write much sooner. */ if (l2arc_feed_again && wrote > (wanted / 2)) interval = (hz * l2arc_feed_min_ms) / 1000; else interval = hz * l2arc_feed_secs; now = ddi_get_lbolt(); next = MAX(now, MIN(now + interval, began + interval)); return (next); } /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. */ static l2arc_dev_t * l2arc_dev_get_next(void) { l2arc_dev_t *first, *next = NULL; /* * Lock out the removal of spas (spa_namespace_lock), then removal * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. */ mutex_enter(&spa_namespace_lock); mutex_enter(&l2arc_dev_mtx); /* if there are no vdevs, there is nothing to do */ if (l2arc_ndev == 0) goto out; first = NULL; next = l2arc_dev_last; do { /* loop around the list looking for a non-faulted vdev */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL) next = list_head(l2arc_dev_list); } /* if we have come back to the start, bail out */ if (first == NULL) first = next; else if (next == first) break; } while (vdev_is_dead(next->l2ad_vdev)); /* if we were unable to find any usable vdevs, return NULL */ if (vdev_is_dead(next->l2ad_vdev)) next = NULL; l2arc_dev_last = next; out: mutex_exit(&l2arc_dev_mtx); /* * Grab the config lock to prevent the 'next' device from being * removed while we are writing to it. */ if (next != NULL) spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); mutex_exit(&spa_namespace_lock); return (next); } /* * Free buffers that were tagged for destruction. */ static void l2arc_do_free_on_write() { list_t *buflist; l2arc_data_free_t *df, *df_prev; mutex_enter(&l2arc_free_on_write_mtx); buflist = l2arc_free_on_write; for (df = list_tail(buflist); df; df = df_prev) { df_prev = list_prev(buflist, df); ASSERT3P(df->l2df_data, !=, NULL); if (df->l2df_type == ARC_BUFC_METADATA) { zio_buf_free(df->l2df_data, df->l2df_size); } else { ASSERT(df->l2df_type == ARC_BUFC_DATA); zio_data_buf_free(df->l2df_data, df->l2df_size); } list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } mutex_exit(&l2arc_free_on_write_mtx); } /* * A write to a cache device has completed. Update all headers to allow * reads from these buffers to begin. */ static void l2arc_write_done(zio_t *zio) { l2arc_write_callback_t *cb; l2arc_dev_t *dev; list_t *buflist; arc_buf_hdr_t *head, *hdr, *hdr_prev; kmutex_t *hash_lock; int64_t bytes_dropped = 0; cb = zio->io_private; ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; ASSERT3P(head, !=, NULL); buflist = &dev->l2ad_buflist; ASSERT3P(buflist, !=, NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); if (zio->io_error != 0) ARCSTAT_BUMP(arcstat_l2_writes_error); /* * All writes completed, or an error was hit. */ top: mutex_enter(&dev->l2ad_mtx); for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. We must retry so we * don't leave the ARC_FLAG_L2_WRITING bit set. */ ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); /* * We don't want to rescan the headers we've * already marked as having been written out, so * we reinsert the head node so we can pick up * where we left off. */ list_remove(buflist, head); list_insert_after(buflist, hdr, head); mutex_exit(&dev->l2ad_mtx); /* * We wait for the hash lock to become available * to try and prevent busy waiting, and increase * the chance we'll be able to acquire the lock * the next time around. */ mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } /* * We could not have been moved into the arc_l2c_only * state while in-flight due to our ARC_FLAG_L2_WRITING * bit being set. Let's just ensure that's being enforced. */ ASSERT(HDR_HAS_L1HDR(hdr)); if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); l2arc_trim(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); ARCSTAT_INCR(arcstat_l2_asize, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr)); bytes_dropped += arc_hdr_size(hdr); (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. */ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); kmem_free(cb, sizeof (l2arc_write_callback_t)); } /* * A read to a cache device completed. Validate buffer contents before * handing over to the regular ARC routines. */ static void l2arc_read_done(zio_t *zio) { l2arc_read_callback_t *cb; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; boolean_t valid_cksum; ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); cb = zio->io_private; ASSERT3P(cb, !=, NULL); hdr = cb->l2rcb_hdr; ASSERT3P(hdr, !=, NULL); hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* * If the data was read into a temporary buffer, * move it and free the buffer. */ if (cb->l2rcb_data != NULL) { ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); if (zio->io_error == 0) { bcopy(cb->l2rcb_data, hdr->b_l1hdr.b_pdata, arc_hdr_size(hdr)); } /* * The following must be done regardless of whether * there was an error: * - free the temporary buffer * - point zio to the real ARC buffer * - set zio size accordingly * These are required because zio is either re-used for * an I/O of the block in the case of the error * or the zio is passed to arc_read_done() and it * needs real data. */ zio_data_buf_free(cb->l2rcb_data, zio->io_size); zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_pdata; } ASSERT3P(zio->io_data, !=, NULL); /* * Check this survived the L2ARC journey. */ ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ valid_cksum = arc_cksum_is_equal(hdr, zio); if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); zio->io_private = hdr; arc_read_done(zio); } else { mutex_exit(hash_lock); /* * Buffer didn't survive caching. Increment stats and * reissue to the original storage device. */ if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { zio->io_error = SET_ERROR(EIO); } if (!valid_cksum) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* * If there's no waiter, issue an async i/o to the primary * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ if (zio->io_waiter == NULL) { zio_t *pio = zio_unique_parent(zio); ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); } } kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * This is the list priority from which the L2ARC will search for pages to * cache. This is used within loops (0..3) to cycle through lists in the * desired order. This order can have a significant effect on cache * performance. * * Currently the metadata lists are hit first, MFU then MRU, followed by * the data lists. This function returns a locked list, and also returns * the lock pointer. */ static multilist_sublist_t * l2arc_sublist_lock(int list_num) { multilist_t *ml = NULL; unsigned int idx; ASSERT(list_num >= 0 && list_num <= 3); switch (list_num) { case 0: ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; break; } /* * Return a randomly-selected sublist. This is acceptable * because the caller feeds only a little bit of data for each * call (8MB). Subsequent calls will result in different * sublists being selected. */ idx = multilist_get_random_index(ml); return (multilist_sublist_lock(ml, idx)); } /* * Evict buffers from the device write hand to the distance specified in * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; buflist = &dev->l2ad_buflist; if (!all && dev->l2ad_first) { /* * This is the first sweep through the device. There is * nothing to evict. */ return; } if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { /* * When nearing the end of the device, evict to the end * before the device write hand jumps to the start. */ taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; } DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); top: mutex_enter(&dev->l2ad_mtx); for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. */ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } if (HDR_L2_WRITE_HEAD(hdr)) { /* * We hit a write head node. Leave it for * l2arc_write_done(). */ list_remove(buflist, hdr); mutex_exit(hash_lock); continue; } if (!all && HDR_HAS_L2HDR(hdr) && (hdr->b_l2hdr.b_daddr > taddr || hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. */ mutex_exit(hash_lock); break; } ASSERT(HDR_HAS_L2HDR(hdr)); if (!HDR_HAS_L1HDR(hdr)) { ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_size. */ arc_change_state(arc_anon, hdr, hash_lock); arc_hdr_destroy(hdr); } else { ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); ARCSTAT_BUMP(arcstat_l2_evict_l1cached); /* * Invalidate issued or about to be issued * reads, since we may be about to write * over this location. */ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } /* Ensure this header has finished being written */ ASSERT(!HDR_L2_WRITING(hdr)); arc_hdr_l2hdr_destroy(hdr); } mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); } /* * Find and write ARC buffers to the L2ARC device. * * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. * The headroom_boost is an in-out parameter used to maintain headroom boost * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than * the delta by which the device hand has changed due to alignment). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *hdr, *hdr_prev, *head; uint64_t write_asize, write_psize, write_sz, headroom; boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); int try; ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; write_sz = write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); /* * Copy buffers for L2ARC writing. */ for (try = 0; try <= 3; try++) { multilist_sublist_t *mls = l2arc_sublist_lock(try); uint64_t passed_sz = 0; ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); /* * L2ARC fast warmup. * * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ if (arc_warm == B_FALSE) hdr = multilist_sublist_head(mls); else hdr = multilist_sublist_tail(mls); if (hdr == NULL) ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); headroom = target_sz * l2arc_headroom; if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); else hdr_prev = multilist_sublist_prev(mls, hdr); ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, HDR_GET_LSIZE(hdr)); hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); /* * Skip this buffer rather than waiting. */ continue; } passed_sz += HDR_GET_LSIZE(hdr); if (passed_sz > headroom) { /* * Searched too far. */ mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); break; } if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); continue; } if ((write_asize + HDR_GET_LSIZE(hdr)) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_l2_write_full); break; } if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. */ mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, head); mutex_exit(&dev->l2ad_mtx); cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); ARCSTAT_BUMP(arcstat_l2_write_pios); } hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_daddr = dev->l2ad_hand; arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* * We rely on the L1 portion of the header below, so * it's invalid for this header to have been evicted out * of the ghost cache, prior to being written out. The * ARC_FLAG_L2_WRITING bit ensures this won't happen. */ ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL); ASSERT3U(arc_hdr_size(hdr), >, 0); uint64_t size = arc_hdr_size(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, size); (void) refcount_add_many(&dev->l2ad_alloc, size, hdr); /* * Normally the L2ARC can use the hdr's data, but if * we're sharing data between the hdr and one of its * bufs, L2ARC needs its own copy of the data so that * the ZIO below can't race with the buf consumer. To * ensure that this copy will be available for the * lifetime of the ZIO and be cleaned up afterwards, we * add it to the l2arc_free_on_write queue. */ void *to_write; if (!HDR_SHARED_DATA(hdr) && size == asize) { to_write = hdr->b_l1hdr.b_pdata; } else { arc_buf_contents_t type = arc_buf_type(hdr); if (type == ARC_BUFC_METADATA) { to_write = zio_buf_alloc(asize); } else { ASSERT3U(type, ==, ARC_BUFC_DATA); to_write = zio_data_buf_alloc(asize); } bcopy(hdr->b_l1hdr.b_pdata, to_write, size); if (asize != size) bzero(to_write + size, asize - size); l2arc_free_data_on_write(to_write, asize, type); } wzio = zio_write_phys(pio, dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); write_sz += HDR_GET_LSIZE(hdr); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); write_asize += size; write_psize += asize; dev->l2ad_hand += asize; mutex_exit(hash_lock); (void) zio_nowait(wzio); } multilist_sublist_unlock(mls); if (full == B_TRUE) break; } /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_sz); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); return (0); } ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); ARCSTAT_INCR(arcstat_l2_size, write_sz); ARCSTAT_INCR(arcstat_l2_asize, write_asize); vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0); /* * Bump device hand to the device start if it is approaching the end. * l2arc_evict() will already have evicted ahead for this case. */ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { dev->l2ad_hand = dev->l2ad_start; dev->l2ad_first = B_FALSE; } dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; return (write_asize); } /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ static void l2arc_feed_thread(void *dummy __unused) { callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, next - ddi_get_lbolt()); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. */ mutex_enter(&l2arc_dev_mtx); if (l2arc_ndev == 0) { mutex_exit(&l2arc_dev_mtx); continue; } mutex_exit(&l2arc_dev_mtx); begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in * doing so the next spa to feed from: dev->l2ad_spa. This * will return NULL if there are now no l2arc devices or if * they are all faulted. * * If a device is returned, its spa's config lock is also * held to prevent device removal. l2arc_dev_get_next() * will grab and release l2arc_dev_mtx. */ if ((dev = l2arc_dev_get_next()) == NULL) continue; spa = dev->l2ad_spa; ASSERT3P(spa, !=, NULL); /* * If the pool is read-only then force the feed thread to * sleep a little longer. */ if (!spa_writeable(spa)) { next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; spa_config_exit(spa, SCL_L2ARC, dev); continue; } /* * Avoid contributing to memory pressure. */ if (arc_reclaim_needed()) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; } ARCSTAT_BUMP(arcstat_l2_feeds); size = l2arc_write_size(); /* * Evict L2ARC buffers that will be overwritten. */ l2arc_evict(dev, size, B_FALSE); /* * Write ARC buffers. */ wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. */ next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } l2arc_thread_exit = 0; cv_broadcast(&l2arc_feed_thr_cv); CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ thread_exit(); } boolean_t l2arc_vdev_present(vdev_t *vd) { l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; dev = list_next(l2arc_dev_list, dev)) { if (dev->l2ad_vdev == vd) break; } mutex_exit(&l2arc_dev_mtx); return (dev != NULL); } /* * Add a vdev for use by the L2ARC. By this point the spa has already * validated the vdev and opened it. */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; ASSERT(!l2arc_vdev_present(vd)); vdev_ashift_optimize(vd); /* * Create a new l2arc device entry. */ adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; adddev->l2ad_start = VDEV_LABEL_START_SIZE; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* * This is a list of all ARC buffers that are still valid on the * device. */ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); refcount_create(&adddev->l2ad_alloc); /* * Add device to global list */ mutex_enter(&l2arc_dev_mtx); list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); } /* * Remove a vdev from the L2ARC. */ void l2arc_remove_vdev(vdev_t *vd) { l2arc_dev_t *dev, *nextdev, *remdev = NULL; /* * Find the device by vdev */ mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { nextdev = list_next(l2arc_dev_list, dev); if (vd == dev->l2ad_vdev) { remdev = dev; break; } } ASSERT3P(remdev, !=, NULL); /* * Remove device from global list */ list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); /* * Clear all buflists and ARC references. L2ARC device flush. */ l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); mutex_destroy(&remdev->l2ad_mtx); refcount_destroy(&remdev->l2ad_alloc); kmem_free(remdev, sizeof (l2arc_dev_t)); } void l2arc_init(void) { l2arc_thread_exit = 0; l2arc_ndev = 0; l2arc_writes_sent = 0; l2arc_writes_done = 0; mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; l2arc_free_on_write = &L2ARC_free_on_write; list_create(l2arc_dev_list, sizeof (l2arc_dev_t), offsetof(l2arc_dev_t, l2ad_node)); list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), offsetof(l2arc_data_free_t, l2df_list_node)); } void l2arc_fini(void) { /* * This is called from dmu_fini(), which is called from spa_fini(); * Because of this, we can assume that all l2arc devices have * already been removed when the pools themselves were removed. */ l2arc_do_free_on_write(); mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list); list_destroy(l2arc_free_on_write); } void l2arc_start(void) { if (!(spa_mode_global & FWRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, TS_RUN, minclsyspri); } void l2arc_stop(void) { if (!(spa_mode_global & FWRITE)) return; mutex_enter(&l2arc_feed_thr_lock); cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ l2arc_thread_exit = 1; while (l2arc_thread_exit != 0) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } Index: head/sys/cddl/dev/dtrace/aarch64/dtrace_subr.c =================================================================== --- head/sys/cddl/dev/dtrace/aarch64/dtrace_subr.c (revision 308456) +++ head/sys/cddl/dev/dtrace/aarch64/dtrace_subr.c (revision 308457) @@ -1,310 +1,310 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END * * $FreeBSD$ * */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern dtrace_id_t dtrace_probeid_error; extern int (*dtrace_invop_jump_addr)(struct trapframe *); extern void dtrace_getnanotime(struct timespec *tsp); int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); void dtrace_invop_init(void); void dtrace_invop_uninit(void); typedef struct dtrace_invop_hdlr { int (*dtih_func)(uintptr_t, struct trapframe *, uintptr_t); struct dtrace_invop_hdlr *dtih_next; } dtrace_invop_hdlr_t; dtrace_invop_hdlr_t *dtrace_invop_hdlr; int dtrace_invop(uintptr_t addr, struct trapframe *frame, uintptr_t eax) { dtrace_invop_hdlr_t *hdlr; int rval; for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next) if ((rval = hdlr->dtih_func(addr, frame, eax)) != 0) return (rval); return (0); } void dtrace_invop_add(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr; hdlr = kmem_alloc(sizeof (dtrace_invop_hdlr_t), KM_SLEEP); hdlr->dtih_func = func; hdlr->dtih_next = dtrace_invop_hdlr; dtrace_invop_hdlr = hdlr; } void dtrace_invop_remove(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr, *prev; hdlr = dtrace_invop_hdlr; prev = NULL; for (;;) { if (hdlr == NULL) panic("attempt to remove non-existent invop handler"); if (hdlr->dtih_func == func) break; prev = hdlr; hdlr = hdlr->dtih_next; } if (prev == NULL) { ASSERT(dtrace_invop_hdlr == hdlr); dtrace_invop_hdlr = hdlr->dtih_next; } else { ASSERT(dtrace_invop_hdlr != hdlr); prev->dtih_next = hdlr->dtih_next; } kmem_free(hdlr, 0); } /*ARGSUSED*/ void dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) { (*func)(0, (uintptr_t)VM_MIN_KERNEL_ADDRESS); } void dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg) { cpuset_t cpus; if (cpu == DTRACE_CPUALL) cpus = all_cpus; else CPU_SETOF(cpu, &cpus); smp_rendezvous_cpus(cpus, smp_no_rendevous_barrier, func, smp_no_rendevous_barrier, arg); } static void dtrace_sync_func(void) { } void dtrace_sync(void) { dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL); } /* * DTrace needs a high resolution time function which can * be called from a probe context and guaranteed not to have * instrumented with probes itself. * * Returns nanoseconds since boot. */ uint64_t dtrace_gethrtime() { struct timespec curtime; nanouptime(&curtime); return (curtime.tv_sec * 1000000000UL + curtime.tv_nsec); } uint64_t dtrace_gethrestime(void) { struct timespec current_time; dtrace_getnanotime(¤t_time); return (current_time.tv_sec * 1000000000UL + current_time.tv_nsec); } /* Function to handle DTrace traps during probes. See arm64/arm64/trap.c */ int dtrace_trap(struct trapframe *frame, u_int type) { /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets - * a flag in it's per-cpu flags to indicate that it doesn't + * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * Check if DTrace has enabled 'no-fault' mode: * */ if ((cpu_core[curcpu].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) { /* * There are only a couple of trap types that are expected. * All the rest will be handled in the usual way. */ switch (type) { case EXCP_DATA_ABORT: /* Flag a bad address. */ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; cpu_core[curcpu].cpuc_dtrace_illval = 0; /* * Offset the instruction pointer to the instruction * following the one causing the fault. */ frame->tf_elr += 4; return (1); default: /* Handle all other traps in the usual way. */ break; } } /* Handle the trap in the usual way. */ return (0); } void dtrace_probe_error(dtrace_state_t *state, dtrace_epid_t epid, int which, int fault, int fltoffs, uintptr_t illval) { dtrace_probe(dtrace_probeid_error, (uint64_t)(uintptr_t)state, (uintptr_t)epid, (uintptr_t)which, (uintptr_t)fault, (uintptr_t)fltoffs); } static int dtrace_invop_start(struct trapframe *frame) { int data, invop, reg, update_sp; register_t arg1, arg2; register_t *sp; int offs; int tmp; int i; invop = dtrace_invop(frame->tf_elr, frame, frame->tf_elr); tmp = (invop & LDP_STP_MASK); if (tmp == STP_64 || tmp == LDP_64) { sp = (register_t *)frame->tf_sp; data = invop; arg1 = (data >> ARG1_SHIFT) & ARG1_MASK; arg2 = (data >> ARG2_SHIFT) & ARG2_MASK; offs = (data >> OFFSET_SHIFT) & OFFSET_MASK; switch (tmp) { case STP_64: if (offs >> (OFFSET_SIZE - 1)) sp -= (~offs & OFFSET_MASK) + 1; else sp += (offs); *(sp + 0) = frame->tf_x[arg1]; *(sp + 1) = frame->tf_x[arg2]; break; case LDP_64: frame->tf_x[arg1] = *(sp + 0); frame->tf_x[arg2] = *(sp + 1); if (offs >> (OFFSET_SIZE - 1)) sp -= (~offs & OFFSET_MASK) + 1; else sp += (offs); break; default: break; } /* Update the stack pointer and program counter to continue */ frame->tf_sp = (register_t)sp; frame->tf_elr += INSN_SIZE; return (0); } if ((invop & B_MASK) == B_INSTR) { data = (invop & B_DATA_MASK); /* The data is the number of 4-byte words to change the pc */ data *= 4; frame->tf_elr += data; return (0); } if (invop == RET_INSTR) { frame->tf_elr = frame->tf_lr; return (0); } return (-1); } void dtrace_invop_init(void) { dtrace_invop_jump_addr = dtrace_invop_start; } void dtrace_invop_uninit(void) { dtrace_invop_jump_addr = 0; } Index: head/sys/cddl/dev/dtrace/arm/dtrace_subr.c =================================================================== --- head/sys/cddl/dev/dtrace/arm/dtrace_subr.c (revision 308456) +++ head/sys/cddl/dev/dtrace/arm/dtrace_subr.c (revision 308457) @@ -1,345 +1,345 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END * * $FreeBSD$ * */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DELAYBRANCH(x) ((int)(x) < 0) #define BIT_PC 15 #define BIT_LR 14 #define BIT_SP 13 extern dtrace_id_t dtrace_probeid_error; extern int (*dtrace_invop_jump_addr)(struct trapframe *); extern void dtrace_getnanotime(struct timespec *tsp); int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); void dtrace_invop_init(void); void dtrace_invop_uninit(void); typedef struct dtrace_invop_hdlr { int (*dtih_func)(uintptr_t, struct trapframe *, uintptr_t); struct dtrace_invop_hdlr *dtih_next; } dtrace_invop_hdlr_t; dtrace_invop_hdlr_t *dtrace_invop_hdlr; int dtrace_invop(uintptr_t addr, struct trapframe *frame, uintptr_t eax) { dtrace_invop_hdlr_t *hdlr; int rval; for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next) if ((rval = hdlr->dtih_func(addr, frame, eax)) != 0) return (rval); return (0); } void dtrace_invop_add(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr; hdlr = kmem_alloc(sizeof (dtrace_invop_hdlr_t), KM_SLEEP); hdlr->dtih_func = func; hdlr->dtih_next = dtrace_invop_hdlr; dtrace_invop_hdlr = hdlr; } void dtrace_invop_remove(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr = dtrace_invop_hdlr, *prev = NULL; for (;;) { if (hdlr == NULL) panic("attempt to remove non-existent invop handler"); if (hdlr->dtih_func == func) break; prev = hdlr; hdlr = hdlr->dtih_next; } if (prev == NULL) { ASSERT(dtrace_invop_hdlr == hdlr); dtrace_invop_hdlr = hdlr->dtih_next; } else { ASSERT(dtrace_invop_hdlr != hdlr); prev->dtih_next = hdlr->dtih_next; } kmem_free(hdlr, 0); } /*ARGSUSED*/ void dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) { printf("IMPLEMENT ME: dtrace_toxic_ranges\n"); } void dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg) { cpuset_t cpus; if (cpu == DTRACE_CPUALL) cpus = all_cpus; else CPU_SETOF(cpu, &cpus); smp_rendezvous_cpus(cpus, smp_no_rendevous_barrier, func, smp_no_rendevous_barrier, arg); } static void dtrace_sync_func(void) { } void dtrace_sync(void) { dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL); } /* * DTrace needs a high resolution time function which can * be called from a probe context and guaranteed not to have * instrumented with probes itself. * * Returns nanoseconds since boot. */ uint64_t dtrace_gethrtime() { struct timespec curtime; nanouptime(&curtime); return (curtime.tv_sec * 1000000000UL + curtime.tv_nsec); } uint64_t dtrace_gethrestime(void) { struct timespec current_time; dtrace_getnanotime(¤t_time); return (current_time.tv_sec * 1000000000UL + current_time.tv_nsec); } /* Function to handle DTrace traps during probes. See amd64/amd64/trap.c */ int dtrace_trap(struct trapframe *frame, u_int type) { /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets - * a flag in it's per-cpu flags to indicate that it doesn't + * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * Check if DTrace has enabled 'no-fault' mode: * */ if ((cpu_core[curcpu].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) { /* * There are only a couple of trap types that are expected. * All the rest will be handled in the usual way. */ switch (type) { /* Page fault. */ case FAULT_ALIGN: /* Flag a bad address. */ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; cpu_core[curcpu].cpuc_dtrace_illval = 0; /* * Offset the instruction pointer to the instruction * following the one causing the fault. */ frame->tf_pc += sizeof(int); return (1); default: /* Handle all other traps in the usual way. */ break; } } /* Handle the trap in the usual way. */ return (0); } void dtrace_probe_error(dtrace_state_t *state, dtrace_epid_t epid, int which, int fault, int fltoffs, uintptr_t illval) { dtrace_probe(dtrace_probeid_error, (uint64_t)(uintptr_t)state, (uintptr_t)epid, (uintptr_t)which, (uintptr_t)fault, (uintptr_t)fltoffs); } static int dtrace_invop_start(struct trapframe *frame) { register_t *r0, *sp; int data, invop, reg, update_sp; invop = dtrace_invop(frame->tf_pc, frame, frame->tf_pc); switch (invop & DTRACE_INVOP_MASK) { case DTRACE_INVOP_PUSHM: sp = (register_t *)frame->tf_svc_sp; r0 = &frame->tf_r0; data = DTRACE_INVOP_DATA(invop); /* * Store the pc, lr, and sp. These have their own * entries in the struct. */ if (data & (1 << BIT_PC)) { sp--; *sp = frame->tf_pc; } if (data & (1 << BIT_LR)) { sp--; *sp = frame->tf_svc_lr; } if (data & (1 << BIT_SP)) { sp--; *sp = frame->tf_svc_sp; } /* Store the general registers */ for (reg = 12; reg >= 0; reg--) { if (data & (1 << reg)) { sp--; *sp = r0[reg]; } } /* Update the stack pointer and program counter to continue */ frame->tf_svc_sp = (register_t)sp; frame->tf_pc += 4; break; case DTRACE_INVOP_POPM: sp = (register_t *)frame->tf_svc_sp; r0 = &frame->tf_r0; data = DTRACE_INVOP_DATA(invop); /* Read the general registers */ for (reg = 0; reg <= 12; reg++) { if (data & (1 << reg)) { r0[reg] = *sp; sp++; } } /* * Set the stack pointer. If we don't update it here we will * need to update it at the end as the instruction would do */ update_sp = 1; if (data & (1 << BIT_SP)) { frame->tf_svc_sp = *sp; *sp++; update_sp = 0; } /* Update the link register, we need to use the correct copy */ if (data & (1 << BIT_LR)) { frame->tf_svc_lr = *sp; *sp++; } /* * And the program counter. If it's not in the list skip over * it when we return so to not hit this again. */ if (data & (1 << BIT_PC)) { frame->tf_pc = *sp; *sp++; } else frame->tf_pc += 4; /* Update the stack pointer if we haven't already done so */ if (update_sp) frame->tf_svc_sp = (register_t)sp; break; case DTRACE_INVOP_B: data = DTRACE_INVOP_DATA(invop) & 0x00ffffff; /* Sign extend the data */ if ((data & (1 << 23)) != 0) data |= 0xff000000; /* The data is the number of 4-byte words to change the pc */ data *= 4; data += 8; frame->tf_pc += data; break; default: return (-1); break; } return (0); } void dtrace_invop_init(void) { dtrace_invop_jump_addr = dtrace_invop_start; } void dtrace_invop_uninit(void) { dtrace_invop_jump_addr = 0; } Index: head/sys/cddl/dev/dtrace/riscv/dtrace_subr.c =================================================================== --- head/sys/cddl/dev/dtrace/riscv/dtrace_subr.c (revision 308456) +++ head/sys/cddl/dev/dtrace/riscv/dtrace_subr.c (revision 308457) @@ -1,282 +1,282 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END * * Portions Copyright 2016 Ruslan Bukin * * $FreeBSD$ * */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern dtrace_id_t dtrace_probeid_error; extern int (*dtrace_invop_jump_addr)(struct trapframe *); extern void dtrace_getnanotime(struct timespec *tsp); int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); void dtrace_invop_init(void); void dtrace_invop_uninit(void); typedef struct dtrace_invop_hdlr { int (*dtih_func)(uintptr_t, struct trapframe *, uintptr_t); struct dtrace_invop_hdlr *dtih_next; } dtrace_invop_hdlr_t; dtrace_invop_hdlr_t *dtrace_invop_hdlr; int dtrace_invop(uintptr_t addr, struct trapframe *frame, uintptr_t eax) { dtrace_invop_hdlr_t *hdlr; int rval; for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next) if ((rval = hdlr->dtih_func(addr, frame, eax)) != 0) return (rval); return (0); } void dtrace_invop_add(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr; hdlr = kmem_alloc(sizeof (dtrace_invop_hdlr_t), KM_SLEEP); hdlr->dtih_func = func; hdlr->dtih_next = dtrace_invop_hdlr; dtrace_invop_hdlr = hdlr; } void dtrace_invop_remove(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr, *prev; hdlr = dtrace_invop_hdlr; prev = NULL; for (;;) { if (hdlr == NULL) panic("attempt to remove non-existent invop handler"); if (hdlr->dtih_func == func) break; prev = hdlr; hdlr = hdlr->dtih_next; } if (prev == NULL) { ASSERT(dtrace_invop_hdlr == hdlr); dtrace_invop_hdlr = hdlr->dtih_next; } else { ASSERT(dtrace_invop_hdlr != hdlr); prev->dtih_next = hdlr->dtih_next; } kmem_free(hdlr, 0); } /*ARGSUSED*/ void dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) { (*func)(0, (uintptr_t)VM_MIN_KERNEL_ADDRESS); } void dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg) { cpuset_t cpus; if (cpu == DTRACE_CPUALL) cpus = all_cpus; else CPU_SETOF(cpu, &cpus); smp_rendezvous_cpus(cpus, smp_no_rendevous_barrier, func, smp_no_rendevous_barrier, arg); } static void dtrace_sync_func(void) { } void dtrace_sync(void) { dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL); } /* * DTrace needs a high resolution time function which can * be called from a probe context and guaranteed not to have * instrumented with probes itself. * * Returns nanoseconds since boot. */ uint64_t dtrace_gethrtime() { struct timespec curtime; nanouptime(&curtime); return (curtime.tv_sec * 1000000000UL + curtime.tv_nsec); } uint64_t dtrace_gethrestime(void) { struct timespec current_time; dtrace_getnanotime(¤t_time); return (current_time.tv_sec * 1000000000UL + current_time.tv_nsec); } /* Function to handle DTrace traps during probes. See riscv/riscv/trap.c */ int dtrace_trap(struct trapframe *frame, u_int type) { /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets - * a flag in it's per-cpu flags to indicate that it doesn't + * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * Check if DTrace has enabled 'no-fault' mode: * */ if ((cpu_core[curcpu].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) { /* * There are only a couple of trap types that are expected. * All the rest will be handled in the usual way. */ switch (type) { case EXCP_FAULT_LOAD: case EXCP_FAULT_STORE: case EXCP_FAULT_FETCH: /* Flag a bad address. */ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; cpu_core[curcpu].cpuc_dtrace_illval = 0; /* * Offset the instruction pointer to the instruction * following the one causing the fault. */ frame->tf_sepc += 4; return (1); default: /* Handle all other traps in the usual way. */ break; } } /* Handle the trap in the usual way. */ return (0); } void dtrace_probe_error(dtrace_state_t *state, dtrace_epid_t epid, int which, int fault, int fltoffs, uintptr_t illval) { dtrace_probe(dtrace_probeid_error, (uint64_t)(uintptr_t)state, (uintptr_t)epid, (uintptr_t)which, (uintptr_t)fault, (uintptr_t)fltoffs); } static int dtrace_invop_start(struct trapframe *frame) { int data, invop, reg, update_sp; register_t arg1, arg2; register_t *sp; uint32_t imm; InstFmt i; int offs; int tmp; invop = dtrace_invop(frame->tf_sepc, frame, frame->tf_sepc); if (invop == RISCV_INSN_RET) { frame->tf_sepc = frame->tf_ra; return (0); } if ((invop & SD_RA_SP_MASK) == SD_RA_SP) { i.word = invop; imm = i.SType.imm0_4 | (i.SType.imm5_11 << 5); sp = (register_t *)((uint8_t *)frame->tf_sp + imm); *sp = frame->tf_ra; frame->tf_sepc += INSN_SIZE; return (0); } return (-1); } void dtrace_invop_init(void) { dtrace_invop_jump_addr = dtrace_invop_start; } void dtrace_invop_uninit(void) { dtrace_invop_jump_addr = 0; } Index: head/sys/contrib/dev/ath/ath_hal/ar9300/ar9300_eeprom.c =================================================================== --- head/sys/contrib/dev/ath/ath_hal/ar9300/ar9300_eeprom.c (revision 308456) +++ head/sys/contrib/dev/ath/ath_hal/ar9300/ar9300_eeprom.c (revision 308457) @@ -1,4787 +1,4787 @@ /* * Copyright (c) 2013 Qualcomm Atheros, Inc. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR * PERFORMANCE OF THIS SOFTWARE. */ #include "opt_ah.h" #include "ah.h" #include "ah_internal.h" #include "ah_devid.h" #ifdef AH_DEBUG #include "ah_desc.h" /* NB: for HAL_PHYERR* */ #endif #include "ar9300/ar9300.h" #include "ar9300/ar9300eep.h" #include "ar9300/ar9300template_generic.h" #include "ar9300/ar9300template_xb112.h" #include "ar9300/ar9300template_hb116.h" #include "ar9300/ar9300template_xb113.h" #include "ar9300/ar9300template_hb112.h" #include "ar9300/ar9300template_ap121.h" #include "ar9300/ar9300template_osprey_k31.h" #include "ar9300/ar9300template_wasp_2.h" #include "ar9300/ar9300template_wasp_k31.h" #include "ar9300/ar9300template_aphrodite.h" #include "ar9300/ar9300reg.h" #include "ar9300/ar9300phy.h" #if AH_BYTE_ORDER == AH_BIG_ENDIAN void ar9300_swap_eeprom(ar9300_eeprom_t *eep); void ar9300_eeprom_template_swap(void); #endif static u_int16_t ar9300_eeprom_get_spur_chan(struct ath_hal *ah, int spur_chan, HAL_BOOL is_2ghz); #ifdef UNUSED static inline HAL_BOOL ar9300_fill_eeprom(struct ath_hal *ah); static inline HAL_STATUS ar9300_check_eeprom(struct ath_hal *ah); #endif static ar9300_eeprom_t *default9300[] = { &ar9300_template_generic, &ar9300_template_xb112, &ar9300_template_hb116, &ar9300_template_hb112, &ar9300_template_xb113, &ar9300_template_ap121, &ar9300_template_wasp_2, &ar9300_template_wasp_k31, &ar9300_template_osprey_k31, &ar9300_template_aphrodite, }; /* * Different types of memory where the calibration data might be stored. * All types are searched in ar9300_eeprom_restore() * in the order flash, eeprom, otp. * To disable searching a type, set its parameter to 0. */ /* * This is where we look for the calibration data. * must be set before ath_attach() is called */ static int calibration_data_try = calibration_data_none; static int calibration_data_try_address = 0; /* * Set the type of memory used to store calibration data. * Used by nart to force reading/writing of a specific type. * The driver can normally allow autodetection * by setting source to calibration_data_none=0. */ void ar9300_calibration_data_set(struct ath_hal *ah, int32_t source) { if (ah != 0) { AH9300(ah)->calibration_data_source = source; } else { calibration_data_try = source; } } int32_t ar9300_calibration_data_get(struct ath_hal *ah) { if (ah != 0) { return AH9300(ah)->calibration_data_source; } else { return calibration_data_try; } } /* * Set the address of first byte used to store calibration data. * Used by nart to force reading/writing at a specific address. * The driver can normally allow autodetection by setting size=0. */ void ar9300_calibration_data_address_set(struct ath_hal *ah, int32_t size) { if (ah != 0) { AH9300(ah)->calibration_data_source_address = size; } else { calibration_data_try_address = size; } } int32_t ar9300_calibration_data_address_get(struct ath_hal *ah) { if (ah != 0) { return AH9300(ah)->calibration_data_source_address; } else { return calibration_data_try_address; } } /* * This is the template that is loaded if ar9300_eeprom_restore() * can't find valid data in the memory. */ static int Ar9300_eeprom_template_preference = ar9300_eeprom_template_generic; void ar9300_eeprom_template_preference(int32_t value) { Ar9300_eeprom_template_preference = value; } /* * Install the specified default template. * Overwrites any existing calibration and configuration information in memory. */ int32_t ar9300_eeprom_template_install(struct ath_hal *ah, int32_t value) { struct ath_hal_9300 *ahp = AH9300(ah); ar9300_eeprom_t *mptr, *dptr; int mdata_size; mptr = &ahp->ah_eeprom; mdata_size = ar9300_eeprom_struct_size(); if (mptr != 0) { #if 0 calibration_data_source = calibration_data_none; calibration_data_source_address = 0; #endif dptr = ar9300_eeprom_struct_default_find_by_id(value); if (dptr != 0) { OS_MEMCPY(mptr, dptr, mdata_size); return 0; } } return -1; } static int ar9300_eeprom_restore_something(struct ath_hal *ah, ar9300_eeprom_t *mptr, int mdata_size) { int it; ar9300_eeprom_t *dptr; int nptr; nptr = -1; /* * if we didn't find any blocks in the memory, * put the prefered template in place */ if (nptr < 0) { AH9300(ah)->calibration_data_source = calibration_data_none; AH9300(ah)->calibration_data_source_address = 0; dptr = ar9300_eeprom_struct_default_find_by_id( Ar9300_eeprom_template_preference); if (dptr != 0) { OS_MEMCPY(mptr, dptr, mdata_size); nptr = 0; } } /* * if we didn't find the prefered one, * put the normal default template in place */ if (nptr < 0) { AH9300(ah)->calibration_data_source = calibration_data_none; AH9300(ah)->calibration_data_source_address = 0; dptr = ar9300_eeprom_struct_default_find_by_id( ar9300_eeprom_template_default); if (dptr != 0) { OS_MEMCPY(mptr, dptr, mdata_size); nptr = 0; } } /* * if we can't find the best template, put any old template in place * presume that newer ones are better, so search backwards */ if (nptr < 0) { AH9300(ah)->calibration_data_source = calibration_data_none; AH9300(ah)->calibration_data_source_address = 0; for (it = ar9300_eeprom_struct_default_many() - 1; it >= 0; it--) { dptr = ar9300_eeprom_struct_default(it); if (dptr != 0) { OS_MEMCPY(mptr, dptr, mdata_size); nptr = 0; break; } } } return nptr; } /* * Read 16 bits of data from offset into *data */ HAL_BOOL ar9300_eeprom_read_word(struct ath_hal *ah, u_int off, u_int16_t *data) { if (AR_SREV_OSPREY(ah) || AR_SREV_POSEIDON(ah)) { (void) OS_REG_READ(ah, AR9300_EEPROM_OFFSET + (off << AR9300_EEPROM_S)); if (!ath_hal_wait(ah, AR_HOSTIF_REG(ah, AR_EEPROM_STATUS_DATA), AR_EEPROM_STATUS_DATA_BUSY | AR_EEPROM_STATUS_DATA_PROT_ACCESS, 0)) { return AH_FALSE; } *data = MS(OS_REG_READ(ah, AR_HOSTIF_REG(ah, AR_EEPROM_STATUS_DATA)), AR_EEPROM_STATUS_DATA_VAL); return AH_TRUE; } else { *data = 0; return AH_FALSE; } } HAL_BOOL ar9300_otp_read(struct ath_hal *ah, u_int off, u_int32_t *data, HAL_BOOL is_wifi) { int time_out = 1000; int status = 0; u_int32_t addr; if (AR_SREV_HONEYBEE(ah)){ /* no OTP for Honeybee */ return false; } addr = (AR_SREV_WASP(ah) || AR_SREV_SCORPION(ah))? OTP_MEM_START_ADDRESS_WASP : OTP_MEM_START_ADDRESS; if (!is_wifi) { addr = BTOTP_MEM_START_ADDRESS; } addr += off * 4; /* OTP is 32 bit addressable */ (void) OS_REG_READ(ah, addr); addr = (AR_SREV_WASP(ah) || AR_SREV_SCORPION(ah)) ? OTP_STATUS0_OTP_SM_BUSY_WASP : OTP_STATUS0_OTP_SM_BUSY; if (!is_wifi) { addr = BTOTP_STATUS0_OTP_SM_BUSY; } while ((time_out > 0) && (!status)) { /* wait for access complete */ /* Read data valid, access not busy, sm not busy */ status = ((OS_REG_READ(ah, addr) & 0x7) == 0x4) ? 1 : 0; time_out--; } if (time_out == 0) { HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: Timed out during OTP Status0 validation\n", __func__); return AH_FALSE; } addr = (AR_SREV_WASP(ah) || AR_SREV_SCORPION(ah)) ? OTP_STATUS1_EFUSE_READ_DATA_WASP : OTP_STATUS1_EFUSE_READ_DATA; if (!is_wifi) { addr = BTOTP_STATUS1_EFUSE_READ_DATA; } *data = OS_REG_READ(ah, addr); return AH_TRUE; } static HAL_STATUS ar9300_flash_map(struct ath_hal *ah) { /* XXX disable flash remapping for now (ie, SoC support) */ ath_hal_printf(ah, "%s: unimplemented for now\n", __func__); #if 0 struct ath_hal_9300 *ahp = AH9300(ah); #if defined(AR9100) || defined(__NetBSD__) ahp->ah_cal_mem = OS_REMAP(ah, AR9300_EEPROM_START_ADDR, AR9300_EEPROM_MAX); #else ahp->ah_cal_mem = OS_REMAP((uintptr_t)(AH_PRIVATE(ah)->ah_st), (AR9300_EEPROM_MAX + AR9300_FLASH_CAL_START_OFFSET)); #endif if (!ahp->ah_cal_mem) { HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: cannot remap eeprom region \n", __func__); return HAL_EIO; } #endif return HAL_OK; } HAL_BOOL ar9300_flash_read(struct ath_hal *ah, u_int off, u_int16_t *data) { struct ath_hal_9300 *ahp = AH9300(ah); *data = ((u_int16_t *)ahp->ah_cal_mem)[off]; return AH_TRUE; } HAL_BOOL ar9300_flash_write(struct ath_hal *ah, u_int off, u_int16_t data) { struct ath_hal_9300 *ahp = AH9300(ah); ((u_int16_t *)ahp->ah_cal_mem)[off] = data; return AH_TRUE; } HAL_STATUS ar9300_eeprom_attach(struct ath_hal *ah) { struct ath_hal_9300 *ahp = AH9300(ah); ahp->try_dram = 1; ahp->try_eeprom = 1; ahp->try_otp = 1; #ifdef ATH_CAL_NAND_FLASH ahp->try_nand = 1; #else ahp->try_flash = 1; #endif ahp->calibration_data_source = calibration_data_none; ahp->calibration_data_source_address = 0; ahp->calibration_data_try = calibration_data_try; ahp->calibration_data_try_address = 0; /* * In case flash will be used for EEPROM. Otherwise ahp->ah_cal_mem * must be set to NULL or the real EEPROM address. */ ar9300_flash_map(ah); /* * ###### This function always return NO SPUR. * This is not true for many board designs. * Does anyone use this? */ AH_PRIVATE(ah)->ah_getSpurChan = ar9300_eeprom_get_spur_chan; #ifdef OLDCODE /* XXX Needs to be moved for dynamic selection */ ahp->ah_eeprom = *(default9300[ar9300_eeprom_template_default]); if (AR_SREV_HORNET(ah)) { /* Set default values for Hornet. */ ahp->ah_eeprom.base_eep_header.op_cap_flags.op_flags = AR9300_OPFLAGS_11G; ahp->ah_eeprom.base_eep_header.txrx_mask = 0x11; } else if (AR_SREV_POSEIDON(ah)) { /* Set default values for Poseidon. */ ahp->ah_eeprom.base_eep_header.op_cap_flags.op_flags = AR9300_OPFLAGS_11G; ahp->ah_eeprom.base_eep_header.txrx_mask = 0x11; } if (AH_PRIVATE(ah)->ah_config.ath_hal_skip_eeprom_read) { ahp->ah_emu_eeprom = 1; return HAL_OK; } ahp->ah_emu_eeprom = 1; #ifdef UNUSED #endif if (!ar9300_fill_eeprom(ah)) { return HAL_EIO; } return HAL_OK; /* return ar9300_check_eeprom(ah); */ #else ahp->ah_emu_eeprom = 1; #if 0 /*#ifdef MDK_AP*/ /* MDK_AP is defined only in NART AP build */ u_int8_t buffer[10]; int caldata_check = 0; ar9300_calibration_data_read_flash( ah, FLASH_BASE_CALDATA_OFFSET, buffer, 4); printf("flash caldata:: %x\n", buffer[0]); if (buffer[0] != 0xff) { caldata_check = 1; } if (!caldata_check) { ar9300_eeprom_t *mptr; int mdata_size; if (AR_SREV_HORNET(ah)) { /* XXX: For initial testing */ mptr = &ahp->ah_eeprom; mdata_size = ar9300_eeprom_struct_size(); ahp->ah_eeprom = ar9300_template_ap121; ahp->ah_emu_eeprom = 1; /* need it to let art save in to flash ????? */ calibration_data_source = calibration_data_flash; } else if (AR_SREV_WASP(ah)) { /* XXX: For initial testing */ ath_hal_printf(ah, " wasp eep attach\n"); mptr = &ahp->ah_eeprom; mdata_size = ar9300_eeprom_struct_size(); ahp->ah_eeprom = ar9300_template_generic; ahp->ah_eeprom.mac_addr[0] = 0x00; ahp->ah_eeprom.mac_addr[1] = 0x03; ahp->ah_eeprom.mac_addr[2] = 0x7F; ahp->ah_eeprom.mac_addr[3] = 0xBA; ahp->ah_eeprom.mac_addr[4] = 0xD0; ahp->ah_eeprom.mac_addr[5] = 0x00; ahp->ah_emu_eeprom = 1; ahp->ah_eeprom.base_eep_header.txrx_mask = 0x33; ahp->ah_eeprom.base_eep_header.txrxgain = 0x10; /* need it to let art save in to flash ????? */ calibration_data_source = calibration_data_flash; } return HAL_OK; } #endif if (AR_SREV_HORNET(ah) || AR_SREV_WASP(ah) || AR_SREV_SCORPION(ah) || AR_SREV_HONEYBEE(ah)) { ahp->try_eeprom = 0; } if (AR_SREV_HONEYBEE(ah)) { ahp->try_otp = 0; } if (!ar9300_eeprom_restore(ah)) { return HAL_EIO; } return HAL_OK; #endif } u_int32_t ar9300_eeprom_get(struct ath_hal_9300 *ahp, EEPROM_PARAM param) { ar9300_eeprom_t *eep = &ahp->ah_eeprom; OSPREY_BASE_EEP_HEADER *p_base = &eep->base_eep_header; OSPREY_BASE_EXTENSION_1 *base_ext1 = &eep->base_ext1; switch (param) { #ifdef NOTYET case EEP_NFTHRESH_5: return p_modal[0].noise_floor_thresh_ch[0]; case EEP_NFTHRESH_2: return p_modal[1].noise_floor_thresh_ch[0]; #endif case EEP_MAC_LSW: return eep->mac_addr[0] << 8 | eep->mac_addr[1]; case EEP_MAC_MID: return eep->mac_addr[2] << 8 | eep->mac_addr[3]; case EEP_MAC_MSW: return eep->mac_addr[4] << 8 | eep->mac_addr[5]; case EEP_REG_0: return p_base->reg_dmn[0]; case EEP_REG_1: return p_base->reg_dmn[1]; case EEP_OP_CAP: return p_base->device_cap; case EEP_OP_MODE: return p_base->op_cap_flags.op_flags; case EEP_RF_SILENT: return p_base->rf_silent; #ifdef NOTYET case EEP_OB_5: return p_modal[0].ob; case EEP_DB_5: return p_modal[0].db; case EEP_OB_2: return p_modal[1].ob; case EEP_DB_2: return p_modal[1].db; case EEP_MINOR_REV: return p_base->eeprom_version & AR9300_EEP_VER_MINOR_MASK; #endif case EEP_TX_MASK: return (p_base->txrx_mask >> 4) & 0xf; case EEP_RX_MASK: return p_base->txrx_mask & 0xf; #ifdef NOTYET case EEP_FSTCLK_5G: return p_base->fast_clk5g; case EEP_RXGAIN_TYPE: return p_base->rx_gain_type; #endif case EEP_DRIVE_STRENGTH: #define AR9300_EEP_BASE_DRIVE_STRENGTH 0x1 return p_base->misc_configuration & AR9300_EEP_BASE_DRIVE_STRENGTH; case EEP_INTERNAL_REGULATOR: /* Bit 4 is internal regulator flag */ return ((p_base->feature_enable & 0x10) >> 4); case EEP_SWREG: return (p_base->swreg); case EEP_PAPRD_ENABLED: /* Bit 5 is paprd flag */ return ((p_base->feature_enable & 0x20) >> 5); case EEP_ANTDIV_control: return (u_int32_t)(base_ext1->ant_div_control); case EEP_CHAIN_MASK_REDUCE: return ((p_base->misc_configuration >> 3) & 0x1); case EEP_OL_PWRCTRL: return 0; case EEP_DEV_TYPE: return p_base->device_type; default: HALASSERT(0); return 0; } } /******************************************************************************/ /*! ** \brief EEPROM fixup code for INI values ** ** This routine provides a place to insert "fixup" code for specific devices ** that need to modify INI values based on EEPROM values, BEFORE the INI values ** are written. ** Certain registers in the INI file can only be written once without ** undesired side effects, and this provides a place for EEPROM overrides ** in these cases. ** ** This is called at attach time once. It should not affect run time ** performance at all ** ** \param ah Pointer to HAL object (this) ** \param p_eep_data Pointer to (filled in) eeprom data structure ** \param reg register being inspected on this call ** \param value value in INI file ** ** \return Updated value for INI file. */ u_int32_t ar9300_ini_fixup(struct ath_hal *ah, ar9300_eeprom_t *p_eep_data, u_int32_t reg, u_int32_t value) { HALDEBUG(AH_NULL, HAL_DEBUG_UNMASKABLE, "ar9300_eeprom_def_ini_fixup: FIXME\n"); #if 0 BASE_EEPDEF_HEADER *p_base = &(p_eep_data->base_eep_header); switch (AH_PRIVATE(ah)->ah_devid) { case AR9300_DEVID_AR9300_PCI: /* ** Need to set the external/internal regulator bit to the proper value. ** Can only write this ONCE. */ if ( reg == 0x7894 ) { /* ** Check for an EEPROM data structure of "0x0b" or better */ HALDEBUG(ah, HAL_DEBUG_EEPROM, "ini VAL: %x EEPROM: %x\n", value, (p_base->version & 0xff)); if ( (p_base->version & 0xff) > 0x0a) { HALDEBUG(ah, HAL_DEBUG_EEPROM, "PWDCLKIND: %d\n", p_base->pwdclkind); value &= ~AR_AN_TOP2_PWDCLKIND; value |= AR_AN_TOP2_PWDCLKIND & (p_base->pwdclkind << AR_AN_TOP2_PWDCLKIND_S); } else { HALDEBUG(ah, HAL_DEBUG_EEPROM, "PWDCLKIND Earlier Rev\n"); } HALDEBUG(ah, HAL_DEBUG_EEPROM, "final ini VAL: %x\n", value); } break; } return (value); #else return 0; #endif } /* * Returns the interpolated y value corresponding to the specified x value * from the np ordered pairs of data (px,py). * The pairs do not have to be in any order. * If the specified x value is less than any of the px, * the returned y value is equal to the py for the lowest px. * If the specified x value is greater than any of the px, * the returned y value is equal to the py for the highest px. */ static int interpolate(int32_t x, int32_t *px, int32_t *py, u_int16_t np) { int ip = 0; int lx = 0, ly = 0, lhave = 0; int hx = 0, hy = 0, hhave = 0; int dx = 0; int y = 0; int bf, factor, plus; lhave = 0; hhave = 0; /* * identify best lower and higher x calibration measurement */ for (ip = 0; ip < np; ip++) { dx = x - px[ip]; /* this measurement is higher than our desired x */ if (dx <= 0) { if (!hhave || dx > (x - hx)) { /* new best higher x measurement */ hx = px[ip]; hy = py[ip]; hhave = 1; } } /* this measurement is lower than our desired x */ if (dx >= 0) { if (!lhave || dx < (x - lx)) { /* new best lower x measurement */ lx = px[ip]; ly = py[ip]; lhave = 1; } } } /* the low x is good */ if (lhave) { /* so is the high x */ if (hhave) { /* they're the same, so just pick one */ if (hx == lx) { y = ly; } else { /* interpolate with round off */ bf = (2 * (hy - ly) * (x - lx)) / (hx - lx); plus = (bf % 2); factor = bf / 2; y = ly + factor + plus; } } else { /* only low is good, use it */ y = ly; } } else if (hhave) { /* only high is good, use it */ y = hy; } else { /* nothing is good,this should never happen unless np=0, ???? */ y = -(1 << 30); } return y; } u_int8_t ar9300_eeprom_get_legacy_trgt_pwr(struct ath_hal *ah, u_int16_t rate_index, u_int16_t freq, HAL_BOOL is_2ghz) { u_int16_t num_piers, i; int32_t target_power_array[OSPREY_NUM_5G_20_TARGET_POWERS]; int32_t freq_array[OSPREY_NUM_5G_20_TARGET_POWERS]; u_int8_t *p_freq_bin; ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; CAL_TARGET_POWER_LEG *p_eeprom_target_pwr; if (is_2ghz) { num_piers = OSPREY_NUM_2G_20_TARGET_POWERS; p_eeprom_target_pwr = eep->cal_target_power_2g; p_freq_bin = eep->cal_target_freqbin_2g; } else { num_piers = OSPREY_NUM_5G_20_TARGET_POWERS; p_eeprom_target_pwr = eep->cal_target_power_5g; p_freq_bin = eep->cal_target_freqbin_5g; } /* * create array of channels and targetpower from * targetpower piers stored on eeprom */ for (i = 0; i < num_piers; i++) { freq_array[i] = FBIN2FREQ(p_freq_bin[i], is_2ghz); target_power_array[i] = p_eeprom_target_pwr[i].t_pow2x[rate_index]; } /* interpolate to get target power for given frequency */ return ((u_int8_t)interpolate( (int32_t)freq, freq_array, target_power_array, num_piers)); } u_int8_t ar9300_eeprom_get_ht20_trgt_pwr(struct ath_hal *ah, u_int16_t rate_index, u_int16_t freq, HAL_BOOL is_2ghz) { u_int16_t num_piers, i; int32_t target_power_array[OSPREY_NUM_5G_20_TARGET_POWERS]; int32_t freq_array[OSPREY_NUM_5G_20_TARGET_POWERS]; u_int8_t *p_freq_bin; ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; OSP_CAL_TARGET_POWER_HT *p_eeprom_target_pwr; if (is_2ghz) { num_piers = OSPREY_NUM_2G_20_TARGET_POWERS; p_eeprom_target_pwr = eep->cal_target_power_2g_ht20; p_freq_bin = eep->cal_target_freqbin_2g_ht20; } else { num_piers = OSPREY_NUM_5G_20_TARGET_POWERS; p_eeprom_target_pwr = eep->cal_target_power_5g_ht20; p_freq_bin = eep->cal_target_freqbin_5g_ht20; } /* * create array of channels and targetpower from * targetpower piers stored on eeprom */ for (i = 0; i < num_piers; i++) { freq_array[i] = FBIN2FREQ(p_freq_bin[i], is_2ghz); target_power_array[i] = p_eeprom_target_pwr[i].t_pow2x[rate_index]; } /* interpolate to get target power for given frequency */ return ((u_int8_t)interpolate( (int32_t)freq, freq_array, target_power_array, num_piers)); } u_int8_t ar9300_eeprom_get_ht40_trgt_pwr(struct ath_hal *ah, u_int16_t rate_index, u_int16_t freq, HAL_BOOL is_2ghz) { u_int16_t num_piers, i; int32_t target_power_array[OSPREY_NUM_5G_40_TARGET_POWERS]; int32_t freq_array[OSPREY_NUM_5G_40_TARGET_POWERS]; u_int8_t *p_freq_bin; ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; OSP_CAL_TARGET_POWER_HT *p_eeprom_target_pwr; if (is_2ghz) { num_piers = OSPREY_NUM_2G_40_TARGET_POWERS; p_eeprom_target_pwr = eep->cal_target_power_2g_ht40; p_freq_bin = eep->cal_target_freqbin_2g_ht40; } else { num_piers = OSPREY_NUM_5G_40_TARGET_POWERS; p_eeprom_target_pwr = eep->cal_target_power_5g_ht40; p_freq_bin = eep->cal_target_freqbin_5g_ht40; } /* * create array of channels and targetpower from * targetpower piers stored on eeprom */ for (i = 0; i < num_piers; i++) { freq_array[i] = FBIN2FREQ(p_freq_bin[i], is_2ghz); target_power_array[i] = p_eeprom_target_pwr[i].t_pow2x[rate_index]; } /* interpolate to get target power for given frequency */ return ((u_int8_t)interpolate( (int32_t)freq, freq_array, target_power_array, num_piers)); } u_int8_t ar9300_eeprom_get_cck_trgt_pwr(struct ath_hal *ah, u_int16_t rate_index, u_int16_t freq) { u_int16_t num_piers = OSPREY_NUM_2G_CCK_TARGET_POWERS, i; int32_t target_power_array[OSPREY_NUM_2G_CCK_TARGET_POWERS]; int32_t freq_array[OSPREY_NUM_2G_CCK_TARGET_POWERS]; ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; u_int8_t *p_freq_bin = eep->cal_target_freqbin_cck; CAL_TARGET_POWER_LEG *p_eeprom_target_pwr = eep->cal_target_power_cck; /* * create array of channels and targetpower from * targetpower piers stored on eeprom */ for (i = 0; i < num_piers; i++) { freq_array[i] = FBIN2FREQ(p_freq_bin[i], 1); target_power_array[i] = p_eeprom_target_pwr[i].t_pow2x[rate_index]; } /* interpolate to get target power for given frequency */ return ((u_int8_t)interpolate( (int32_t)freq, freq_array, target_power_array, num_piers)); } /* * Set tx power registers to array of values passed in */ int ar9300_transmit_power_reg_write(struct ath_hal *ah, u_int8_t *p_pwr_array) { #define POW_SM(_r, _s) (((_r) & 0x3f) << (_s)) /* make sure forced gain is not set */ #if 0 field_write("force_dac_gain", 0); OS_REG_WRITE(ah, 0xa3f8, 0); field_write("force_tx_gain", 0); #endif OS_REG_WRITE(ah, 0xa458, 0); /* Write the OFDM power per rate set */ /* 6 (LSB), 9, 12, 18 (MSB) */ OS_REG_WRITE(ah, 0xa3c0, POW_SM(p_pwr_array[ALL_TARGET_LEGACY_6_24], 24) | POW_SM(p_pwr_array[ALL_TARGET_LEGACY_6_24], 16) | POW_SM(p_pwr_array[ALL_TARGET_LEGACY_6_24], 8) | POW_SM(p_pwr_array[ALL_TARGET_LEGACY_6_24], 0) ); /* 24 (LSB), 36, 48, 54 (MSB) */ OS_REG_WRITE(ah, 0xa3c4, POW_SM(p_pwr_array[ALL_TARGET_LEGACY_54], 24) | POW_SM(p_pwr_array[ALL_TARGET_LEGACY_48], 16) | POW_SM(p_pwr_array[ALL_TARGET_LEGACY_36], 8) | POW_SM(p_pwr_array[ALL_TARGET_LEGACY_6_24], 0) ); /* Write the CCK power per rate set */ /* 1L (LSB), reserved, 2L, 2S (MSB) */ OS_REG_WRITE(ah, 0xa3c8, POW_SM(p_pwr_array[ALL_TARGET_LEGACY_1L_5L], 24) | POW_SM(p_pwr_array[ALL_TARGET_LEGACY_1L_5L], 16) /* | POW_SM(tx_power_times2, 8)*/ /* this is reserved for Osprey */ | POW_SM(p_pwr_array[ALL_TARGET_LEGACY_1L_5L], 0) ); /* 5.5L (LSB), 5.5S, 11L, 11S (MSB) */ OS_REG_WRITE(ah, 0xa3cc, POW_SM(p_pwr_array[ALL_TARGET_LEGACY_11S], 24) | POW_SM(p_pwr_array[ALL_TARGET_LEGACY_11L], 16) | POW_SM(p_pwr_array[ALL_TARGET_LEGACY_5S], 8) | POW_SM(p_pwr_array[ALL_TARGET_LEGACY_1L_5L], 0) ); /* write the power for duplicated frames - HT40 */ /* dup40_cck (LSB), dup40_ofdm, ext20_cck, ext20_ofdm (MSB) */ OS_REG_WRITE(ah, 0xa3e0, POW_SM(p_pwr_array[ALL_TARGET_LEGACY_6_24], 24) | POW_SM(p_pwr_array[ALL_TARGET_LEGACY_1L_5L], 16) | POW_SM(p_pwr_array[ALL_TARGET_LEGACY_6_24], 8) | POW_SM(p_pwr_array[ALL_TARGET_LEGACY_1L_5L], 0) ); /* Write the HT20 power per rate set */ /* 0/8/16 (LSB), 1-3/9-11/17-19, 4, 5 (MSB) */ OS_REG_WRITE(ah, 0xa3d0, POW_SM(p_pwr_array[ALL_TARGET_HT20_5], 24) | POW_SM(p_pwr_array[ALL_TARGET_HT20_4], 16) | POW_SM(p_pwr_array[ALL_TARGET_HT20_1_3_9_11_17_19], 8) | POW_SM(p_pwr_array[ALL_TARGET_HT20_0_8_16], 0) ); /* 6 (LSB), 7, 12, 13 (MSB) */ OS_REG_WRITE(ah, 0xa3d4, POW_SM(p_pwr_array[ALL_TARGET_HT20_13], 24) | POW_SM(p_pwr_array[ALL_TARGET_HT20_12], 16) | POW_SM(p_pwr_array[ALL_TARGET_HT20_7], 8) | POW_SM(p_pwr_array[ALL_TARGET_HT20_6], 0) ); /* 14 (LSB), 15, 20, 21 */ OS_REG_WRITE(ah, 0xa3e4, POW_SM(p_pwr_array[ALL_TARGET_HT20_21], 24) | POW_SM(p_pwr_array[ALL_TARGET_HT20_20], 16) | POW_SM(p_pwr_array[ALL_TARGET_HT20_15], 8) | POW_SM(p_pwr_array[ALL_TARGET_HT20_14], 0) ); /* Mixed HT20 and HT40 rates */ /* HT20 22 (LSB), HT20 23, HT40 22, HT40 23 (MSB) */ OS_REG_WRITE(ah, 0xa3e8, POW_SM(p_pwr_array[ALL_TARGET_HT40_23], 24) | POW_SM(p_pwr_array[ALL_TARGET_HT40_22], 16) | POW_SM(p_pwr_array[ALL_TARGET_HT20_23], 8) | POW_SM(p_pwr_array[ALL_TARGET_HT20_22], 0) ); /* Write the HT40 power per rate set */ /* correct PAR difference between HT40 and HT20/LEGACY */ /* 0/8/16 (LSB), 1-3/9-11/17-19, 4, 5 (MSB) */ OS_REG_WRITE(ah, 0xa3d8, POW_SM(p_pwr_array[ALL_TARGET_HT40_5], 24) | POW_SM(p_pwr_array[ALL_TARGET_HT40_4], 16) | POW_SM(p_pwr_array[ALL_TARGET_HT40_1_3_9_11_17_19], 8) | POW_SM(p_pwr_array[ALL_TARGET_HT40_0_8_16], 0) ); /* 6 (LSB), 7, 12, 13 (MSB) */ OS_REG_WRITE(ah, 0xa3dc, POW_SM(p_pwr_array[ALL_TARGET_HT40_13], 24) | POW_SM(p_pwr_array[ALL_TARGET_HT40_12], 16) | POW_SM(p_pwr_array[ALL_TARGET_HT40_7], 8) | POW_SM(p_pwr_array[ALL_TARGET_HT40_6], 0) ); /* 14 (LSB), 15, 20, 21 */ OS_REG_WRITE(ah, 0xa3ec, POW_SM(p_pwr_array[ALL_TARGET_HT40_21], 24) | POW_SM(p_pwr_array[ALL_TARGET_HT40_20], 16) | POW_SM(p_pwr_array[ALL_TARGET_HT40_15], 8) | POW_SM(p_pwr_array[ALL_TARGET_HT40_14], 0) ); return 0; #undef POW_SM } static void ar9300_selfgen_tpc_reg_write(struct ath_hal *ah, const struct ieee80211_channel *chan, u_int8_t *p_pwr_array) { u_int32_t tpc_reg_val; /* Set the target power values for self generated frames (ACK,RTS/CTS) to * be within limits. This is just a safety measure.With per packet TPC mode * enabled the target power value used with self generated frames will be * MIN( TPC reg, BB_powertx_rate register) */ if (IEEE80211_IS_CHAN_2GHZ(chan)) { tpc_reg_val = (SM(p_pwr_array[ALL_TARGET_LEGACY_1L_5L], AR_TPC_ACK) | SM(p_pwr_array[ALL_TARGET_LEGACY_1L_5L], AR_TPC_CTS) | SM(0x3f, AR_TPC_CHIRP) | SM(0x3f, AR_TPC_RPT)); } else { tpc_reg_val = (SM(p_pwr_array[ALL_TARGET_LEGACY_6_24], AR_TPC_ACK) | SM(p_pwr_array[ALL_TARGET_LEGACY_6_24], AR_TPC_CTS) | SM(0x3f, AR_TPC_CHIRP) | SM(0x3f, AR_TPC_RPT)); } OS_REG_WRITE(ah, AR_TPC, tpc_reg_val); } void ar9300_set_target_power_from_eeprom(struct ath_hal *ah, u_int16_t freq, u_int8_t *target_power_val_t2) { /* hard code for now, need to get from eeprom struct */ u_int8_t ht40_power_inc_for_pdadc = 0; HAL_BOOL is_2ghz = 0; if (freq < 4000) { is_2ghz = 1; } target_power_val_t2[ALL_TARGET_LEGACY_6_24] = ar9300_eeprom_get_legacy_trgt_pwr( ah, LEGACY_TARGET_RATE_6_24, freq, is_2ghz); target_power_val_t2[ALL_TARGET_LEGACY_36] = ar9300_eeprom_get_legacy_trgt_pwr( ah, LEGACY_TARGET_RATE_36, freq, is_2ghz); target_power_val_t2[ALL_TARGET_LEGACY_48] = ar9300_eeprom_get_legacy_trgt_pwr( ah, LEGACY_TARGET_RATE_48, freq, is_2ghz); target_power_val_t2[ALL_TARGET_LEGACY_54] = ar9300_eeprom_get_legacy_trgt_pwr( ah, LEGACY_TARGET_RATE_54, freq, is_2ghz); target_power_val_t2[ALL_TARGET_LEGACY_1L_5L] = ar9300_eeprom_get_cck_trgt_pwr( ah, LEGACY_TARGET_RATE_1L_5L, freq); target_power_val_t2[ALL_TARGET_LEGACY_5S] = ar9300_eeprom_get_cck_trgt_pwr( ah, LEGACY_TARGET_RATE_5S, freq); target_power_val_t2[ALL_TARGET_LEGACY_11L] = ar9300_eeprom_get_cck_trgt_pwr( ah, LEGACY_TARGET_RATE_11L, freq); target_power_val_t2[ALL_TARGET_LEGACY_11S] = ar9300_eeprom_get_cck_trgt_pwr( ah, LEGACY_TARGET_RATE_11S, freq); target_power_val_t2[ALL_TARGET_HT20_0_8_16] = ar9300_eeprom_get_ht20_trgt_pwr( ah, HT_TARGET_RATE_0_8_16, freq, is_2ghz); target_power_val_t2[ALL_TARGET_HT20_1_3_9_11_17_19] = ar9300_eeprom_get_ht20_trgt_pwr( ah, HT_TARGET_RATE_1_3_9_11_17_19, freq, is_2ghz); target_power_val_t2[ALL_TARGET_HT20_4] = ar9300_eeprom_get_ht20_trgt_pwr( ah, HT_TARGET_RATE_4, freq, is_2ghz); target_power_val_t2[ALL_TARGET_HT20_5] = ar9300_eeprom_get_ht20_trgt_pwr( ah, HT_TARGET_RATE_5, freq, is_2ghz); target_power_val_t2[ALL_TARGET_HT20_6] = ar9300_eeprom_get_ht20_trgt_pwr( ah, HT_TARGET_RATE_6, freq, is_2ghz); target_power_val_t2[ALL_TARGET_HT20_7] = ar9300_eeprom_get_ht20_trgt_pwr( ah, HT_TARGET_RATE_7, freq, is_2ghz); target_power_val_t2[ALL_TARGET_HT20_12] = ar9300_eeprom_get_ht20_trgt_pwr( ah, HT_TARGET_RATE_12, freq, is_2ghz); target_power_val_t2[ALL_TARGET_HT20_13] = ar9300_eeprom_get_ht20_trgt_pwr( ah, HT_TARGET_RATE_13, freq, is_2ghz); target_power_val_t2[ALL_TARGET_HT20_14] = ar9300_eeprom_get_ht20_trgt_pwr( ah, HT_TARGET_RATE_14, freq, is_2ghz); target_power_val_t2[ALL_TARGET_HT20_15] = ar9300_eeprom_get_ht20_trgt_pwr( ah, HT_TARGET_RATE_15, freq, is_2ghz); target_power_val_t2[ALL_TARGET_HT20_20] = ar9300_eeprom_get_ht20_trgt_pwr( ah, HT_TARGET_RATE_20, freq, is_2ghz); target_power_val_t2[ALL_TARGET_HT20_21] = ar9300_eeprom_get_ht20_trgt_pwr( ah, HT_TARGET_RATE_21, freq, is_2ghz); target_power_val_t2[ALL_TARGET_HT20_22] = ar9300_eeprom_get_ht20_trgt_pwr( ah, HT_TARGET_RATE_22, freq, is_2ghz); target_power_val_t2[ALL_TARGET_HT20_23] = ar9300_eeprom_get_ht20_trgt_pwr( ah, HT_TARGET_RATE_23, freq, is_2ghz); target_power_val_t2[ALL_TARGET_HT40_0_8_16] = ar9300_eeprom_get_ht40_trgt_pwr( ah, HT_TARGET_RATE_0_8_16, freq, is_2ghz) + ht40_power_inc_for_pdadc; target_power_val_t2[ALL_TARGET_HT40_1_3_9_11_17_19] = ar9300_eeprom_get_ht40_trgt_pwr( ah, HT_TARGET_RATE_1_3_9_11_17_19, freq, is_2ghz) + ht40_power_inc_for_pdadc; target_power_val_t2[ALL_TARGET_HT40_4] = ar9300_eeprom_get_ht40_trgt_pwr( ah, HT_TARGET_RATE_4, freq, is_2ghz) + ht40_power_inc_for_pdadc; target_power_val_t2[ALL_TARGET_HT40_5] = ar9300_eeprom_get_ht40_trgt_pwr( ah, HT_TARGET_RATE_5, freq, is_2ghz) + ht40_power_inc_for_pdadc; target_power_val_t2[ALL_TARGET_HT40_6] = ar9300_eeprom_get_ht40_trgt_pwr( ah, HT_TARGET_RATE_6, freq, is_2ghz) + ht40_power_inc_for_pdadc; target_power_val_t2[ALL_TARGET_HT40_7] = ar9300_eeprom_get_ht40_trgt_pwr( ah, HT_TARGET_RATE_7, freq, is_2ghz) + ht40_power_inc_for_pdadc; target_power_val_t2[ALL_TARGET_HT40_12] = ar9300_eeprom_get_ht40_trgt_pwr( ah, HT_TARGET_RATE_12, freq, is_2ghz) + ht40_power_inc_for_pdadc; target_power_val_t2[ALL_TARGET_HT40_13] = ar9300_eeprom_get_ht40_trgt_pwr( ah, HT_TARGET_RATE_13, freq, is_2ghz) + ht40_power_inc_for_pdadc; target_power_val_t2[ALL_TARGET_HT40_14] = ar9300_eeprom_get_ht40_trgt_pwr( ah, HT_TARGET_RATE_14, freq, is_2ghz) + ht40_power_inc_for_pdadc; target_power_val_t2[ALL_TARGET_HT40_15] = ar9300_eeprom_get_ht40_trgt_pwr( ah, HT_TARGET_RATE_15, freq, is_2ghz) + ht40_power_inc_for_pdadc; target_power_val_t2[ALL_TARGET_HT40_20] = ar9300_eeprom_get_ht40_trgt_pwr( ah, HT_TARGET_RATE_20, freq, is_2ghz) + ht40_power_inc_for_pdadc; target_power_val_t2[ALL_TARGET_HT40_21] = ar9300_eeprom_get_ht40_trgt_pwr( ah, HT_TARGET_RATE_21, freq, is_2ghz) + ht40_power_inc_for_pdadc; target_power_val_t2[ALL_TARGET_HT40_22] = ar9300_eeprom_get_ht40_trgt_pwr( ah, HT_TARGET_RATE_22, freq, is_2ghz) + ht40_power_inc_for_pdadc; target_power_val_t2[ALL_TARGET_HT40_23] = ar9300_eeprom_get_ht40_trgt_pwr( ah, HT_TARGET_RATE_23, freq, is_2ghz) + ht40_power_inc_for_pdadc; #ifdef AH_DEBUG { int i = 0; HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: APPLYING TARGET POWERS\n", __func__); while (i < ar9300_rate_size) { HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: TPC[%02d] 0x%08x ", __func__, i, target_power_val_t2[i]); i++; if (i == ar9300_rate_size) { break; } HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: TPC[%02d] 0x%08x ", __func__, i, target_power_val_t2[i]); i++; if (i == ar9300_rate_size) { break; } HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: TPC[%02d] 0x%08x ", __func__, i, target_power_val_t2[i]); i++; if (i == ar9300_rate_size) { break; } HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: TPC[%02d] 0x%08x \n", __func__, i, target_power_val_t2[i]); i++; } } #endif } u_int16_t *ar9300_regulatory_domain_get(struct ath_hal *ah) { ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; return eep->base_eep_header.reg_dmn; } int32_t ar9300_eeprom_write_enable_gpio_get(struct ath_hal *ah) { ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; return eep->base_eep_header.eeprom_write_enable_gpio; } int32_t ar9300_wlan_disable_gpio_get(struct ath_hal *ah) { ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; return eep->base_eep_header.wlan_disable_gpio; } int32_t ar9300_wlan_led_gpio_get(struct ath_hal *ah) { ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; return eep->base_eep_header.wlan_led_gpio; } int32_t ar9300_rx_band_select_gpio_get(struct ath_hal *ah) { ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; return eep->base_eep_header.rx_band_select_gpio; } /* * since valid noise floor values are negative, returns 1 on error */ int32_t ar9300_noise_floor_cal_or_power_get(struct ath_hal *ah, int32_t frequency, int32_t ichain, HAL_BOOL use_cal) { int nf_use = 1; /* start with an error return value */ int32_t fx[OSPREY_NUM_5G_CAL_PIERS + OSPREY_NUM_2G_CAL_PIERS]; int32_t nf[OSPREY_NUM_5G_CAL_PIERS + OSPREY_NUM_2G_CAL_PIERS]; int nnf; int is_2ghz; int ipier, npier; ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; u_int8_t *p_cal_pier; OSP_CAL_DATA_PER_FREQ_OP_LOOP *p_cal_pier_struct; /* * check chain value */ if (ichain < 0 || ichain >= OSPREY_MAX_CHAINS) { return 1; } /* figure out which band we're using */ is_2ghz = (frequency < 4000); if (is_2ghz) { npier = OSPREY_NUM_2G_CAL_PIERS; p_cal_pier = eep->cal_freq_pier_2g; p_cal_pier_struct = eep->cal_pier_data_2g[ichain]; } else { npier = OSPREY_NUM_5G_CAL_PIERS; p_cal_pier = eep->cal_freq_pier_5g; p_cal_pier_struct = eep->cal_pier_data_5g[ichain]; } /* look for valid noise floor values */ nnf = 0; for (ipier = 0; ipier < npier; ipier++) { fx[nnf] = FBIN2FREQ(p_cal_pier[ipier], is_2ghz); nf[nnf] = use_cal ? p_cal_pier_struct[ipier].rx_noisefloor_cal : p_cal_pier_struct[ipier].rx_noisefloor_power; if (nf[nnf] < 0) { nnf++; } } /* * If we have some valid values, interpolate to find the value * at the desired frequency. */ if (nnf > 0) { nf_use = interpolate(frequency, fx, nf, nnf); } return nf_use; } /* * Return the Rx NF offset for specific channel. * The values saved in EEPROM/OTP/Flash is converted through the following way: * ((_p) - NOISE_PWR_DATA_OFFSET) << 2 * So we need to convert back to the original values. */ int ar9300_get_rx_nf_offset(struct ath_hal *ah, struct ieee80211_channel *chan, int8_t *nf_pwr, int8_t *nf_cal) { HAL_CHANNEL_INTERNAL *ichan = ath_hal_checkchannel(ah, chan); int8_t rx_nf_pwr, rx_nf_cal; int i; //HALASSERT(ichan); /* Fill 0 if valid internal channel is not found */ if (ichan == AH_NULL) { OS_MEMZERO(nf_pwr, sizeof(nf_pwr[0])*OSPREY_MAX_CHAINS); OS_MEMZERO(nf_cal, sizeof(nf_cal[0])*OSPREY_MAX_CHAINS); return -1; } for (i = 0; i < OSPREY_MAX_CHAINS; i++) { if ((rx_nf_pwr = ar9300_noise_floor_cal_or_power_get(ah, ichan->channel, i, 0)) == 1) { nf_pwr[i] = 0; } else { //printk("%s: raw nf_pwr[%d] = %d\n", __func__, i, rx_nf_pwr); nf_pwr[i] = NOISE_PWR_DBM_2_INT(rx_nf_pwr); } if ((rx_nf_cal = ar9300_noise_floor_cal_or_power_get(ah, ichan->channel, i, 1)) == 1) { nf_cal[i] = 0; } else { //printk("%s: raw nf_cal[%d] = %d\n", __func__, i, rx_nf_cal); nf_cal[i] = NOISE_PWR_DBM_2_INT(rx_nf_cal); } } return 0; } int32_t ar9300_rx_gain_index_get(struct ath_hal *ah) { ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; return (eep->base_eep_header.txrxgain) & 0xf; /* bits 3:0 */ } int32_t ar9300_tx_gain_index_get(struct ath_hal *ah) { ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; return (eep->base_eep_header.txrxgain >> 4) & 0xf; /* bits 7:4 */ } HAL_BOOL ar9300_internal_regulator_apply(struct ath_hal *ah) { struct ath_hal_9300 *ahp = AH9300(ah); int internal_regulator = ar9300_eeprom_get(ahp, EEP_INTERNAL_REGULATOR); int reg_pmu1, reg_pmu2, reg_pmu1_set, reg_pmu2_set; u_int32_t reg_PMU1, reg_PMU2; unsigned long eep_addr; u_int32_t reg_val, reg_usb = 0, reg_pmu = 0; int usb_valid = 0, pmu_valid = 0; unsigned char pmu_refv; if (AR_SREV_JUPITER(ah) || AR_SREV_APHRODITE(ah)) { reg_PMU1 = AR_PHY_PMU1_JUPITER; reg_PMU2 = AR_PHY_PMU2_JUPITER; } else { reg_PMU1 = AR_PHY_PMU1; reg_PMU2 = AR_PHY_PMU2; } if (internal_regulator) { if (AR_SREV_HORNET(ah) || AR_SREV_POSEIDON(ah)) { if (AR_SREV_HORNET(ah)) { /* Read OTP first */ for (eep_addr = 0x14; ; eep_addr -= 0x10) { ar9300_otp_read(ah, eep_addr / 4, ®_val, 1); if ((reg_val & 0x80) == 0x80){ usb_valid = 1; reg_usb = reg_val & 0x000000ff; } if ((reg_val & 0x80000000) == 0x80000000){ pmu_valid = 1; reg_pmu = (reg_val & 0xff000000) >> 24; } if (eep_addr == 0x4) { break; } } if (pmu_valid) { pmu_refv = reg_pmu & 0xf; } else { pmu_refv = 0x8; } /* * If (valid) { * Usb_phy_ctrl2_tx_cal_en -> 0 * Usb_phy_ctrl2_tx_cal_sel -> 0 * Usb_phy_ctrl2_tx_man_cal -> 0, 1, 3, 7 or 15 from OTP * } */ if (usb_valid) { OS_REG_RMW_FIELD(ah, 0x16c88, AR_PHY_CTRL2_TX_CAL_EN, 0x0); OS_REG_RMW_FIELD(ah, 0x16c88, AR_PHY_CTRL2_TX_CAL_SEL, 0x0); OS_REG_RMW_FIELD(ah, 0x16c88, AR_PHY_CTRL2_TX_MAN_CAL, (reg_usb & 0xf)); } } else { pmu_refv = 0x8; } /*#ifndef USE_HIF*/ /* Follow the MDK settings for Hornet PMU. * my $pwd = 0x0; * my $Nfdiv = 0x3; # xtal_freq = 25MHz * my $Nfdiv = 0x4; # xtal_freq = 40MHz * my $Refv = 0x7; # 0x5:1.22V; 0x8:1.29V * my $Gm1 = 0x3; #Poseidon $Gm1=1 * my $classb = 0x0; * my $Cc = 0x1; #Poseidon $Cc=7 * my $Rc = 0x6; * my $ramp_slope = 0x1; * my $Segm = 0x3; * my $use_local_osc = 0x0; * my $force_xosc_stable = 0x0; * my $Selfb = 0x0; #Poseidon $Selfb=1 * my $Filterfb = 0x3; #Poseidon $Filterfb=0 * my $Filtervc = 0x0; * my $disc = 0x0; * my $discdel = 0x4; * my $spare = 0x0; * $reg_PMU1 = * $pwd | ($Nfdiv<<1) | ($Refv<<4) | ($Gm1<<8) | * ($classb<<11) | ($Cc<<14) | ($Rc<<17) | ($ramp_slope<<20) | * ($Segm<<24) | ($use_local_osc<<26) | * ($force_xosc_stable<<27) | ($Selfb<<28) | ($Filterfb<<29); * $reg_PMU2 = $handle->reg_rd("ch0_PMU2"); * $reg_PMU2 = ($reg_PMU2 & 0xfe3fffff) | ($Filtervc<<22); * $reg_PMU2 = ($reg_PMU2 & 0xe3ffffff) | ($discdel<<26); * $reg_PMU2 = ($reg_PMU2 & 0x1fffffff) | ($spare<<29); */ if (ahp->clk_25mhz) { reg_pmu1_set = 0 | (3 << 1) | (pmu_refv << 4) | (3 << 8) | (0 << 11) | (1 << 14) | (6 << 17) | (1 << 20) | (3 << 24) | (0 << 26) | (0 << 27) | (0 << 28) | (0 << 29); } else { if (AR_SREV_POSEIDON(ah)) { reg_pmu1_set = 0 | (5 << 1) | (7 << 4) | (2 << 8) | (0 << 11) | (2 << 14) | (6 << 17) | (1 << 20) | (3 << 24) | (0 << 26) | (0 << 27) | (1 << 28) | (0 << 29) ; } else { reg_pmu1_set = 0 | (4 << 1) | (7 << 4) | (3 << 8) | (0 << 11) | (1 << 14) | (6 << 17) | (1 << 20) | (3 << 24) | (0 << 26) | (0 << 27) | (0 << 28) | (0 << 29) ; } } OS_REG_RMW_FIELD(ah, reg_PMU2, AR_PHY_PMU2_PGM, 0x0); OS_REG_WRITE(ah, reg_PMU1, reg_pmu1_set); /* 0x638c8376 */ reg_pmu1 = OS_REG_READ(ah, reg_PMU1); while (reg_pmu1 != reg_pmu1_set) { OS_REG_WRITE(ah, reg_PMU1, reg_pmu1_set); /* 0x638c8376 */ OS_DELAY(10); reg_pmu1 = OS_REG_READ(ah, reg_PMU1); } reg_pmu2_set = (OS_REG_READ(ah, reg_PMU2) & (~0xFFC00000)) | (4 << 26); OS_REG_WRITE(ah, reg_PMU2, reg_pmu2_set); reg_pmu2 = OS_REG_READ(ah, reg_PMU2); while (reg_pmu2 != reg_pmu2_set) { OS_REG_WRITE(ah, reg_PMU2, reg_pmu2_set); OS_DELAY(10); reg_pmu2 = OS_REG_READ(ah, reg_PMU2); } reg_pmu2_set = (OS_REG_READ(ah, reg_PMU2) & (~0x00200000)) | (1 << 21); OS_REG_WRITE(ah, reg_PMU2, reg_pmu2_set); reg_pmu2 = OS_REG_READ(ah, reg_PMU2); while (reg_pmu2 != reg_pmu2_set) { OS_REG_WRITE(ah, reg_PMU2, reg_pmu2_set); OS_DELAY(10); reg_pmu2 = OS_REG_READ(ah, reg_PMU2); } /*#endif*/ } else if (AR_SREV_JUPITER(ah) || AR_SREV_APHRODITE(ah)) { /* Internal regulator is ON. Write swreg register. */ int swreg = ar9300_eeprom_get(ahp, EEP_SWREG); OS_REG_WRITE(ah, reg_PMU1, swreg); } else { /* Internal regulator is ON. Write swreg register. */ int swreg = ar9300_eeprom_get(ahp, EEP_SWREG); OS_REG_WRITE(ah, AR_RTC_REG_CONTROL1, OS_REG_READ(ah, AR_RTC_REG_CONTROL1) & (~AR_RTC_REG_CONTROL1_SWREG_PROGRAM)); OS_REG_WRITE(ah, AR_RTC_REG_CONTROL0, swreg); /* Set REG_CONTROL1.SWREG_PROGRAM */ OS_REG_WRITE(ah, AR_RTC_REG_CONTROL1, OS_REG_READ(ah, AR_RTC_REG_CONTROL1) | AR_RTC_REG_CONTROL1_SWREG_PROGRAM); } } else { if (AR_SREV_HORNET(ah) || AR_SREV_POSEIDON(ah)) { OS_REG_RMW_FIELD(ah, reg_PMU2, AR_PHY_PMU2_PGM, 0x0); reg_pmu2 = OS_REG_READ_FIELD(ah, reg_PMU2, AR_PHY_PMU2_PGM); while (reg_pmu2) { OS_DELAY(10); reg_pmu2 = OS_REG_READ_FIELD(ah, reg_PMU2, AR_PHY_PMU2_PGM); } OS_REG_RMW_FIELD(ah, reg_PMU1, AR_PHY_PMU1_PWD, 0x1); reg_pmu1 = OS_REG_READ_FIELD(ah, reg_PMU1, AR_PHY_PMU1_PWD); while (!reg_pmu1) { OS_DELAY(10); reg_pmu1 = OS_REG_READ_FIELD(ah, reg_PMU1, AR_PHY_PMU1_PWD); } OS_REG_RMW_FIELD(ah, reg_PMU2, AR_PHY_PMU2_PGM, 0x1); reg_pmu2 = OS_REG_READ_FIELD(ah, reg_PMU2, AR_PHY_PMU2_PGM); while (!reg_pmu2) { OS_DELAY(10); reg_pmu2 = OS_REG_READ_FIELD(ah, reg_PMU2, AR_PHY_PMU2_PGM); } } else if (AR_SREV_JUPITER(ah) || AR_SREV_APHRODITE(ah)) { OS_REG_RMW_FIELD(ah, reg_PMU1, AR_PHY_PMU1_PWD, 0x1); } else { OS_REG_WRITE(ah, AR_RTC_SLEEP_CLK, (OS_REG_READ(ah, AR_RTC_SLEEP_CLK) | AR_RTC_FORCE_SWREG_PRD | AR_RTC_PCIE_RST_PWDN_EN)); } } return 0; } HAL_BOOL ar9300_drive_strength_apply(struct ath_hal *ah) { struct ath_hal_9300 *ahp = AH9300(ah); int drive_strength; unsigned long reg; drive_strength = ar9300_eeprom_get(ahp, EEP_DRIVE_STRENGTH); if (drive_strength) { reg = OS_REG_READ(ah, AR_PHY_65NM_CH0_BIAS1); reg &= ~0x00ffffc0; reg |= 0x5 << 21; reg |= 0x5 << 18; reg |= 0x5 << 15; reg |= 0x5 << 12; reg |= 0x5 << 9; reg |= 0x5 << 6; OS_REG_WRITE(ah, AR_PHY_65NM_CH0_BIAS1, reg); reg = OS_REG_READ(ah, AR_PHY_65NM_CH0_BIAS2); reg &= ~0xffffffe0; reg |= 0x5 << 29; reg |= 0x5 << 26; reg |= 0x5 << 23; reg |= 0x5 << 20; reg |= 0x5 << 17; reg |= 0x5 << 14; reg |= 0x5 << 11; reg |= 0x5 << 8; reg |= 0x5 << 5; OS_REG_WRITE(ah, AR_PHY_65NM_CH0_BIAS2, reg); reg = OS_REG_READ(ah, AR_PHY_65NM_CH0_BIAS4); reg &= ~0xff800000; reg |= 0x5 << 29; reg |= 0x5 << 26; reg |= 0x5 << 23; OS_REG_WRITE(ah, AR_PHY_65NM_CH0_BIAS4, reg); } return 0; } int32_t ar9300_xpa_bias_level_get(struct ath_hal *ah, HAL_BOOL is_2ghz) { ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; if (is_2ghz) { return eep->modal_header_2g.xpa_bias_lvl; } else { return eep->modal_header_5g.xpa_bias_lvl; } } HAL_BOOL ar9300_xpa_bias_level_apply(struct ath_hal *ah, HAL_BOOL is_2ghz) { /* * In ar9330 emu, we can't access radio registers, * merlin is used for radio part. */ int bias; bias = ar9300_xpa_bias_level_get(ah, is_2ghz); if (AR_SREV_HORNET(ah) || AR_SREV_POSEIDON(ah) || AR_SREV_WASP(ah)) { OS_REG_RMW_FIELD(ah, AR_HORNET_CH0_TOP2, AR_HORNET_CH0_TOP2_XPABIASLVL, bias); } else if (AR_SREV_SCORPION(ah)) { OS_REG_RMW_FIELD(ah, AR_SCORPION_CH0_TOP, AR_SCORPION_CH0_TOP_XPABIASLVL, bias); } else if (AR_SREV_JUPITER(ah) || AR_SREV_APHRODITE(ah)) { OS_REG_RMW_FIELD(ah, AR_PHY_65NM_CH0_TOP_JUPITER, AR_PHY_65NM_CH0_TOP_XPABIASLVL, bias); } else { OS_REG_RMW_FIELD(ah, AR_PHY_65NM_CH0_TOP, AR_PHY_65NM_CH0_TOP_XPABIASLVL, bias); OS_REG_RMW_FIELD(ah, AR_PHY_65NM_CH0_THERM, AR_PHY_65NM_CH0_THERM_XPABIASLVL_MSB, bias >> 2); OS_REG_RMW_FIELD(ah, AR_PHY_65NM_CH0_THERM, AR_PHY_65NM_CH0_THERM_XPASHORT2GND, 1); } return 0; } u_int32_t ar9300_ant_ctrl_common_get(struct ath_hal *ah, HAL_BOOL is_2ghz) { ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; if (is_2ghz) { return eep->modal_header_2g.ant_ctrl_common; } else { return eep->modal_header_5g.ant_ctrl_common; } } static u_int16_t ar9300_switch_com_spdt_get(struct ath_hal *ah, HAL_BOOL is_2ghz) { ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; if (is_2ghz) { return eep->modal_header_2g.switchcomspdt; } else { return eep->modal_header_5g.switchcomspdt; } } u_int32_t ar9300_ant_ctrl_common2_get(struct ath_hal *ah, HAL_BOOL is_2ghz) { ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; if (is_2ghz) { return eep->modal_header_2g.ant_ctrl_common2; } else { return eep->modal_header_5g.ant_ctrl_common2; } } u_int16_t ar9300_ant_ctrl_chain_get(struct ath_hal *ah, int chain, HAL_BOOL is_2ghz) { ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; if (chain >= 0 && chain < OSPREY_MAX_CHAINS) { if (is_2ghz) { return eep->modal_header_2g.ant_ctrl_chain[chain]; } else { return eep->modal_header_5g.ant_ctrl_chain[chain]; } } return 0; } /* * Select the usage of antenna via the RF switch. * Default values are loaded from eeprom. */ HAL_BOOL ar9300_ant_swcom_sel(struct ath_hal *ah, u_int8_t ops, u_int32_t *common_tbl1, u_int32_t *common_tbl2) { ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; struct ath_hal_private *ap = AH_PRIVATE(ah); const struct ieee80211_channel *curchan = ap->ah_curchan; enum { ANT_SELECT_OPS_GET, ANT_SELECT_OPS_SET, }; if (AR_SREV_JUPITER(ah) || AR_SREV_SCORPION(ah)) return AH_FALSE; if (!curchan) return AH_FALSE; #define AR_SWITCH_TABLE_COM_ALL (0xffff) #define AR_SWITCH_TABLE_COM_ALL_S (0) #define AR_SWITCH_TABLE_COM2_ALL (0xffffff) #define AR_SWITCH_TABLE_COM2_ALL_S (0) switch (ops) { case ANT_SELECT_OPS_GET: *common_tbl1 = OS_REG_READ_FIELD(ah, AR_PHY_SWITCH_COM, AR_SWITCH_TABLE_COM_ALL); *common_tbl2 = OS_REG_READ_FIELD(ah, AR_PHY_SWITCH_COM_2, AR_SWITCH_TABLE_COM2_ALL); break; case ANT_SELECT_OPS_SET: OS_REG_RMW_FIELD(ah, AR_PHY_SWITCH_COM, AR_SWITCH_TABLE_COM_ALL, *common_tbl1); OS_REG_RMW_FIELD(ah, AR_PHY_SWITCH_COM_2, AR_SWITCH_TABLE_COM2_ALL, *common_tbl2); /* write back to eeprom */ if (IEEE80211_IS_CHAN_2GHZ(curchan)) { eep->modal_header_2g.ant_ctrl_common = *common_tbl1; eep->modal_header_2g.ant_ctrl_common2 = *common_tbl2; } else { eep->modal_header_5g.ant_ctrl_common = *common_tbl1; eep->modal_header_5g.ant_ctrl_common2 = *common_tbl2; } break; default: break; } return AH_TRUE; } HAL_BOOL ar9300_ant_ctrl_apply(struct ath_hal *ah, HAL_BOOL is_2ghz) { u_int32_t value; struct ath_hal_9300 *ahp = AH9300(ah); u_int32_t regval; struct ath_hal_private *ahpriv = AH_PRIVATE(ah); #if ATH_ANT_DIV_COMB HAL_CAPABILITIES *pcap = &ahpriv->ah_caps; #endif /* ATH_ANT_DIV_COMB */ u_int32_t xlan_gpio_cfg; u_int8_t i; HALDEBUG(ah, HAL_DEBUG_BT_COEX, "%s: use_bt_ant_enable=%d\n", __func__, ahp->ah_lna_div_use_bt_ant_enable); /* XXX TODO: only if rx_gain_idx == 0 */ if (AR_SREV_POSEIDON(ah)) { xlan_gpio_cfg = ah->ah_config.ath_hal_ext_lna_ctl_gpio; if (xlan_gpio_cfg) { for (i = 0; i < 32; i++) { if (xlan_gpio_cfg & (1 << i)) { ath_hal_gpioCfgOutput(ah, i, HAL_GPIO_OUTPUT_MUX_PCIE_ATTENTION_LED); } } } } #define AR_SWITCH_TABLE_COM_ALL (0xffff) #define AR_SWITCH_TABLE_COM_ALL_S (0) #define AR_SWITCH_TABLE_COM_JUPITER_ALL (0xffffff) #define AR_SWITCH_TABLE_COM_JUPITER_ALL_S (0) #define AR_SWITCH_TABLE_COM_SCORPION_ALL (0xffffff) #define AR_SWITCH_TABLE_COM_SCORPION_ALL_S (0) #define AR_SWITCH_TABLE_COM_HONEYBEE_ALL (0xffffff) #define AR_SWITCH_TABLE_COM_HONEYBEE_ALL_S (0) #define AR_SWITCH_TABLE_COM_SPDT (0x00f00000) value = ar9300_ant_ctrl_common_get(ah, is_2ghz); if (AR_SREV_JUPITER(ah) || AR_SREV_APHRODITE(ah)) { if (AR_SREV_JUPITER_10(ah)) { /* Force SPDT setting for Jupiter 1.0 chips. */ value &= ~AR_SWITCH_TABLE_COM_SPDT; value |= 0x00100000; } OS_REG_RMW_FIELD(ah, AR_PHY_SWITCH_COM, AR_SWITCH_TABLE_COM_JUPITER_ALL, value); } else if (AR_SREV_SCORPION(ah)) { OS_REG_RMW_FIELD(ah, AR_PHY_SWITCH_COM, AR_SWITCH_TABLE_COM_SCORPION_ALL, value); } else if (AR_SREV_HONEYBEE(ah)) { OS_REG_RMW_FIELD(ah, AR_PHY_SWITCH_COM, AR_SWITCH_TABLE_COM_HONEYBEE_ALL, value); } else { OS_REG_RMW_FIELD(ah, AR_PHY_SWITCH_COM, AR_SWITCH_TABLE_COM_ALL, value); } /* * Jupiter2.0 defines new switch table for BT/WLAN, * here's new field name in WB222.ref for both 2G and 5G. * Register: [GLB_CONTROL] GLB_CONTROL (@0x20044) * 15:12 R/W SWITCH_TABLE_COM_SPDT_WLAN_RX SWITCH_TABLE_COM_SPDT_WLAN_RX * 11:8 R/W SWITCH_TABLE_COM_SPDT_WLAN_TX SWITCH_TABLE_COM_SPDT_WLAN_TX * 7:4 R/W SWITCH_TABLE_COM_SPDT_WLAN_IDLE SWITCH_TABLE_COM_SPDT_WLAN_IDLE */ #define AR_SWITCH_TABLE_COM_SPDT_ALL (0x0000fff0) #define AR_SWITCH_TABLE_COM_SPDT_ALL_S (4) if (AR_SREV_JUPITER_20_OR_LATER(ah) || AR_SREV_APHRODITE(ah)) { value = ar9300_switch_com_spdt_get(ah, is_2ghz); OS_REG_RMW_FIELD(ah, AR_GLB_CONTROL, AR_SWITCH_TABLE_COM_SPDT_ALL, value); OS_REG_SET_BIT(ah, AR_GLB_CONTROL, AR_BTCOEX_CTRL_SPDT_ENABLE); //OS_REG_SET_BIT(ah, AR_GLB_CONTROL, // AR_BTCOEX_CTRL_BT_OWN_SPDT_CTRL); } #define AR_SWITCH_TABLE_COM2_ALL (0xffffff) #define AR_SWITCH_TABLE_COM2_ALL_S (0) value = ar9300_ant_ctrl_common2_get(ah, is_2ghz); #if ATH_ANT_DIV_COMB if ( AR_SREV_POSEIDON(ah) && (ahp->ah_lna_div_use_bt_ant_enable == TRUE) ) { value &= ~AR_SWITCH_TABLE_COM2_ALL; value |= ah->ah_config.ath_hal_ant_ctrl_comm2g_switch_enable; HALDEBUG(ah, HAL_DEBUG_RESET, "%s: com2=0x%08x\n", __func__, value) } #endif /* ATH_ANT_DIV_COMB */ OS_REG_RMW_FIELD(ah, AR_PHY_SWITCH_COM_2, AR_SWITCH_TABLE_COM2_ALL, value); #define AR_SWITCH_TABLE_ALL (0xfff) #define AR_SWITCH_TABLE_ALL_S (0) value = ar9300_ant_ctrl_chain_get(ah, 0, is_2ghz); OS_REG_RMW_FIELD(ah, AR_PHY_SWITCH_CHAIN_0, AR_SWITCH_TABLE_ALL, value); if (!AR_SREV_HORNET(ah) && !AR_SREV_POSEIDON(ah) && !AR_SREV_APHRODITE(ah)) { value = ar9300_ant_ctrl_chain_get(ah, 1, is_2ghz); OS_REG_RMW_FIELD(ah, AR_PHY_SWITCH_CHAIN_1, AR_SWITCH_TABLE_ALL, value); if (!AR_SREV_WASP(ah) && !AR_SREV_JUPITER(ah) && !AR_SREV_HONEYBEE(ah)) { value = ar9300_ant_ctrl_chain_get(ah, 2, is_2ghz); OS_REG_RMW_FIELD(ah, AR_PHY_SWITCH_CHAIN_2, AR_SWITCH_TABLE_ALL, value); } } if (AR_SREV_HORNET(ah) || AR_SREV_POSEIDON(ah) || AR_SREV_APHRODITE(ah)) { value = ar9300_eeprom_get(ahp, EEP_ANTDIV_control); /* main_lnaconf, alt_lnaconf, main_tb, alt_tb */ regval = OS_REG_READ(ah, AR_PHY_MC_GAIN_CTRL); regval &= (~ANT_DIV_CONTROL_ALL); /* clear bit 25~30 */ regval |= (value & 0x3f) << ANT_DIV_CONTROL_ALL_S; /* enable_lnadiv */ regval &= (~MULTICHAIN_GAIN_CTRL__ENABLE_ANT_DIV_LNADIV__MASK); regval |= ((value >> 6) & 0x1) << MULTICHAIN_GAIN_CTRL__ENABLE_ANT_DIV_LNADIV__SHIFT; #if ATH_ANT_DIV_COMB if ( AR_SREV_POSEIDON(ah) && (ahp->ah_lna_div_use_bt_ant_enable == TRUE) ) { regval |= ANT_DIV_ENABLE; } if (AR_SREV_APHRODITE(ah)) { if (ahp->ah_lna_div_use_bt_ant_enable) { regval |= (1 << MULTICHAIN_GAIN_CTRL__ENABLE_ANT_SW_RX_PROT__SHIFT); OS_REG_SET_BIT(ah, AR_PHY_RESTART, RESTART__ENABLE_ANT_FAST_DIV_M2FLAG__MASK); /* Force WLAN LNA diversity ON */ OS_REG_SET_BIT(ah, AR_BTCOEX_WL_LNADIV, AR_BTCOEX_WL_LNADIV_FORCE_ON); } else { regval &= ~(1 << MULTICHAIN_GAIN_CTRL__ENABLE_ANT_DIV_LNADIV__SHIFT); regval &= ~(1 << MULTICHAIN_GAIN_CTRL__ENABLE_ANT_SW_RX_PROT__SHIFT); OS_REG_CLR_BIT(ah, AR_PHY_MC_GAIN_CTRL, (1 << MULTICHAIN_GAIN_CTRL__ENABLE_ANT_SW_RX_PROT__SHIFT)); /* Force WLAN LNA diversity OFF */ OS_REG_CLR_BIT(ah, AR_BTCOEX_WL_LNADIV, AR_BTCOEX_WL_LNADIV_FORCE_ON); } } #endif /* ATH_ANT_DIV_COMB */ OS_REG_WRITE(ah, AR_PHY_MC_GAIN_CTRL, regval); /* enable fast_div */ regval = OS_REG_READ(ah, AR_PHY_CCK_DETECT); regval &= (~BBB_SIG_DETECT__ENABLE_ANT_FAST_DIV__MASK); regval |= ((value >> 7) & 0x1) << BBB_SIG_DETECT__ENABLE_ANT_FAST_DIV__SHIFT; #if ATH_ANT_DIV_COMB if ((AR_SREV_POSEIDON(ah) || AR_SREV_APHRODITE(ah)) && (ahp->ah_lna_div_use_bt_ant_enable == TRUE) ) { regval |= FAST_DIV_ENABLE; } #endif /* ATH_ANT_DIV_COMB */ OS_REG_WRITE(ah, AR_PHY_CCK_DETECT, regval); } #if ATH_ANT_DIV_COMB if (AR_SREV_HORNET(ah) || AR_SREV_POSEIDON_11_OR_LATER(ah)) { if (pcap->halAntDivCombSupport) { /* If support DivComb, set MAIN to LNA1, ALT to LNA2 at beginning */ regval = OS_REG_READ(ah, AR_PHY_MC_GAIN_CTRL); /* clear bit 25~30 main_lnaconf, alt_lnaconf, main_tb, alt_tb */ regval &= (~(MULTICHAIN_GAIN_CTRL__ANT_DIV_MAIN_LNACONF__MASK | MULTICHAIN_GAIN_CTRL__ANT_DIV_ALT_LNACONF__MASK | MULTICHAIN_GAIN_CTRL__ANT_DIV_ALT_GAINTB__MASK | MULTICHAIN_GAIN_CTRL__ANT_DIV_MAIN_GAINTB__MASK)); regval |= (HAL_ANT_DIV_COMB_LNA1 << MULTICHAIN_GAIN_CTRL__ANT_DIV_MAIN_LNACONF__SHIFT); regval |= (HAL_ANT_DIV_COMB_LNA2 << MULTICHAIN_GAIN_CTRL__ANT_DIV_ALT_LNACONF__SHIFT); OS_REG_WRITE(ah, AR_PHY_MC_GAIN_CTRL, regval); } } #endif /* ATH_ANT_DIV_COMB */ if (AR_SREV_POSEIDON(ah) && ( ahp->ah_diversity_control == HAL_ANT_FIXED_A || ahp->ah_diversity_control == HAL_ANT_FIXED_B)) { u_int32_t reg_val = OS_REG_READ(ah, AR_PHY_MC_GAIN_CTRL); reg_val &= ~(MULTICHAIN_GAIN_CTRL__ANT_DIV_MAIN_LNACONF__MASK | MULTICHAIN_GAIN_CTRL__ANT_DIV_ALT_LNACONF__MASK | MULTICHAIN_GAIN_CTRL__ANT_FAST_DIV_BIAS__MASK | MULTICHAIN_GAIN_CTRL__ANT_DIV_MAIN_GAINTB__MASK | MULTICHAIN_GAIN_CTRL__ANT_DIV_ALT_GAINTB__MASK ); switch (ahp->ah_diversity_control) { case HAL_ANT_FIXED_A: /* Enable first antenna only */ reg_val |= (HAL_ANT_DIV_COMB_LNA1 << MULTICHAIN_GAIN_CTRL__ANT_DIV_MAIN_LNACONF__SHIFT); reg_val |= (HAL_ANT_DIV_COMB_LNA2 << MULTICHAIN_GAIN_CTRL__ANT_DIV_ALT_LNACONF__SHIFT); /* main/alt gain table and Fast Div Bias all set to 0 */ OS_REG_WRITE(ah, AR_PHY_MC_GAIN_CTRL, reg_val); regval = OS_REG_READ(ah, AR_PHY_CCK_DETECT); regval &= (~BBB_SIG_DETECT__ENABLE_ANT_FAST_DIV__MASK); OS_REG_WRITE(ah, AR_PHY_CCK_DETECT, regval); break; case HAL_ANT_FIXED_B: /* Enable second antenna only, after checking capability */ reg_val |= (HAL_ANT_DIV_COMB_LNA2 << MULTICHAIN_GAIN_CTRL__ANT_DIV_MAIN_LNACONF__SHIFT); reg_val |= (HAL_ANT_DIV_COMB_LNA1 << MULTICHAIN_GAIN_CTRL__ANT_DIV_ALT_LNACONF__SHIFT); /* main/alt gain table and Fast Div all set to 0 */ OS_REG_WRITE(ah, AR_PHY_MC_GAIN_CTRL, reg_val); regval = OS_REG_READ(ah, AR_PHY_CCK_DETECT); regval &= (~BBB_SIG_DETECT__ENABLE_ANT_FAST_DIV__MASK); OS_REG_WRITE(ah, AR_PHY_CCK_DETECT, regval); /* For WB225, need to swith ANT2 from BT to Wifi * This will not affect HB125 LNA diversity feature. */ HALDEBUG(ah, HAL_DEBUG_RESET, "%s: com2=0x%08x\n", __func__, ah->ah_config.ath_hal_ant_ctrl_comm2g_switch_enable) OS_REG_RMW_FIELD(ah, AR_PHY_SWITCH_COM_2, AR_SWITCH_TABLE_COM2_ALL, ah->ah_config.ath_hal_ant_ctrl_comm2g_switch_enable); break; default: break; } } return 0; } static u_int16_t ar9300_attenuation_chain_get(struct ath_hal *ah, int chain, u_int16_t channel) { int32_t f[3], t[3]; u_int16_t value; ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; if (chain >= 0 && chain < OSPREY_MAX_CHAINS) { if (channel < 4000) { return eep->modal_header_2g.xatten1_db[chain]; } else { if (eep->base_ext2.xatten1_db_low[chain] != 0) { t[0] = eep->base_ext2.xatten1_db_low[chain]; f[0] = 5180; t[1] = eep->modal_header_5g.xatten1_db[chain]; f[1] = 5500; t[2] = eep->base_ext2.xatten1_db_high[chain]; f[2] = 5785; value = interpolate(channel, f, t, 3); return value; } else { return eep->modal_header_5g.xatten1_db[chain]; } } } return 0; } static u_int16_t ar9300_attenuation_margin_chain_get(struct ath_hal *ah, int chain, u_int16_t channel) { int32_t f[3], t[3]; u_int16_t value; ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; if (chain >= 0 && chain < OSPREY_MAX_CHAINS) { if (channel < 4000) { return eep->modal_header_2g.xatten1_margin[chain]; } else { if (eep->base_ext2.xatten1_margin_low[chain] != 0) { t[0] = eep->base_ext2.xatten1_margin_low[chain]; f[0] = 5180; t[1] = eep->modal_header_5g.xatten1_margin[chain]; f[1] = 5500; t[2] = eep->base_ext2.xatten1_margin_high[chain]; f[2] = 5785; value = interpolate(channel, f, t, 3); return value; } else { return eep->modal_header_5g.xatten1_margin[chain]; } } } return 0; } #if 0 HAL_BOOL ar9300_attenuation_apply(struct ath_hal *ah, u_int16_t channel) { u_int32_t value; // struct ath_hal_private *ahpriv = AH_PRIVATE(ah); /* Test value. if 0 then attenuation is unused. Don't load anything. */ value = ar9300_attenuation_chain_get(ah, 0, channel); OS_REG_RMW_FIELD(ah, AR_PHY_EXT_ATTEN_CTL_0, AR_PHY_EXT_ATTEN_CTL_XATTEN1_DB, value); value = ar9300_attenuation_margin_chain_get(ah, 0, channel); if (ar9300_rx_gain_index_get(ah) == 0 && ah->ah_config.ath_hal_ext_atten_margin_cfg) { value = 5; } OS_REG_RMW_FIELD(ah, AR_PHY_EXT_ATTEN_CTL_0, AR_PHY_EXT_ATTEN_CTL_XATTEN1_MARGIN, value); if (!AR_SREV_HORNET(ah) && !AR_SREV_POSEIDON(ah)) { value = ar9300_attenuation_chain_get(ah, 1, channel); OS_REG_RMW_FIELD(ah, AR_PHY_EXT_ATTEN_CTL_1, AR_PHY_EXT_ATTEN_CTL_XATTEN1_DB, value); value = ar9300_attenuation_margin_chain_get(ah, 1, channel); OS_REG_RMW_FIELD(ah, AR_PHY_EXT_ATTEN_CTL_1, AR_PHY_EXT_ATTEN_CTL_XATTEN1_MARGIN, value); if (!AR_SREV_WASP(ah) && !AR_SREV_JUPITER(ah)&& !AR_SREV_HONEYBEE(ah) ) { value = ar9300_attenuation_chain_get(ah, 2, channel); OS_REG_RMW_FIELD(ah, AR_PHY_EXT_ATTEN_CTL_2, AR_PHY_EXT_ATTEN_CTL_XATTEN1_DB, value); value = ar9300_attenuation_margin_chain_get(ah, 2, channel); OS_REG_RMW_FIELD(ah, AR_PHY_EXT_ATTEN_CTL_2, AR_PHY_EXT_ATTEN_CTL_XATTEN1_MARGIN, value); } } return 0; } #endif HAL_BOOL ar9300_attenuation_apply(struct ath_hal *ah, u_int16_t channel) { int i; uint32_t value; uint32_t ext_atten_reg[3] = { AR_PHY_EXT_ATTEN_CTL_0, AR_PHY_EXT_ATTEN_CTL_1, AR_PHY_EXT_ATTEN_CTL_2 }; /* * If it's an AR9462 and we're receiving on the second * chain only, set the chain 0 details from chain 1 * calibration. * * This is from ath9k. */ if (AR_SREV_JUPITER(ah) && (AH9300(ah)->ah_rx_chainmask == 0x2)) { value = ar9300_attenuation_chain_get(ah, 1, channel); OS_REG_RMW_FIELD(ah, ext_atten_reg[0], AR_PHY_EXT_ATTEN_CTL_XATTEN1_DB, value); value = ar9300_attenuation_margin_chain_get(ah, 1, channel); OS_REG_RMW_FIELD(ah, ext_atten_reg[0], AR_PHY_EXT_ATTEN_CTL_XATTEN1_MARGIN, value); } /* * Now, loop over the configured transmit chains and * load in the attenuation/margin settings as appropriate. */ for (i = 0; i < 3; i++) { if ((AH9300(ah)->ah_tx_chainmask & (1 << i)) == 0) continue; value = ar9300_attenuation_chain_get(ah, i, channel); OS_REG_RMW_FIELD(ah, ext_atten_reg[i], AR_PHY_EXT_ATTEN_CTL_XATTEN1_DB, value); if (AR_SREV_POSEIDON(ah) && (ar9300_rx_gain_index_get(ah) == 0) && ah->ah_config.ath_hal_ext_atten_margin_cfg) { value = 5; } else { value = ar9300_attenuation_margin_chain_get(ah, 0, channel); } /* * I'm not sure why it's loading in this setting into * the chain 0 margin regardless of the current chain. */ if (ah->ah_config.ath_hal_min_gainidx) OS_REG_RMW_FIELD(ah, AR_PHY_EXT_ATTEN_CTL_0, AR_PHY_EXT_ATTEN_CTL_XATTEN1_MARGIN, value); OS_REG_RMW_FIELD(ah, ext_atten_reg[i], AR_PHY_EXT_ATTEN_CTL_XATTEN1_MARGIN, value); } return (0); } static u_int16_t ar9300_quick_drop_get(struct ath_hal *ah, int chain, u_int16_t channel) { int32_t f[3], t[3]; u_int16_t value; ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; if (channel < 4000) { return eep->modal_header_2g.quick_drop; } else { t[0] = eep->base_ext1.quick_drop_low; f[0] = 5180; t[1] = eep->modal_header_5g.quick_drop; f[1] = 5500; t[2] = eep->base_ext1.quick_drop_high; f[2] = 5785; value = interpolate(channel, f, t, 3); return value; } } static HAL_BOOL ar9300_quick_drop_apply(struct ath_hal *ah, u_int16_t channel) { ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; u_int32_t value; // // Test value. if 0 then quickDrop is unused. Don't load anything. // if (eep->base_eep_header.misc_configuration & 0x10) { if (AR_SREV_OSPREY(ah) || AR_SREV_AR9580(ah) || AR_SREV_WASP(ah)) { value = ar9300_quick_drop_get(ah, 0, channel); OS_REG_RMW_FIELD(ah, AR_PHY_AGC, AR_PHY_AGC_QUICK_DROP, value); } } return 0; } static u_int16_t ar9300_tx_end_to_xpa_off_get(struct ath_hal *ah, u_int16_t channel) { ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; if (channel < 4000) { return eep->modal_header_2g.tx_end_to_xpa_off; } else { return eep->modal_header_5g.tx_end_to_xpa_off; } } static HAL_BOOL ar9300_tx_end_to_xpab_off_apply(struct ath_hal *ah, u_int16_t channel) { u_int32_t value; value = ar9300_tx_end_to_xpa_off_get(ah, channel); /* Apply to both xpaa and xpab */ if (AR_SREV_OSPREY(ah) || AR_SREV_AR9580(ah) || AR_SREV_WASP(ah)) { OS_REG_RMW_FIELD(ah, AR_PHY_XPA_TIMING_CTL, AR_PHY_XPA_TIMING_CTL_TX_END_XPAB_OFF, value); OS_REG_RMW_FIELD(ah, AR_PHY_XPA_TIMING_CTL, AR_PHY_XPA_TIMING_CTL_TX_END_XPAA_OFF, value); } return 0; } static int ar9300_eeprom_cal_pier_get(struct ath_hal *ah, int mode, int ipier, int ichain, int *pfrequency, int *pcorrection, int *ptemperature, int *pvoltage) { u_int8_t *p_cal_pier; OSP_CAL_DATA_PER_FREQ_OP_LOOP *p_cal_pier_struct; int is_2ghz; ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; if (ichain >= OSPREY_MAX_CHAINS) { HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: Invalid chain index, must be less than %d\n", __func__, OSPREY_MAX_CHAINS); return -1; } if (mode) {/* 5GHz */ if (ipier >= OSPREY_NUM_5G_CAL_PIERS){ HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: Invalid 5GHz cal pier index, must be less than %d\n", __func__, OSPREY_NUM_5G_CAL_PIERS); return -1; } p_cal_pier = &(eep->cal_freq_pier_5g[ipier]); p_cal_pier_struct = &(eep->cal_pier_data_5g[ichain][ipier]); is_2ghz = 0; } else { if (ipier >= OSPREY_NUM_2G_CAL_PIERS){ HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: Invalid 2GHz cal pier index, must be less than %d\n", __func__, OSPREY_NUM_2G_CAL_PIERS); return -1; } p_cal_pier = &(eep->cal_freq_pier_2g[ipier]); p_cal_pier_struct = &(eep->cal_pier_data_2g[ichain][ipier]); is_2ghz = 1; } *pfrequency = FBIN2FREQ(*p_cal_pier, is_2ghz); *pcorrection = p_cal_pier_struct->ref_power; *ptemperature = p_cal_pier_struct->temp_meas; *pvoltage = p_cal_pier_struct->volt_meas; return 0; } /* * Apply the recorded correction values. */ static int ar9300_calibration_apply(struct ath_hal *ah, int frequency) { struct ath_hal_9300 *ahp = AH9300(ah); int ichain, ipier, npier; int mode; int fdiff; int pfrequency, pcorrection, ptemperature, pvoltage; int bf, factor, plus; int lfrequency[AR9300_MAX_CHAINS]; int hfrequency[AR9300_MAX_CHAINS]; int lcorrection[AR9300_MAX_CHAINS]; int hcorrection[AR9300_MAX_CHAINS]; int correction[AR9300_MAX_CHAINS]; int ltemperature[AR9300_MAX_CHAINS]; int htemperature[AR9300_MAX_CHAINS]; int temperature[AR9300_MAX_CHAINS]; int lvoltage[AR9300_MAX_CHAINS]; int hvoltage[AR9300_MAX_CHAINS]; int voltage[AR9300_MAX_CHAINS]; mode = (frequency >= 4000); npier = (mode) ? OSPREY_NUM_5G_CAL_PIERS : OSPREY_NUM_2G_CAL_PIERS; for (ichain = 0; ichain < AR9300_MAX_CHAINS; ichain++) { lfrequency[ichain] = 0; hfrequency[ichain] = 100000; } /* * identify best lower and higher frequency calibration measurement */ for (ichain = 0; ichain < AR9300_MAX_CHAINS; ichain++) { for (ipier = 0; ipier < npier; ipier++) { if (ar9300_eeprom_cal_pier_get( ah, mode, ipier, ichain, &pfrequency, &pcorrection, &ptemperature, &pvoltage) == 0) { fdiff = frequency - pfrequency; /* * this measurement is higher than our desired frequency */ if (fdiff <= 0) { if (hfrequency[ichain] <= 0 || hfrequency[ichain] >= 100000 || fdiff > (frequency - hfrequency[ichain])) { /* * new best higher frequency measurement */ hfrequency[ichain] = pfrequency; hcorrection[ichain] = pcorrection; htemperature[ichain] = ptemperature; hvoltage[ichain] = pvoltage; } } if (fdiff >= 0) { if (lfrequency[ichain] <= 0 || fdiff < (frequency - lfrequency[ichain])) { /* * new best lower frequency measurement */ lfrequency[ichain] = pfrequency; lcorrection[ichain] = pcorrection; ltemperature[ichain] = ptemperature; lvoltage[ichain] = pvoltage; } } } } } /* interpolate */ for (ichain = 0; ichain < AR9300_MAX_CHAINS; ichain++) { HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: ch=%d f=%d low=%d %d h=%d %d\n", __func__, ichain, frequency, lfrequency[ichain], lcorrection[ichain], hfrequency[ichain], hcorrection[ichain]); /* * they're the same, so just pick one */ if (hfrequency[ichain] == lfrequency[ichain]) { correction[ichain] = lcorrection[ichain]; voltage[ichain] = lvoltage[ichain]; temperature[ichain] = ltemperature[ichain]; } else if (frequency - lfrequency[ichain] < 1000) { /* the low frequency is good */ if (hfrequency[ichain] - frequency < 1000) { /* * The high frequency is good too - * interpolate with round off. */ int mult, div, diff; mult = frequency - lfrequency[ichain]; div = hfrequency[ichain] - lfrequency[ichain]; diff = hcorrection[ichain] - lcorrection[ichain]; bf = 2 * diff * mult / div; plus = (bf % 2); factor = bf / 2; correction[ichain] = lcorrection[ichain] + factor + plus; diff = htemperature[ichain] - ltemperature[ichain]; bf = 2 * diff * mult / div; plus = (bf % 2); factor = bf / 2; temperature[ichain] = ltemperature[ichain] + factor + plus; diff = hvoltage[ichain] - lvoltage[ichain]; bf = 2 * diff * mult / div; plus = (bf % 2); factor = bf / 2; voltage[ichain] = lvoltage[ichain] + factor + plus; } else { /* only low is good, use it */ correction[ichain] = lcorrection[ichain]; temperature[ichain] = ltemperature[ichain]; voltage[ichain] = lvoltage[ichain]; } } else if (hfrequency[ichain] - frequency < 1000) { /* only high is good, use it */ correction[ichain] = hcorrection[ichain]; temperature[ichain] = htemperature[ichain]; voltage[ichain] = hvoltage[ichain]; } else { /* nothing is good, presume 0???? */ correction[ichain] = 0; temperature[ichain] = 0; voltage[ichain] = 0; } } /* GreenTx isn't currently supported */ /* GreenTx */ if (ah->ah_config.ath_hal_sta_update_tx_pwr_enable) { if (AR_SREV_POSEIDON(ah)) { /* Get calibrated OLPC gain delta value for GreenTx */ ahp->ah_db2[POSEIDON_STORED_REG_G2_OLPC_OFFSET] = (u_int32_t) correction[0]; } } ar9300_power_control_override( ah, frequency, correction, voltage, temperature); HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: for frequency=%d, calibration correction = %d %d %d\n", __func__, frequency, correction[0], correction[1], correction[2]); return 0; } int ar9300_power_control_override(struct ath_hal *ah, int frequency, int *correction, int *voltage, int *temperature) { int temp_slope = 0; int temp_slope_1 = 0; int temp_slope_2 = 0; ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; int32_t f[8], t[8],t1[3], t2[3]; int i; OS_REG_RMW(ah, AR_PHY_TPC_11_B0, (correction[0] << AR_PHY_TPC_OLPC_GAIN_DELTA_S), AR_PHY_TPC_OLPC_GAIN_DELTA); if (!AR_SREV_POSEIDON(ah)) { OS_REG_RMW(ah, AR_PHY_TPC_11_B1, (correction[1] << AR_PHY_TPC_OLPC_GAIN_DELTA_S), AR_PHY_TPC_OLPC_GAIN_DELTA); if (!AR_SREV_WASP(ah) && !AR_SREV_JUPITER(ah) && !AR_SREV_HONEYBEE(ah) ) { OS_REG_RMW(ah, AR_PHY_TPC_11_B2, (correction[2] << AR_PHY_TPC_OLPC_GAIN_DELTA_S), AR_PHY_TPC_OLPC_GAIN_DELTA); } } /* * enable open loop power control on chip */ OS_REG_RMW(ah, AR_PHY_TPC_6_B0, (3 << AR_PHY_TPC_6_ERROR_EST_MODE_S), AR_PHY_TPC_6_ERROR_EST_MODE); if (!AR_SREV_POSEIDON(ah)) { OS_REG_RMW(ah, AR_PHY_TPC_6_B1, (3 << AR_PHY_TPC_6_ERROR_EST_MODE_S), AR_PHY_TPC_6_ERROR_EST_MODE); if (!AR_SREV_WASP(ah) && !AR_SREV_JUPITER(ah) && !AR_SREV_HONEYBEE(ah) ) { OS_REG_RMW(ah, AR_PHY_TPC_6_B2, (3 << AR_PHY_TPC_6_ERROR_EST_MODE_S), AR_PHY_TPC_6_ERROR_EST_MODE); } } /* * Enable temperature compensation * Need to use register names */ if (frequency < 4000) { temp_slope = eep->modal_header_2g.temp_slope; } else { if ((eep->base_eep_header.misc_configuration & 0x20) != 0) { for(i=0;i<8;i++) { t[i]=eep->base_ext1.tempslopextension[i]; f[i]=FBIN2FREQ(eep->cal_freq_pier_5g[i], 0); } temp_slope=interpolate(frequency,f,t,8); } else { if(!AR_SREV_SCORPION(ah)) { if (eep->base_ext2.temp_slope_low != 0) { t[0] = eep->base_ext2.temp_slope_low; f[0] = 5180; t[1] = eep->modal_header_5g.temp_slope; f[1] = 5500; t[2] = eep->base_ext2.temp_slope_high; f[2] = 5785; temp_slope = interpolate(frequency, f, t, 3); } else { temp_slope = eep->modal_header_5g.temp_slope; } } else { /* * Scorpion has individual chain tempslope values */ t[0] = eep->base_ext1.tempslopextension[2]; t1[0]= eep->base_ext1.tempslopextension[3]; t2[0]= eep->base_ext1.tempslopextension[4]; f[0] = 5180; t[1] = eep->modal_header_5g.temp_slope; t1[1]= eep->base_ext1.tempslopextension[0]; t2[1]= eep->base_ext1.tempslopextension[1]; f[1] = 5500; t[2] = eep->base_ext1.tempslopextension[5]; t1[2]= eep->base_ext1.tempslopextension[6]; t2[2]= eep->base_ext1.tempslopextension[7]; f[2] = 5785; temp_slope = interpolate(frequency, f, t, 3); temp_slope_1=interpolate(frequency, f, t1,3); temp_slope_2=interpolate(frequency, f, t2,3); } } } if (!AR_SREV_SCORPION(ah) && !AR_SREV_HONEYBEE(ah)) { OS_REG_RMW_FIELD(ah, AR_PHY_TPC_19, AR_PHY_TPC_19_ALPHA_THERM, temp_slope); } else { /*Scorpion and Honeybee has tempSlope register for each chain*/ /*Check whether temp_compensation feature is enabled or not*/ if (eep->base_eep_header.feature_enable & 0x1){ if(frequency < 4000) { if (((eep->base_eep_header.txrx_mask & 0xf0) >> 4) & 0x1) { OS_REG_RMW_FIELD(ah, AR_PHY_TPC_19, AR_PHY_TPC_19_ALPHA_THERM, eep->base_ext2.temp_slope_low); } if (((eep->base_eep_header.txrx_mask & 0xf0) >> 4) & 0x2) { OS_REG_RMW_FIELD(ah, AR_SCORPION_PHY_TPC_19_B1, AR_PHY_TPC_19_ALPHA_THERM, temp_slope); } if (((eep->base_eep_header.txrx_mask & 0xf0) >> 4) & 0x4) { OS_REG_RMW_FIELD(ah, AR_SCORPION_PHY_TPC_19_B2, AR_PHY_TPC_19_ALPHA_THERM, eep->base_ext2.temp_slope_high); } } else { if (((eep->base_eep_header.txrx_mask & 0xf0) >> 4) & 0x1) { OS_REG_RMW_FIELD(ah, AR_PHY_TPC_19, AR_PHY_TPC_19_ALPHA_THERM, temp_slope); } if (((eep->base_eep_header.txrx_mask & 0xf0) >> 4) & 0x2) { OS_REG_RMW_FIELD(ah, AR_SCORPION_PHY_TPC_19_B1, AR_PHY_TPC_19_ALPHA_THERM, temp_slope_1); } if (((eep->base_eep_header.txrx_mask & 0xf0) >> 4) & 0x4) { OS_REG_RMW_FIELD(ah, AR_SCORPION_PHY_TPC_19_B2, AR_PHY_TPC_19_ALPHA_THERM, temp_slope_2); } } }else { /* If temp compensation is not enabled, set all registers to 0*/ if (((eep->base_eep_header.txrx_mask & 0xf0) >> 4) & 0x1) { OS_REG_RMW_FIELD(ah, AR_PHY_TPC_19, AR_PHY_TPC_19_ALPHA_THERM, 0); } if (((eep->base_eep_header.txrx_mask & 0xf0) >> 4) & 0x2) { OS_REG_RMW_FIELD(ah, AR_SCORPION_PHY_TPC_19_B1, AR_PHY_TPC_19_ALPHA_THERM, 0); } if (((eep->base_eep_header.txrx_mask & 0xf0) >> 4) & 0x4) { OS_REG_RMW_FIELD(ah, AR_SCORPION_PHY_TPC_19_B2, AR_PHY_TPC_19_ALPHA_THERM, 0); } } } OS_REG_RMW_FIELD(ah, AR_PHY_TPC_18, AR_PHY_TPC_18_THERM_CAL_VALUE, temperature[0]); return 0; } /************************************************************** * ar9300_eep_def_get_max_edge_power * * Find the maximum conformance test limit for the given channel and CTL info */ static inline u_int16_t ar9300_eep_def_get_max_edge_power(ar9300_eeprom_t *p_eep_data, u_int16_t freq, int idx, HAL_BOOL is_2ghz) { u_int16_t twice_max_edge_power = AR9300_MAX_RATE_POWER; u_int8_t *ctl_freqbin = is_2ghz ? &p_eep_data->ctl_freqbin_2G[idx][0] : &p_eep_data->ctl_freqbin_5G[idx][0]; u_int16_t num_edges = is_2ghz ? OSPREY_NUM_BAND_EDGES_2G : OSPREY_NUM_BAND_EDGES_5G; int i; /* Get the edge power */ for (i = 0; (i < num_edges) && (ctl_freqbin[i] != AR9300_BCHAN_UNUSED); i++) { /* * If there's an exact channel match or an inband flag set * on the lower channel use the given rd_edge_power */ if (freq == fbin2freq(ctl_freqbin[i], is_2ghz)) { if (is_2ghz) { twice_max_edge_power = p_eep_data->ctl_power_data_2g[idx].ctl_edges[i].t_power; } else { twice_max_edge_power = p_eep_data->ctl_power_data_5g[idx].ctl_edges[i].t_power; } break; } else if ((i > 0) && (freq < fbin2freq(ctl_freqbin[i], is_2ghz))) { if (is_2ghz) { if (fbin2freq(ctl_freqbin[i - 1], 1) < freq && p_eep_data->ctl_power_data_2g[idx].ctl_edges[i - 1].flag) { twice_max_edge_power = p_eep_data->ctl_power_data_2g[idx]. ctl_edges[i - 1].t_power; } } else { if (fbin2freq(ctl_freqbin[i - 1], 0) < freq && p_eep_data->ctl_power_data_5g[idx].ctl_edges[i - 1].flag) { twice_max_edge_power = p_eep_data->ctl_power_data_5g[idx]. ctl_edges[i - 1].t_power; } } /* * Leave loop - no more affecting edges possible * in this monotonic increasing list */ break; } } /* * EV89475: EEPROM might contain 0 txpower in CTL table for certain * 2.4GHz channels. We workaround it by overwriting 60 (30 dBm) here. */ if (is_2ghz && (twice_max_edge_power == 0)) { twice_max_edge_power = 60; } HALASSERT(twice_max_edge_power > 0); return twice_max_edge_power; } HAL_BOOL ar9300_eeprom_set_power_per_rate_table( struct ath_hal *ah, ar9300_eeprom_t *p_eep_data, const struct ieee80211_channel *chan, u_int8_t *p_pwr_array, u_int16_t cfg_ctl, u_int16_t antenna_reduction, u_int16_t twice_max_regulatory_power, u_int16_t power_limit, u_int8_t chainmask) { /* Local defines to distinguish between extension and control CTL's */ #define EXT_ADDITIVE (0x8000) #define CTL_11A_EXT (CTL_11A | EXT_ADDITIVE) #define CTL_11G_EXT (CTL_11G | EXT_ADDITIVE) #define CTL_11B_EXT (CTL_11B | EXT_ADDITIVE) #define REDUCE_SCALED_POWER_BY_TWO_CHAIN 6 /* 10*log10(2)*2 */ #define REDUCE_SCALED_POWER_BY_THREE_CHAIN 10 /* 10*log10(3)*2 */ #define PWRINCR_3_TO_1_CHAIN 9 /* 10*log(3)*2 */ #define PWRINCR_3_TO_2_CHAIN 3 /* floor(10*log(3/2)*2) */ #define PWRINCR_2_TO_1_CHAIN 6 /* 10*log(2)*2 */ static const u_int16_t tp_scale_reduction_table[5] = { 0, 3, 6, 9, AR9300_MAX_RATE_POWER }; int i; int16_t twice_largest_antenna; u_int16_t twice_antenna_reduction = 2*antenna_reduction ; int16_t scaled_power = 0, min_ctl_power, max_reg_allowed_power; #define SUB_NUM_CTL_MODES_AT_5G_40 2 /* excluding HT40, EXT-OFDM */ #define SUB_NUM_CTL_MODES_AT_2G_40 3 /* excluding HT40, EXT-OFDM, EXT-CCK */ u_int16_t ctl_modes_for11a[] = {CTL_11A, CTL_5GHT20, CTL_11A_EXT, CTL_5GHT40}; u_int16_t ctl_modes_for11g[] = {CTL_11B, CTL_11G, CTL_2GHT20, CTL_11B_EXT, CTL_11G_EXT, CTL_2GHT40}; u_int16_t num_ctl_modes, *p_ctl_mode, ctl_mode, freq; CHAN_CENTERS centers; int tx_chainmask; struct ath_hal_9300 *ahp = AH9300(ah); u_int8_t *ctl_index; u_int8_t ctl_num; u_int16_t twice_min_edge_power; u_int16_t twice_max_edge_power = AR9300_MAX_RATE_POWER; #ifdef AH_DEBUG HAL_CHANNEL_INTERNAL *ichan = ath_hal_checkchannel(ah, chan); #endif if (chainmask) tx_chainmask = chainmask; else tx_chainmask = ahp->ah_tx_chainmaskopt ? ahp->ah_tx_chainmaskopt :ahp->ah_tx_chainmask; ar9300_get_channel_centers(ah, chan, ¢ers); #if 1 if (IEEE80211_IS_CHAN_2GHZ(chan)) { ahp->twice_antenna_gain = p_eep_data->modal_header_2g.antenna_gain; } else { ahp->twice_antenna_gain = p_eep_data->modal_header_5g.antenna_gain; } #else if (IEEE80211_IS_CHAN_2GHZ(chan)) { ahp->twice_antenna_gain = AH_MAX(p_eep_data->modal_header_2g.antenna_gain, AH_PRIVATE(ah)->ah_antenna_gain_2g); } else { ahp->twice_antenna_gain = AH_MAX(p_eep_data->modal_header_5g.antenna_gain, AH_PRIVATE(ah)->ah_antenna_gain_5g); } #endif /* Save max allowed antenna gain to ease future lookups */ ahp->twice_antenna_reduction = twice_antenna_reduction; /* Deduct antenna gain from EIRP to get the upper limit */ twice_largest_antenna = (int16_t)AH_MIN((twice_antenna_reduction - ahp->twice_antenna_gain), 0); max_reg_allowed_power = twice_max_regulatory_power + twice_largest_antenna; /* Use ah_tp_scale - see bug 30070. */ if (AH_PRIVATE(ah)->ah_tpScale != HAL_TP_SCALE_MAX) { max_reg_allowed_power -= (tp_scale_reduction_table[(AH_PRIVATE(ah)->ah_tpScale)] * 2); } scaled_power = AH_MIN(power_limit, max_reg_allowed_power); /* * Reduce scaled Power by number of chains active to get to * per chain tx power level */ /* TODO: better value than these? */ switch (ar9300_get_ntxchains(tx_chainmask)) { case 1: ahp->upper_limit[0] = AH_MAX(0, scaled_power); break; case 2: scaled_power -= REDUCE_SCALED_POWER_BY_TWO_CHAIN; ahp->upper_limit[1] = AH_MAX(0, scaled_power); break; case 3: scaled_power -= REDUCE_SCALED_POWER_BY_THREE_CHAIN; ahp->upper_limit[2] = AH_MAX(0, scaled_power); break; default: HALASSERT(0); /* Unsupported number of chains */ } scaled_power = AH_MAX(0, scaled_power); /* Get target powers from EEPROM - our baseline for TX Power */ if (IEEE80211_IS_CHAN_2GHZ(chan)) { /* Setup for CTL modes */ /* CTL_11B, CTL_11G, CTL_2GHT20 */ num_ctl_modes = ARRAY_LENGTH(ctl_modes_for11g) - SUB_NUM_CTL_MODES_AT_2G_40; p_ctl_mode = ctl_modes_for11g; if (IEEE80211_IS_CHAN_HT40(chan)) { num_ctl_modes = ARRAY_LENGTH(ctl_modes_for11g); /* All 2G CTL's */ } } else { /* Setup for CTL modes */ /* CTL_11A, CTL_5GHT20 */ num_ctl_modes = ARRAY_LENGTH(ctl_modes_for11a) - SUB_NUM_CTL_MODES_AT_5G_40; p_ctl_mode = ctl_modes_for11a; if (IEEE80211_IS_CHAN_HT40(chan)) { num_ctl_modes = ARRAY_LENGTH(ctl_modes_for11a); /* All 5G CTL's */ } } /* * For MIMO, need to apply regulatory caps individually across dynamically * running modes: CCK, OFDM, HT20, HT40 * * The outer loop walks through each possible applicable runtime mode. * The inner loop walks through each ctl_index entry in EEPROM. * The ctl value is encoded as [7:4] == test group, [3:0] == test mode. * */ for (ctl_mode = 0; ctl_mode < num_ctl_modes; ctl_mode++) { HAL_BOOL is_ht40_ctl_mode = (p_ctl_mode[ctl_mode] == CTL_5GHT40) || (p_ctl_mode[ctl_mode] == CTL_2GHT40); if (is_ht40_ctl_mode) { freq = centers.synth_center; } else if (p_ctl_mode[ctl_mode] & EXT_ADDITIVE) { freq = centers.ext_center; } else { freq = centers.ctl_center; } HALDEBUG(ah, HAL_DEBUG_POWER_MGMT, "LOOP-Mode ctl_mode %d < %d, " "is_ht40_ctl_mode %d, EXT_ADDITIVE %d\n", ctl_mode, num_ctl_modes, is_ht40_ctl_mode, (p_ctl_mode[ctl_mode] & EXT_ADDITIVE)); /* walk through each CTL index stored in EEPROM */ if (IEEE80211_IS_CHAN_2GHZ(chan)) { ctl_index = p_eep_data->ctl_index_2g; ctl_num = OSPREY_NUM_CTLS_2G; } else { ctl_index = p_eep_data->ctl_index_5g; ctl_num = OSPREY_NUM_CTLS_5G; } for (i = 0; (i < ctl_num) && ctl_index[i]; i++) { HALDEBUG(ah, HAL_DEBUG_POWER_MGMT, " LOOP-Ctlidx %d: cfg_ctl 0x%2.2x p_ctl_mode 0x%2.2x " "ctl_index 0x%2.2x chan %d chanctl 0x%x\n", i, cfg_ctl, p_ctl_mode[ctl_mode], ctl_index[i], ichan->channel, ath_hal_getctl(ah, chan)); /* * compare test group from regulatory channel list * with test mode from p_ctl_mode list */ if ((((cfg_ctl & ~CTL_MODE_M) | (p_ctl_mode[ctl_mode] & CTL_MODE_M)) == ctl_index[i]) || (((cfg_ctl & ~CTL_MODE_M) | (p_ctl_mode[ctl_mode] & CTL_MODE_M)) == ((ctl_index[i] & CTL_MODE_M) | SD_NO_CTL))) { twice_min_edge_power = ar9300_eep_def_get_max_edge_power( p_eep_data, freq, i, IEEE80211_IS_CHAN_2GHZ(chan)); HALDEBUG(ah, HAL_DEBUG_POWER_MGMT, " MATCH-EE_IDX %d: ch %d is2 %d " "2xMinEdge %d chainmask %d chains %d\n", i, freq, IEEE80211_IS_CHAN_2GHZ(chan), twice_min_edge_power, tx_chainmask, ar9300_get_ntxchains(tx_chainmask)); if ((cfg_ctl & ~CTL_MODE_M) == SD_NO_CTL) { /* * Find the minimum of all CTL edge powers * that apply to this channel */ twice_max_edge_power = AH_MIN(twice_max_edge_power, twice_min_edge_power); } else { /* specific */ twice_max_edge_power = twice_min_edge_power; break; } } } min_ctl_power = (u_int8_t)AH_MIN(twice_max_edge_power, scaled_power); HALDEBUG(ah, HAL_DEBUG_POWER_MGMT, " SEL-Min ctl_mode %d p_ctl_mode %d " "2xMaxEdge %d sP %d min_ctl_pwr %d\n", ctl_mode, p_ctl_mode[ctl_mode], twice_max_edge_power, scaled_power, min_ctl_power); /* Apply ctl mode to correct target power set */ switch (p_ctl_mode[ctl_mode]) { case CTL_11B: for (i = ALL_TARGET_LEGACY_1L_5L; i <= ALL_TARGET_LEGACY_11S; i++) { p_pwr_array[i] = (u_int8_t)AH_MIN(p_pwr_array[i], min_ctl_power); } break; case CTL_11A: case CTL_11G: for (i = ALL_TARGET_LEGACY_6_24; i <= ALL_TARGET_LEGACY_54; i++) { p_pwr_array[i] = (u_int8_t)AH_MIN(p_pwr_array[i], min_ctl_power); #ifdef ATH_BT_COEX if ((ahp->ah_bt_coex_config_type == HAL_BT_COEX_CFG_3WIRE) || (ahp->ah_bt_coex_config_type == HAL_BT_COEX_CFG_MCI)) { if ((ahp->ah_bt_coex_flag & HAL_BT_COEX_FLAG_LOWER_TX_PWR) && (ahp->ah_bt_wlan_isolation < HAL_BT_COEX_ISOLATION_FOR_NO_COEX)) { u_int8_t reduce_pow; reduce_pow = (HAL_BT_COEX_ISOLATION_FOR_NO_COEX - ahp->ah_bt_wlan_isolation) << 1; if (reduce_pow <= p_pwr_array[i]) { p_pwr_array[i] -= reduce_pow; } } if ((ahp->ah_bt_coex_flag & HAL_BT_COEX_FLAG_LOW_ACK_PWR) && (i != ALL_TARGET_LEGACY_36) && (i != ALL_TARGET_LEGACY_48) && (i != ALL_TARGET_LEGACY_54) && (p_ctl_mode[ctl_mode] == CTL_11G)) { p_pwr_array[i] = 0; } } #endif } break; case CTL_5GHT20: case CTL_2GHT20: for (i = ALL_TARGET_HT20_0_8_16; i <= ALL_TARGET_HT20_23; i++) { p_pwr_array[i] = (u_int8_t)AH_MIN(p_pwr_array[i], min_ctl_power); #ifdef ATH_BT_COEX if (((ahp->ah_bt_coex_config_type == HAL_BT_COEX_CFG_3WIRE) || (ahp->ah_bt_coex_config_type == HAL_BT_COEX_CFG_MCI)) && (ahp->ah_bt_coex_flag & HAL_BT_COEX_FLAG_LOWER_TX_PWR) && (ahp->ah_bt_wlan_isolation < HAL_BT_COEX_ISOLATION_FOR_NO_COEX)) { u_int8_t reduce_pow = (HAL_BT_COEX_ISOLATION_FOR_NO_COEX - ahp->ah_bt_wlan_isolation) << 1; if (reduce_pow <= p_pwr_array[i]) { p_pwr_array[i] -= reduce_pow; } } #if ATH_SUPPORT_MCI else if ((ahp->ah_bt_coex_flag & HAL_BT_COEX_FLAG_MCI_MAX_TX_PWR) && (p_ctl_mode[ctl_mode] == CTL_2GHT20) && (ahp->ah_bt_coex_config_type == HAL_BT_COEX_CFG_MCI)) { u_int8_t max_pwr; max_pwr = MS(mci_concur_tx_max_pwr[2][1], ATH_MCI_CONCUR_TX_LOWEST_PWR_MASK); if (p_pwr_array[i] > max_pwr) { p_pwr_array[i] = max_pwr; } } #endif #endif } break; case CTL_11B_EXT: #ifdef NOT_YET target_power_cck_ext.t_pow2x[0] = (u_int8_t) AH_MIN(target_power_cck_ext.t_pow2x[0], min_ctl_power); #endif /* NOT_YET */ break; case CTL_11A_EXT: case CTL_11G_EXT: #ifdef NOT_YET target_power_ofdm_ext.t_pow2x[0] = (u_int8_t) AH_MIN(target_power_ofdm_ext.t_pow2x[0], min_ctl_power); #endif /* NOT_YET */ break; case CTL_5GHT40: case CTL_2GHT40: for (i = ALL_TARGET_HT40_0_8_16; i <= ALL_TARGET_HT40_23; i++) { p_pwr_array[i] = (u_int8_t) AH_MIN(p_pwr_array[i], min_ctl_power); #ifdef ATH_BT_COEX if (((ahp->ah_bt_coex_config_type == HAL_BT_COEX_CFG_3WIRE) || (ahp->ah_bt_coex_config_type == HAL_BT_COEX_CFG_MCI)) && (ahp->ah_bt_coex_flag & HAL_BT_COEX_FLAG_LOWER_TX_PWR) && (ahp->ah_bt_wlan_isolation < HAL_BT_COEX_ISOLATION_FOR_NO_COEX)) { u_int8_t reduce_pow = (HAL_BT_COEX_ISOLATION_FOR_NO_COEX - ahp->ah_bt_wlan_isolation) << 1; if (reduce_pow <= p_pwr_array[i]) { p_pwr_array[i] -= reduce_pow; } } #if ATH_SUPPORT_MCI else if ((ahp->ah_bt_coex_flag & HAL_BT_COEX_FLAG_MCI_MAX_TX_PWR) && (p_ctl_mode[ctl_mode] == CTL_2GHT40) && (ahp->ah_bt_coex_config_type == HAL_BT_COEX_CFG_MCI)) { u_int8_t max_pwr; max_pwr = MS(mci_concur_tx_max_pwr[3][1], ATH_MCI_CONCUR_TX_LOWEST_PWR_MASK); if (p_pwr_array[i] > max_pwr) { p_pwr_array[i] = max_pwr; } } #endif #endif } break; default: HALASSERT(0); break; } } /* end ctl mode checking */ return AH_TRUE; #undef EXT_ADDITIVE #undef CTL_11A_EXT #undef CTL_11G_EXT #undef CTL_11B_EXT #undef REDUCE_SCALED_POWER_BY_TWO_CHAIN #undef REDUCE_SCALED_POWER_BY_THREE_CHAIN } /************************************************************** * ar9300_eeprom_set_transmit_power * * Set the transmit power in the baseband for the given * operating channel and mode. */ HAL_STATUS ar9300_eeprom_set_transmit_power(struct ath_hal *ah, ar9300_eeprom_t *p_eep_data, const struct ieee80211_channel *chan, u_int16_t cfg_ctl, u_int16_t antenna_reduction, u_int16_t twice_max_regulatory_power, u_int16_t power_limit) { #define ABS(_x, _y) ((int)_x > (int)_y ? (int)_x - (int)_y : (int)_y - (int)_x) #define INCREASE_MAXPOW_BY_TWO_CHAIN 6 /* 10*log10(2)*2 */ #define INCREASE_MAXPOW_BY_THREE_CHAIN 10 /* 10*log10(3)*2 */ u_int8_t target_power_val_t2[ar9300_rate_size]; u_int8_t target_power_val_t2_eep[ar9300_rate_size]; int16_t twice_array_gain = 0, max_power_level = 0; struct ath_hal_9300 *ahp = AH9300(ah); int i = 0; u_int32_t tmp_paprd_rate_mask = 0, *tmp_ptr = NULL; int paprd_scale_factor = 5; HAL_CHANNEL_INTERNAL *ichan = ath_hal_checkchannel(ah, chan); u_int8_t *ptr_mcs_rate2power_table_index; u_int8_t mcs_rate2power_table_index_ht20[24] = { ALL_TARGET_HT20_0_8_16, ALL_TARGET_HT20_1_3_9_11_17_19, ALL_TARGET_HT20_1_3_9_11_17_19, ALL_TARGET_HT20_1_3_9_11_17_19, ALL_TARGET_HT20_4, ALL_TARGET_HT20_5, ALL_TARGET_HT20_6, ALL_TARGET_HT20_7, ALL_TARGET_HT20_0_8_16, ALL_TARGET_HT20_1_3_9_11_17_19, ALL_TARGET_HT20_1_3_9_11_17_19, ALL_TARGET_HT20_1_3_9_11_17_19, ALL_TARGET_HT20_12, ALL_TARGET_HT20_13, ALL_TARGET_HT20_14, ALL_TARGET_HT20_15, ALL_TARGET_HT20_0_8_16, ALL_TARGET_HT20_1_3_9_11_17_19, ALL_TARGET_HT20_1_3_9_11_17_19, ALL_TARGET_HT20_1_3_9_11_17_19, ALL_TARGET_HT20_20, ALL_TARGET_HT20_21, ALL_TARGET_HT20_22, ALL_TARGET_HT20_23 }; u_int8_t mcs_rate2power_table_index_ht40[24] = { ALL_TARGET_HT40_0_8_16, ALL_TARGET_HT40_1_3_9_11_17_19, ALL_TARGET_HT40_1_3_9_11_17_19, ALL_TARGET_HT40_1_3_9_11_17_19, ALL_TARGET_HT40_4, ALL_TARGET_HT40_5, ALL_TARGET_HT40_6, ALL_TARGET_HT40_7, ALL_TARGET_HT40_0_8_16, ALL_TARGET_HT40_1_3_9_11_17_19, ALL_TARGET_HT40_1_3_9_11_17_19, ALL_TARGET_HT40_1_3_9_11_17_19, ALL_TARGET_HT40_12, ALL_TARGET_HT40_13, ALL_TARGET_HT40_14, ALL_TARGET_HT40_15, ALL_TARGET_HT40_0_8_16, ALL_TARGET_HT40_1_3_9_11_17_19, ALL_TARGET_HT40_1_3_9_11_17_19, ALL_TARGET_HT40_1_3_9_11_17_19, ALL_TARGET_HT40_20, ALL_TARGET_HT40_21, ALL_TARGET_HT40_22, ALL_TARGET_HT40_23, }; HALDEBUG(ah, HAL_DEBUG_CALIBRATE, "%s[%d] +++chan %d,cfgctl 0x%04x " "antenna_reduction 0x%04x, twice_max_regulatory_power 0x%04x " "power_limit 0x%04x\n", __func__, __LINE__, ichan->channel, cfg_ctl, antenna_reduction, twice_max_regulatory_power, power_limit); ar9300_set_target_power_from_eeprom(ah, ichan->channel, target_power_val_t2); if (ar9300_eeprom_get(ahp, EEP_PAPRD_ENABLED)) { if (IEEE80211_IS_CHAN_2GHZ(chan)) { if (IEEE80211_IS_CHAN_HT40(chan)) { tmp_paprd_rate_mask = p_eep_data->modal_header_2g.paprd_rate_mask_ht40; tmp_ptr = &AH9300(ah)->ah_2g_paprd_rate_mask_ht40; } else { tmp_paprd_rate_mask = p_eep_data->modal_header_2g.paprd_rate_mask_ht20; tmp_ptr = &AH9300(ah)->ah_2g_paprd_rate_mask_ht20; } } else { if (IEEE80211_IS_CHAN_HT40(chan)) { tmp_paprd_rate_mask = p_eep_data->modal_header_5g.paprd_rate_mask_ht40; tmp_ptr = &AH9300(ah)->ah_5g_paprd_rate_mask_ht40; } else { tmp_paprd_rate_mask = p_eep_data->modal_header_5g.paprd_rate_mask_ht20; tmp_ptr = &AH9300(ah)->ah_5g_paprd_rate_mask_ht20; } } AH_PAPRD_GET_SCALE_FACTOR( paprd_scale_factor, p_eep_data, IEEE80211_IS_CHAN_2GHZ(chan), ichan->channel); HALDEBUG(ah, HAL_DEBUG_CALIBRATE, "%s[%d] paprd_scale_factor %d\n", __func__, __LINE__, paprd_scale_factor); /* PAPRD is not done yet, Scale down the EEP power */ if (IEEE80211_IS_CHAN_HT40(chan)) { ptr_mcs_rate2power_table_index = &mcs_rate2power_table_index_ht40[0]; } else { ptr_mcs_rate2power_table_index = &mcs_rate2power_table_index_ht20[0]; } if (! ichan->paprd_table_write_done) { for (i = 0; i < 24; i++) { /* PAPRD is done yet, so Scale down Power for PAPRD Rates*/ if (tmp_paprd_rate_mask & (1 << i)) { target_power_val_t2[ptr_mcs_rate2power_table_index[i]] -= paprd_scale_factor; HALDEBUG(ah, HAL_DEBUG_CALIBRATE, "%s[%d]: Chan %d " "Scale down target_power_val_t2[%d] = 0x%04x\n", __func__, __LINE__, ichan->channel, i, target_power_val_t2[i]); } } } else { HALDEBUG(ah, HAL_DEBUG_CALIBRATE, "%s[%d]: PAPRD Done No TGT PWR Scaling\n", __func__, __LINE__); } } /* Save the Target power for future use */ OS_MEMCPY(target_power_val_t2_eep, target_power_val_t2, sizeof(target_power_val_t2)); ar9300_eeprom_set_power_per_rate_table(ah, p_eep_data, chan, target_power_val_t2, cfg_ctl, antenna_reduction, twice_max_regulatory_power, power_limit, 0); /* Save this for quick lookup */ ahp->reg_dmn = ath_hal_getctl(ah, chan); /* * Always use CDD/direct per rate power table for register based approach. * For FCC, CDD calculations should factor in the array gain, hence * this adjust call. ETSI and MKK does not have this requirement. */ if (is_reg_dmn_fcc(ahp->reg_dmn)) { HALDEBUG(ah, HAL_DEBUG_CALIBRATE, "%s: FCC regdomain, calling reg_txpower_cdd\n", __func__); ar9300_adjust_reg_txpower_cdd(ah, target_power_val_t2); } if (ar9300_eeprom_get(ahp, EEP_PAPRD_ENABLED)) { for (i = 0; i < ar9300_rate_size; i++) { /* * EEPROM TGT PWR is not same as current TGT PWR, * so Disable PAPRD for this rate. * Some of APs might ask to reduce Target Power, * if target power drops significantly, * disable PAPRD for that rate. */ if (tmp_paprd_rate_mask & (1 << i)) { if (ABS(target_power_val_t2_eep[i], target_power_val_t2[i]) > paprd_scale_factor) { tmp_paprd_rate_mask &= ~(1 << i); HALDEBUG(ah, HAL_DEBUG_CALIBRATE, "%s: EEP TPC[%02d] 0x%08x " "Curr TPC[%02d] 0x%08x mask = 0x%08x\n", __func__, i, target_power_val_t2_eep[i], i, target_power_val_t2[i], tmp_paprd_rate_mask); } } } HALDEBUG(ah, HAL_DEBUG_CALIBRATE, "%s: Chan %d After tmp_paprd_rate_mask = 0x%08x\n", __func__, ichan->channel, tmp_paprd_rate_mask); if (tmp_ptr) { *tmp_ptr = tmp_paprd_rate_mask; } } /* Write target power array to registers */ ar9300_transmit_power_reg_write(ah, target_power_val_t2); /* Write target power for self generated frames to the TPC register */ ar9300_selfgen_tpc_reg_write(ah, chan, target_power_val_t2); /* GreenTx or Paprd */ if (ah->ah_config.ath_hal_sta_update_tx_pwr_enable || AH_PRIVATE(ah)->ah_caps.halPaprdEnabled) { if (AR_SREV_POSEIDON(ah)) { /*For HAL_RSSI_TX_POWER_NONE array*/ OS_MEMCPY(ahp->ah_default_tx_power, target_power_val_t2, sizeof(target_power_val_t2)); /* Get defautl tx related register setting for GreenTx */ /* Record OB/DB */ ahp->ah_ob_db1[POSEIDON_STORED_REG_OBDB] = OS_REG_READ(ah, AR_PHY_65NM_CH0_TXRF2); /* Record TPC settting */ ahp->ah_ob_db1[POSEIDON_STORED_REG_TPC] = OS_REG_READ(ah, AR_TPC); /* Record BB_powertx_rate9 setting */ ahp->ah_ob_db1[POSEIDON_STORED_REG_BB_PWRTX_RATE9] = OS_REG_READ(ah, AR_PHY_BB_POWERTX_RATE9); } } /* * Return tx power used to iwconfig. * Since power is rate dependent, use one of the indices from the * AR9300_Rates enum to select an entry from target_power_val_t2[] * to report. * Currently returns the power for HT40 MCS 0, HT20 MCS 0, or OFDM 6 Mbps * as CCK power is less interesting (?). */ i = ALL_TARGET_LEGACY_6_24; /* legacy */ if (IEEE80211_IS_CHAN_HT40(chan)) { i = ALL_TARGET_HT40_0_8_16; /* ht40 */ } else if (IEEE80211_IS_CHAN_HT20(chan)) { i = ALL_TARGET_HT20_0_8_16; /* ht20 */ } max_power_level = target_power_val_t2[i]; /* Adjusting the ah_max_power_level based on chains and antennaGain*/ switch (ar9300_get_ntxchains(((ahp->ah_tx_chainmaskopt > 0) ? ahp->ah_tx_chainmaskopt : ahp->ah_tx_chainmask))) { case 1: break; case 2: twice_array_gain = (ahp->twice_antenna_gain >= ahp->twice_antenna_reduction)? 0: ((int16_t)AH_MIN((ahp->twice_antenna_reduction - (ahp->twice_antenna_gain + INCREASE_MAXPOW_BY_TWO_CHAIN)), 0)); /* Adjusting maxpower with antennaGain */ max_power_level -= twice_array_gain; /* Adjusting maxpower based on chain */ max_power_level += INCREASE_MAXPOW_BY_TWO_CHAIN; break; case 3: twice_array_gain = (ahp->twice_antenna_gain >= ahp->twice_antenna_reduction)? 0: ((int16_t)AH_MIN((ahp->twice_antenna_reduction - (ahp->twice_antenna_gain + INCREASE_MAXPOW_BY_THREE_CHAIN)), 0)); /* Adjusting maxpower with antennaGain */ max_power_level -= twice_array_gain; /* Adjusting maxpower based on chain */ max_power_level += INCREASE_MAXPOW_BY_THREE_CHAIN; break; default: HALASSERT(0); /* Unsupported number of chains */ } AH_PRIVATE(ah)->ah_maxPowerLevel = (int8_t)max_power_level; ar9300_calibration_apply(ah, ichan->channel); #undef ABS /* Handle per packet TPC initializations */ if (ah->ah_config.ath_hal_desc_tpc) { /* Transmit Power per-rate per-chain are computed here. A separate * power table is maintained for different MIMO modes (i.e. TXBF ON, * STBC) to enable easy lookup during packet transmit. * The reason for maintaing each of these tables per chain is that * the transmit power used for different number of chains is different * depending on whether the power has been limited by the target power, * the regulatory domain or the CTL limits. */ u_int mode = ath_hal_get_curmode(ah, chan); u_int32_t val = 0; u_int8_t chainmasks[AR9300_MAX_CHAINS] = {OSPREY_1_CHAINMASK, OSPREY_2LOHI_CHAINMASK, OSPREY_3_CHAINMASK}; for (i = 0; i < AR9300_MAX_CHAINS; i++) { OS_MEMCPY(target_power_val_t2, target_power_val_t2_eep, sizeof(target_power_val_t2_eep)); ar9300_eeprom_set_power_per_rate_table(ah, p_eep_data, chan, target_power_val_t2, cfg_ctl, antenna_reduction, twice_max_regulatory_power, power_limit, chainmasks[i]); HALDEBUG(ah, HAL_DEBUG_POWER_MGMT, " Channel = %d Chainmask = %d, Upper Limit = [%2d.%1d dBm]\n", ichan->channel, i, ahp->upper_limit[i]/2, ahp->upper_limit[i]%2 * 5); ar9300_init_rate_txpower(ah, mode, chan, target_power_val_t2, chainmasks[i]); } /* Enable TPC */ OS_REG_WRITE(ah, AR_PHY_PWRTX_MAX, AR_PHY_PWRTX_MAX_TPC_ENABLE); /* * Disable per chain power reduction since we are already * accounting for this in our calculations */ val = OS_REG_READ(ah, AR_PHY_POWER_TX_SUB); if (AR_SREV_WASP(ah)) { OS_REG_WRITE(ah, AR_PHY_POWER_TX_SUB, val & AR_PHY_POWER_TX_SUB_2_DISABLE); } else { OS_REG_WRITE(ah, AR_PHY_POWER_TX_SUB, val & AR_PHY_POWER_TX_SUB_3_DISABLE); } } return HAL_OK; } /************************************************************** * ar9300_eeprom_set_addac * * Set the ADDAC from eeprom. */ void ar9300_eeprom_set_addac(struct ath_hal *ah, struct ieee80211_channel *chan) { HALDEBUG(AH_NULL, HAL_DEBUG_UNMASKABLE, "FIXME: ar9300_eeprom_def_set_addac called\n"); #if 0 MODAL_EEPDEF_HEADER *p_modal; struct ath_hal_9300 *ahp = AH9300(ah); ar9300_eeprom_t *eep = &ahp->ah_eeprom.def; u_int8_t biaslevel; if (AH_PRIVATE(ah)->ah_macVersion != AR_SREV_VERSION_SOWL) { return; } HALASSERT(owl_get_eepdef_ver(ahp) == AR9300_EEP_VER); /* Xpa bias levels in eeprom are valid from rev 14.7 */ if (owl_get_eepdef_rev(ahp) < AR9300_EEP_MINOR_VER_7) { return; } if (ahp->ah_emu_eeprom) { return; } p_modal = &(eep->modal_header[IEEE80211_IS_CHAN_2GHZ(chan)]); if (p_modal->xpa_bias_lvl != 0xff) { biaslevel = p_modal->xpa_bias_lvl; } else { /* Use freqeuncy specific xpa bias level */ u_int16_t reset_freq_bin, freq_bin, freq_count = 0; CHAN_CENTERS centers; ar9300_get_channel_centers(ah, chan, ¢ers); reset_freq_bin = FREQ2FBIN(centers.synth_center, IEEE80211_IS_CHAN_2GHZ(chan)); freq_bin = p_modal->xpa_bias_lvl_freq[0] & 0xff; biaslevel = (u_int8_t)(p_modal->xpa_bias_lvl_freq[0] >> 14); freq_count++; while (freq_count < 3) { if (p_modal->xpa_bias_lvl_freq[freq_count] == 0x0) { break; } freq_bin = p_modal->xpa_bias_lvl_freq[freq_count] & 0xff; if (reset_freq_bin >= freq_bin) { biaslevel = (u_int8_t)(p_modal->xpa_bias_lvl_freq[freq_count] >> 14); } else { break; } freq_count++; } } /* Apply bias level to the ADDAC values in the INI array */ if (IEEE80211_IS_CHAN_2GHZ(chan)) { INI_RA(&ahp->ah_ini_addac, 7, 1) = (INI_RA(&ahp->ah_ini_addac, 7, 1) & (~0x18)) | biaslevel << 3; } else { INI_RA(&ahp->ah_ini_addac, 6, 1) = (INI_RA(&ahp->ah_ini_addac, 6, 1) & (~0xc0)) | biaslevel << 6; } #endif } u_int ar9300_eeprom_dump_support(struct ath_hal *ah, void **pp_e) { *pp_e = &(AH9300(ah)->ah_eeprom); return sizeof(ar9300_eeprom_t); } u_int8_t ar9300_eeprom_get_num_ant_config(struct ath_hal_9300 *ahp, HAL_FREQ_BAND freq_band) { #if 0 ar9300_eeprom_t *eep = &ahp->ah_eeprom.def; MODAL_EEPDEF_HEADER *p_modal = &(eep->modal_header[HAL_FREQ_BAND_2GHZ == freq_band]); BASE_EEPDEF_HEADER *p_base = &eep->base_eep_header; u_int8_t num_ant_config; num_ant_config = 1; /* default antenna configuration */ if (p_base->version >= 0x0E0D) { if (p_modal->use_ant1) { num_ant_config += 1; } } return num_ant_config; #else return 1; #endif } HAL_STATUS ar9300_eeprom_get_ant_cfg(struct ath_hal_9300 *ahp, const struct ieee80211_channel *chan, u_int8_t index, u_int16_t *config) { #if 0 ar9300_eeprom_t *eep = &ahp->ah_eeprom.def; MODAL_EEPDEF_HEADER *p_modal = &(eep->modal_header[IEEE80211_IS_CHAN_2GHZ(chan)]); BASE_EEPDEF_HEADER *p_base = &eep->base_eep_header; switch (index) { case 0: *config = p_modal->ant_ctrl_common & 0xFFFF; return HAL_OK; case 1: if (p_base->version >= 0x0E0D) { if (p_modal->use_ant1) { *config = ((p_modal->ant_ctrl_common & 0xFFFF0000) >> 16); return HAL_OK; } } break; default: break; } #endif return HAL_EINVAL; } u_int8_t* ar9300_eeprom_get_cust_data(struct ath_hal_9300 *ahp) { return (u_int8_t *)ahp; } #ifdef UNUSED static inline HAL_STATUS ar9300_check_eeprom(struct ath_hal *ah) { #if 0 u_int32_t sum = 0, el; u_int16_t *eepdata; int i; struct ath_hal_9300 *ahp = AH9300(ah); HAL_BOOL need_swap = AH_FALSE; ar9300_eeprom_t *eep = (ar9300_eeprom_t *)&ahp->ah_eeprom.def; u_int16_t magic, magic2; int addr; u_int16_t temp; /* ** We need to check the EEPROM data regardless of if it's in flash or ** in EEPROM. */ if (!ahp->ah_priv.priv.ah_eeprom_read( ah, AR9300_EEPROM_MAGIC_OFFSET, &magic)) { HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: Reading Magic # failed\n", __func__); return AH_FALSE; } HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: Read Magic = 0x%04X\n", __func__, magic); if (!ar9300_eep_data_in_flash(ah)) { if (magic != AR9300_EEPROM_MAGIC) { magic2 = SWAP16(magic); if (magic2 == AR9300_EEPROM_MAGIC) { need_swap = AH_TRUE; eepdata = (u_int16_t *)(&ahp->ah_eeprom); for (addr = 0; addr < sizeof(ar9300_eeprom_t) / sizeof(u_int16_t); addr++) { temp = SWAP16(*eepdata); *eepdata = temp; eepdata++; HALDEBUG(ah, HAL_DEBUG_EEPROM_DUMP, "0x%04X ", *eepdata); if (((addr + 1) % 6) == 0) { HALDEBUG(ah, HAL_DEBUG_EEPROM_DUMP, "\n"); } } } else { HALDEBUG(ah, HAL_DEBUG_EEPROM, "Invalid EEPROM Magic. endianness missmatch.\n"); return HAL_EEBADSUM; } } } else { HALDEBUG(ah, HAL_DEBUG_EEPROM, "EEPROM being read from flash @0x%p\n", AH_PRIVATE(ah)->ah_st); } HALDEBUG(ah, HAL_DEBUG_EEPROM, "need_swap = %s.\n", need_swap?"True":"False"); if (need_swap) { el = SWAP16(ahp->ah_eeprom.def.base_eep_header.length); } else { el = ahp->ah_eeprom.def.base_eep_header.length; } eepdata = (u_int16_t *)(&ahp->ah_eeprom.def); for (i = 0; i < AH_MIN(el, sizeof(ar9300_eeprom_t)) / sizeof(u_int16_t); i++) { sum ^= *eepdata++; } if (need_swap) { /* * preddy: EEPROM endianness does not match. So change it * 8bit values in eeprom data structure does not need to be swapped * Only >8bits (16 & 32) values need to be swapped * If a new 16 or 32 bit field is added to the EEPROM contents, * please make sure to swap the field here */ u_int32_t integer, j; u_int16_t word; HALDEBUG(ah, HAL_DEBUG_EEPROM, "EEPROM Endianness is not native.. Changing \n"); /* convert Base Eep header */ word = SWAP16(eep->base_eep_header.length); eep->base_eep_header.length = word; word = SWAP16(eep->base_eep_header.checksum); eep->base_eep_header.checksum = word; word = SWAP16(eep->base_eep_header.version); eep->base_eep_header.version = word; word = SWAP16(eep->base_eep_header.reg_dmn[0]); eep->base_eep_header.reg_dmn[0] = word; word = SWAP16(eep->base_eep_header.reg_dmn[1]); eep->base_eep_header.reg_dmn[1] = word; word = SWAP16(eep->base_eep_header.rf_silent); eep->base_eep_header.rf_silent = word; word = SWAP16(eep->base_eep_header.blue_tooth_options); eep->base_eep_header.blue_tooth_options = word; word = SWAP16(eep->base_eep_header.device_cap); eep->base_eep_header.device_cap = word; /* convert Modal Eep header */ for (j = 0; j < ARRAY_LENGTH(eep->modal_header); j++) { MODAL_EEPDEF_HEADER *p_modal = &eep->modal_header[j]; integer = SWAP32(p_modal->ant_ctrl_common); p_modal->ant_ctrl_common = integer; for (i = 0; i < AR9300_MAX_CHAINS; i++) { integer = SWAP32(p_modal->ant_ctrl_chain[i]); p_modal->ant_ctrl_chain[i] = integer; } for (i = 0; i < AR9300_EEPROM_MODAL_SPURS; i++) { word = SWAP16(p_modal->spur_chans[i].spur_chan); p_modal->spur_chans[i].spur_chan = word; } } } /* Check CRC - Attach should fail on a bad checksum */ if (sum != 0xffff || owl_get_eepdef_ver(ahp) != AR9300_EEP_VER || owl_get_eepdef_rev(ahp) < AR9300_EEP_NO_BACK_VER) { HALDEBUG(ah, HAL_DEBUG_EEPROM, "Bad EEPROM checksum 0x%x or revision 0x%04x\n", sum, owl_get_eepdef_ver(ahp)); return HAL_EEBADSUM; } #ifdef EEPROM_DUMP ar9300_eeprom_def_dump(ah, eep); #endif #if 0 #ifdef AH_AR9300_OVRD_TGT_PWR /* * 14.4 EEPROM contains low target powers. * Hardcode until EEPROM > 14.4 */ if (owl_get_eepdef_ver(ahp) == 14 && owl_get_eepdef_rev(ahp) <= 4) { MODAL_EEPDEF_HEADER *p_modal; #ifdef EEPROM_DUMP HALDEBUG(ah, HAL_DEBUG_POWER_OVERRIDE, "Original Target Powers\n"); ar9300_eep_def_dump_tgt_power(ah, eep); #endif HALDEBUG(ah, HAL_DEBUG_POWER_OVERRIDE, "Override Target Powers. EEPROM Version is %d.%d, " "Device Type %d\n", owl_get_eepdef_ver(ahp), owl_get_eepdef_rev(ahp), eep->base_eep_header.device_type); ar9300_eep_def_override_tgt_power(ah, eep); if (eep->base_eep_header.device_type == 5) { /* for xb72 only: improve transmit EVM for interop */ p_modal = &eep->modal_header[1]; p_modal->tx_frame_to_data_start = 0x23; p_modal->tx_frame_to_xpa_on = 0x23; p_modal->tx_frame_to_pa_on = 0x23; } #ifdef EEPROM_DUMP HALDEBUG(ah, HAL_DEBUG_POWER_OVERRIDE, "Modified Target Powers\n"); ar9300_eep_def_dump_tgt_power(ah, eep); #endif } #endif /* AH_AR9300_OVRD_TGT_PWR */ #endif #endif return HAL_OK; } #endif static u_int16_t ar9300_eeprom_get_spur_chan(struct ath_hal *ah, int i, HAL_BOOL is_2ghz) { u_int16_t spur_val = AR_NO_SPUR; #if 0 struct ath_hal_9300 *ahp = AH9300(ah); ar9300_eeprom_t *eep = (ar9300_eeprom_t *)&ahp->ah_eeprom; HALASSERT(i < AR_EEPROM_MODAL_SPURS ); HALDEBUG(ah, HAL_DEBUG_ANI, "Getting spur idx %d is2Ghz. %d val %x\n", i, is_2ghz, AH_PRIVATE(ah)->ah_config.ath_hal_spur_chans[i][is_2ghz]); switch (AH_PRIVATE(ah)->ah_config.ath_hal_spur_mode) { case SPUR_DISABLE: /* returns AR_NO_SPUR */ break; case SPUR_ENABLE_IOCTL: spur_val = AH_PRIVATE(ah)->ah_config.ath_hal_spur_chans[i][is_2ghz]; HALDEBUG(ah, HAL_DEBUG_ANI, "Getting spur val from new loc. %d\n", spur_val); break; case SPUR_ENABLE_EEPROM: spur_val = eep->modal_header[is_2ghz].spur_chans[i].spur_chan; break; } #endif return spur_val; } #ifdef UNUSED static inline HAL_BOOL ar9300_fill_eeprom(struct ath_hal *ah) { return ar9300_eeprom_restore(ah); } #endif u_int16_t ar9300_eeprom_struct_size(void) { return sizeof(ar9300_eeprom_t); } int ar9300_eeprom_struct_default_many(void) { return ARRAY_LENGTH(default9300); } ar9300_eeprom_t * ar9300_eeprom_struct_default(int default_index) { if (default_index >= 0 && default_index < ARRAY_LENGTH(default9300)) { return default9300[default_index]; } else { return 0; } } ar9300_eeprom_t * ar9300_eeprom_struct_default_find_by_id(int id) { int it; for (it = 0; it < ARRAY_LENGTH(default9300); it++) { if (default9300[it] != 0 && default9300[it]->template_version == id) { return default9300[it]; } } return 0; } HAL_BOOL ar9300_calibration_data_read_flash(struct ath_hal *ah, long address, u_int8_t *buffer, int many) { if (((address) < 0) || ((address + many) > AR9300_EEPROM_SIZE - 1)) { return AH_FALSE; } return AH_FALSE; } HAL_BOOL ar9300_calibration_data_read_eeprom(struct ath_hal *ah, long address, u_int8_t *buffer, int many) { int i; u_int8_t value[2]; unsigned long eep_addr; unsigned long byte_addr; u_int16_t *svalue; if (((address) < 0) || ((address + many) > AR9300_EEPROM_SIZE)) { return AH_FALSE; } for (i = 0; i < many; i++) { eep_addr = (u_int16_t) (address + i) / 2; byte_addr = (u_int16_t) (address + i) % 2; svalue = (u_int16_t *) value; if (! ath_hal_eepromRead(ah, eep_addr, svalue)) { HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: Unable to read eeprom region \n", __func__); return AH_FALSE; } buffer[i] = (*svalue >> (8 * byte_addr)) & 0xff; } return AH_TRUE; } HAL_BOOL ar9300_calibration_data_read_otp(struct ath_hal *ah, long address, u_int8_t *buffer, int many, HAL_BOOL is_wifi) { int i; unsigned long eep_addr; unsigned long byte_addr; u_int32_t svalue; if (((address) < 0) || ((address + many) > 0x400)) { return AH_FALSE; } for (i = 0; i < many; i++) { eep_addr = (u_int16_t) (address + i) / 4; /* otp is 4 bytes long???? */ byte_addr = (u_int16_t) (address + i) % 4; if (!ar9300_otp_read(ah, eep_addr, &svalue, is_wifi)) { HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: Unable to read otp region \n", __func__); return AH_FALSE; } buffer[i] = (svalue >> (8 * byte_addr)) & 0xff; } return AH_TRUE; } #ifdef ATH_CAL_NAND_FLASH HAL_BOOL ar9300_calibration_data_read_nand(struct ath_hal *ah, long address, u_int8_t *buffer, int many) { int ret_len; int ret_val = 1; /* Calling OS based API to read NAND */ ret_val = OS_NAND_FLASH_READ(ATH_CAL_NAND_PARTITION, address, many, &ret_len, buffer); return (ret_val ? AH_FALSE: AH_TRUE); } #endif HAL_BOOL ar9300_calibration_data_read(struct ath_hal *ah, long address, u_int8_t *buffer, int many) { switch (AH9300(ah)->calibration_data_source) { case calibration_data_flash: return ar9300_calibration_data_read_flash(ah, address, buffer, many); case calibration_data_eeprom: return ar9300_calibration_data_read_eeprom(ah, address, buffer, many); case calibration_data_otp: return ar9300_calibration_data_read_otp(ah, address, buffer, many, 1); #ifdef ATH_CAL_NAND_FLASH case calibration_data_nand: return ar9300_calibration_data_read_nand(ah,address,buffer,many); #endif } return AH_FALSE; } HAL_BOOL ar9300_calibration_data_read_array(struct ath_hal *ah, int address, u_int8_t *buffer, int many) { int it; for (it = 0; it < many; it++) { (void)ar9300_calibration_data_read(ah, address - it, buffer + it, 1); } return AH_TRUE; } /* * the address where the first configuration block is written */ static const int base_address = 0x3ff; /* 1KB */ static const int base_address_512 = 0x1ff; /* 512Bytes */ /* * the address where the NAND first configuration block is written */ #ifdef ATH_CAL_NAND_FLASH static const int base_address_nand = AR9300_FLASH_CAL_START_OFFSET; #endif /* * the lower limit on configuration data */ static const int low_limit = 0x040; /* * returns size of the physical eeprom in bytes. * 1024 and 2048 are normal sizes. * 0 means there is no eeprom. */ int32_t ar9300_eeprom_size(struct ath_hal *ah) { u_int16_t data; /* * first we'll try for 4096 bytes eeprom */ if (ar9300_eeprom_read_word(ah, 2047, &data)) { if (data != 0) { return 4096; } } /* * then we'll try for 2048 bytes eeprom */ if (ar9300_eeprom_read_word(ah, 1023, &data)) { if (data != 0) { return 2048; } } /* * then we'll try for 1024 bytes eeprom */ if (ar9300_eeprom_read_word(ah, 511, &data)) { if (data != 0) { return 1024; } } return 0; } /* * returns size of the physical otp in bytes. * 1024 and 2048 are normal sizes. * 0 means there is no eeprom. */ int32_t ar9300_otp_size(struct ath_hal *ah) { if (AR_SREV_POSEIDON(ah) || AR_SREV_HORNET(ah)) { return base_address_512+1; } else { return base_address+1; } } /* * find top of memory */ int ar9300_eeprom_base_address(struct ath_hal *ah) { int size; if (AH9300(ah)->calibration_data_source == calibration_data_otp) { return ar9300_otp_size(ah)-1; } else { size = ar9300_eeprom_size(ah); if (size > 0) { return size - 1; } else { return ar9300_otp_size(ah)-1; } } } int ar9300_eeprom_volatile(struct ath_hal *ah) { if (AH9300(ah)->calibration_data_source == calibration_data_otp) { return 0; /* no eeprom, use otp */ } else { return 1; /* board has eeprom or flash */ } } /* * need to change this to look for the pcie data in the low parts of memory * cal data needs to stop a few locations above */ int ar9300_eeprom_low_limit(struct ath_hal *ah) { return low_limit; } u_int16_t ar9300_compression_checksum(u_int8_t *data, int dsize) { int it; int checksum = 0; for (it = 0; it < dsize; it++) { checksum += data[it]; checksum &= 0xffff; } return checksum; } int ar9300_compression_header_unpack(u_int8_t *best, int *code, int *reference, int *length, int *major, int *minor) { unsigned long value[4]; value[0] = best[0]; value[1] = best[1]; value[2] = best[2]; value[3] = best[3]; *code = ((value[0] >> 5) & 0x0007); *reference = (value[0] & 0x001f) | ((value[1] >> 2) & 0x0020); *length = ((value[1] << 4) & 0x07f0) | ((value[2] >> 4) & 0x000f); *major = (value[2] & 0x000f); *minor = (value[3] & 0x00ff); return 4; } static HAL_BOOL ar9300_uncompress_block(struct ath_hal *ah, u_int8_t *mptr, int mdata_size, u_int8_t *block, int size) { int it; int spot; int offset; int length; spot = 0; for (it = 0; it < size; it += (length + 2)) { offset = block[it]; offset &= 0xff; spot += offset; length = block[it + 1]; length &= 0xff; if (length > 0 && spot >= 0 && spot + length <= mdata_size) { HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: Restore at %d: spot=%d offset=%d length=%d\n", __func__, it, spot, offset, length); OS_MEMCPY(&mptr[spot], &block[it + 2], length); spot += length; } else if (length > 0) { HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: Bad restore at %d: spot=%d offset=%d length=%d\n", __func__, it, spot, offset, length); return AH_FALSE; } } return AH_TRUE; } static int ar9300_eeprom_restore_internal_address(struct ath_hal *ah, ar9300_eeprom_t *mptr, int mdata_size, int cptr, u_int8_t blank) { u_int8_t word[MOUTPUT]; ar9300_eeprom_t *dptr; /* was uint8 */ int code; int reference, length, major, minor; int osize; int it; int restored; u_int16_t checksum, mchecksum; restored = 0; for (it = 0; it < MSTATE; it++) { (void) ar9300_calibration_data_read_array( ah, cptr, word, compression_header_length); if (word[0] == blank && word[1] == blank && word[2] == blank && word[3] == blank) { break; } ar9300_compression_header_unpack( word, &code, &reference, &length, &major, &minor); HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: Found block at %x: " "code=%d ref=%d length=%d major=%d minor=%d\n", __func__, cptr, code, reference, length, major, minor); #ifdef DONTUSE if (length >= 1024) { HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: Skipping bad header\n", __func__); cptr -= compression_header_length; continue; } #endif osize = length; (void) ar9300_calibration_data_read_array( ah, cptr, word, compression_header_length + osize + compression_checksum_length); checksum = ar9300_compression_checksum( &word[compression_header_length], length); mchecksum = word[compression_header_length + osize] | (word[compression_header_length + osize + 1] << 8); HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: checksum %x %x\n", __func__, checksum, mchecksum); if (checksum == mchecksum) { switch (code) { case _compress_none: if (length != mdata_size) { HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: EEPROM structure size mismatch " "memory=%d eeprom=%d\n", __func__, mdata_size, length); return -1; } OS_MEMCPY((u_int8_t *)mptr, (u_int8_t *)(word + compression_header_length), length); HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: restored eeprom %d: uncompressed, length %d\n", __func__, it, length); restored = 1; break; #ifdef UNUSED case _compress_lzma: if (reference == reference_current) { dptr = mptr; } else { dptr = (u_int8_t *)ar9300_eeprom_struct_default_find_by_id( reference); if (dptr == 0) { HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: Can't find reference eeprom struct %d\n", __func__, reference); goto done; } } usize = -1; if (usize != mdata_size) { HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: uncompressed data is wrong size %d %d\n", __func__, usize, mdata_size); goto done; } for (ib = 0; ib < mdata_size; ib++) { mptr[ib] = dptr[ib] ^ word[ib + overhead]; } HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: restored eeprom %d: compressed, " "reference %d, length %d\n", __func__, it, reference, length); break; case _compress_pairs: if (reference == reference_current) { dptr = mptr; } else { dptr = (u_int8_t *)ar9300_eeprom_struct_default_find_by_id( reference); if (dptr == 0) { HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: Can't find the reference " "eeprom structure %d\n", __func__, reference); goto done; } } HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: restored eeprom %d: " "pairs, reference %d, length %d,\n", __func__, it, reference, length); break; #endif case _compress_block: if (reference == reference_current) { dptr = mptr; } else { dptr = ar9300_eeprom_struct_default_find_by_id(reference); if (dptr == 0) { HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: cant find reference eeprom struct %d\n", __func__, reference); break; } OS_MEMCPY(mptr, dptr, mdata_size); } HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: restore eeprom %d: block, reference %d, length %d\n", __func__, it, reference, length); (void) ar9300_uncompress_block(ah, (u_int8_t *) mptr, mdata_size, (u_int8_t *) (word + compression_header_length), length); restored = 1; break; default: HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: unknown compression code %d\n", __func__, code); break; } } else { HALDEBUG(ah, HAL_DEBUG_EEPROM, "%s: skipping block with bad checksum\n", __func__); } cptr -= compression_header_length + osize + compression_checksum_length; } if (!restored) { cptr = -1; } return cptr; } static int ar9300_eeprom_restore_from_dram(struct ath_hal *ah, ar9300_eeprom_t *mptr, int mdata_size) { struct ath_hal_9300 *ahp = AH9300(ah); #if !defined(USE_PLATFORM_FRAMEWORK) char *cal_ptr; #endif HALASSERT(mdata_size > 0); /* if cal_in_flash is AH_TRUE, the address sent by LMAC to HAL (i.e. ah->ah_st) is corresponding to Flash. so return from here if ar9300_eep_data_in_flash(ah) returns AH_TRUE */ if(ar9300_eep_data_in_flash(ah)) return -1; #if 0 /* check if LMAC sent DRAM address is valid */ if (!(uintptr_t)(AH_PRIVATE(ah)->ah_st)) { return -1; } #endif /* When calibration data is from host, Host will copy the compressed data to the predefined DRAM location saved at ah->ah_st */ #if 0 ath_hal_printf(ah, "Restoring Cal data from DRAM\n"); ahp->ah_cal_mem = OS_REMAP((uintptr_t)(AH_PRIVATE(ah)->ah_st), HOST_CALDATA_SIZE); #endif if (!ahp->ah_cal_mem) { HALDEBUG(ah, HAL_DEBUG_EEPROM,"%s: can't remap dram region\n", __func__); return -1; } #if !defined(USE_PLATFORM_FRAMEWORK) cal_ptr = &((char *)(ahp->ah_cal_mem))[AR9300_FLASH_CAL_START_OFFSET]; OS_MEMCPY(mptr, cal_ptr, mdata_size); #else OS_MEMCPY(mptr, ahp->ah_cal_mem, mdata_size); #endif if (mptr->eeprom_version == 0xff || mptr->template_version == 0xff || mptr->eeprom_version == 0 || mptr->template_version == 0) { /* The board is uncalibrated */ return -1; } if (mptr->eeprom_version != 0x2) { return -1; } return mdata_size; } static int ar9300_eeprom_restore_from_flash(struct ath_hal *ah, ar9300_eeprom_t *mptr, int mdata_size) { struct ath_hal_9300 *ahp = AH9300(ah); char *cal_ptr; HALASSERT(mdata_size > 0); if (!ahp->ah_cal_mem) { return -1; } ath_hal_printf(ah, "Restoring Cal data from Flash\n"); /* * When calibration data is saved in flash, read * uncompressed eeprom structure from flash and return */ cal_ptr = &((char *)(ahp->ah_cal_mem))[AR9300_FLASH_CAL_START_OFFSET]; OS_MEMCPY(mptr, cal_ptr, mdata_size); #if 0 ar9300_swap_eeprom((ar9300_eeprom_t *)mptr); DONE IN ar9300_restore() #endif if (mptr->eeprom_version == 0xff || mptr->template_version == 0xff || mptr->eeprom_version == 0 || mptr->template_version == 0) { /* The board is uncalibrated */ return -1; } if (mptr->eeprom_version != 0x2) { return -1; } return mdata_size; } /* * Read the configuration data from the storage. We try the order with: * EEPROM, Flash, OTP. If all of above failed, use the default template. * The data can be put in any specified memory buffer. * * Returns -1 on error. * Returns address of next memory location on success. */ int ar9300_eeprom_restore_internal(struct ath_hal *ah, ar9300_eeprom_t *mptr, int mdata_size) { int nptr; nptr = -1; if ((AH9300(ah)->calibration_data_try == calibration_data_none || AH9300(ah)->calibration_data_try == calibration_data_dram) && AH9300(ah)->try_dram && nptr < 0) { ath_hal_printf(ah, "Restoring Cal data from DRAM\n"); AH9300(ah)->calibration_data_source = calibration_data_dram; AH9300(ah)->calibration_data_source_address = 0; nptr = ar9300_eeprom_restore_from_dram(ah, mptr, mdata_size); if (nptr < 0) { AH9300(ah)->calibration_data_source = calibration_data_none; AH9300(ah)->calibration_data_source_address = 0; } } if ((AH9300(ah)->calibration_data_try == calibration_data_none || AH9300(ah)->calibration_data_try == calibration_data_eeprom) && AH9300(ah)->try_eeprom && nptr < 0) { /* * need to look at highest eeprom address as well as at * base_address=0x3ff where we used to write the data */ ath_hal_printf(ah, "Restoring Cal data from EEPROM\n"); AH9300(ah)->calibration_data_source = calibration_data_eeprom; if (AH9300(ah)->calibration_data_try_address != 0) { AH9300(ah)->calibration_data_source_address = AH9300(ah)->calibration_data_try_address; nptr = ar9300_eeprom_restore_internal_address( ah, mptr, mdata_size, AH9300(ah)->calibration_data_source_address, 0xff); } else { AH9300(ah)->calibration_data_source_address = ar9300_eeprom_base_address(ah); nptr = ar9300_eeprom_restore_internal_address( ah, mptr, mdata_size, AH9300(ah)->calibration_data_source_address, 0xff); if (nptr < 0 && AH9300(ah)->calibration_data_source_address != base_address) { AH9300(ah)->calibration_data_source_address = base_address; nptr = ar9300_eeprom_restore_internal_address( ah, mptr, mdata_size, AH9300(ah)->calibration_data_source_address, 0xff); } } if (nptr < 0) { AH9300(ah)->calibration_data_source = calibration_data_none; AH9300(ah)->calibration_data_source_address = 0; } } /* * ##### should be an ifdef test for any AP usage, * either in driver or in nart */ if ((AH9300(ah)->calibration_data_try == calibration_data_none || AH9300(ah)->calibration_data_try == calibration_data_flash) && AH9300(ah)->try_flash && nptr < 0) { ath_hal_printf(ah, "Restoring Cal data from Flash\n"); AH9300(ah)->calibration_data_source = calibration_data_flash; /* how are we supposed to set this for flash? */ AH9300(ah)->calibration_data_source_address = 0; nptr = ar9300_eeprom_restore_from_flash(ah, mptr, mdata_size); if (nptr < 0) { AH9300(ah)->calibration_data_source = calibration_data_none; AH9300(ah)->calibration_data_source_address = 0; } } if ((AH9300(ah)->calibration_data_try == calibration_data_none || AH9300(ah)->calibration_data_try == calibration_data_otp) && AH9300(ah)->try_otp && nptr < 0) { ath_hal_printf(ah, "Restoring Cal data from OTP\n"); AH9300(ah)->calibration_data_source = calibration_data_otp; if (AH9300(ah)->calibration_data_try_address != 0) { AH9300(ah)->calibration_data_source_address = AH9300(ah)->calibration_data_try_address; } else { AH9300(ah)->calibration_data_source_address = ar9300_eeprom_base_address(ah); } nptr = ar9300_eeprom_restore_internal_address( ah, mptr, mdata_size, AH9300(ah)->calibration_data_source_address, 0); if (nptr < 0) { AH9300(ah)->calibration_data_source = calibration_data_none; AH9300(ah)->calibration_data_source_address = 0; } } #ifdef ATH_CAL_NAND_FLASH if ((AH9300(ah)->calibration_data_try == calibration_data_none || AH9300(ah)->calibration_data_try == calibration_data_nand) && AH9300(ah)->try_nand && nptr < 0) { AH9300(ah)->calibration_data_source = calibration_data_nand; AH9300(ah)->calibration_data_source_address = ((unsigned int)(AH_PRIVATE(ah)->ah_st)) + base_address_nand; if(ar9300_calibration_data_read( ah, AH9300(ah)->calibration_data_source_address, (u_int8_t *)mptr, mdata_size) == AH_TRUE) { nptr = mdata_size; } /*nptr=ar9300EepromRestoreInternalAddress(ah, mptr, mdataSize, CalibrationDataSourceAddress);*/ if(nptr < 0) { AH9300(ah)->calibration_data_source = calibration_data_none; AH9300(ah)->calibration_data_source_address = 0; } } #endif if (nptr < 0) { ath_hal_printf(ah, "%s[%d] No vaid CAL, calling default template\n", __func__, __LINE__); nptr = ar9300_eeprom_restore_something(ah, mptr, mdata_size); } return nptr; } /******************************************************************************/ /*! ** \brief Eeprom Swapping Function ** ** This function will swap the contents of the "longer" EEPROM data items ** to ensure they are consistent with the endian requirements for the platform ** they are being compiled for ** ** \param eh Pointer to the EEPROM data structure ** \return N/A */ #if AH_BYTE_ORDER == AH_BIG_ENDIAN void ar9300_swap_eeprom(ar9300_eeprom_t *eep) { u_int32_t dword; u_int16_t word; int i; word = __bswap16(eep->base_eep_header.reg_dmn[0]); eep->base_eep_header.reg_dmn[0] = word; word = __bswap16(eep->base_eep_header.reg_dmn[1]); eep->base_eep_header.reg_dmn[1] = word; dword = __bswap32(eep->base_eep_header.swreg); eep->base_eep_header.swreg = dword; dword = __bswap32(eep->modal_header_2g.ant_ctrl_common); eep->modal_header_2g.ant_ctrl_common = dword; dword = __bswap32(eep->modal_header_2g.ant_ctrl_common2); eep->modal_header_2g.ant_ctrl_common2 = dword; dword = __bswap32(eep->modal_header_2g.paprd_rate_mask_ht20); eep->modal_header_2g.paprd_rate_mask_ht20 = dword; dword = __bswap32(eep->modal_header_2g.paprd_rate_mask_ht40); eep->modal_header_2g.paprd_rate_mask_ht40 = dword; dword = __bswap32(eep->modal_header_5g.ant_ctrl_common); eep->modal_header_5g.ant_ctrl_common = dword; dword = __bswap32(eep->modal_header_5g.ant_ctrl_common2); eep->modal_header_5g.ant_ctrl_common2 = dword; dword = __bswap32(eep->modal_header_5g.paprd_rate_mask_ht20); eep->modal_header_5g.paprd_rate_mask_ht20 = dword; dword = __bswap32(eep->modal_header_5g.paprd_rate_mask_ht40); eep->modal_header_5g.paprd_rate_mask_ht40 = dword; for (i = 0; i < OSPREY_MAX_CHAINS; i++) { word = __bswap16(eep->modal_header_2g.ant_ctrl_chain[i]); eep->modal_header_2g.ant_ctrl_chain[i] = word; word = __bswap16(eep->modal_header_5g.ant_ctrl_chain[i]); eep->modal_header_5g.ant_ctrl_chain[i] = word; } } void ar9300_eeprom_template_swap(void) { int it; ar9300_eeprom_t *dptr; for (it = 0; it < ARRAY_LENGTH(default9300); it++) { dptr = ar9300_eeprom_struct_default(it); if (dptr != 0) { ar9300_swap_eeprom(dptr); } } } #endif /* * Restore the configuration structure by reading the eeprom. * This function destroys any existing in-memory structure content. */ HAL_BOOL ar9300_eeprom_restore(struct ath_hal *ah) { struct ath_hal_9300 *ahp = AH9300(ah); ar9300_eeprom_t *mptr; int mdata_size; HAL_BOOL status = AH_FALSE; mptr = &ahp->ah_eeprom; mdata_size = ar9300_eeprom_struct_size(); if (mptr != 0 && mdata_size > 0) { #if AH_BYTE_ORDER == AH_BIG_ENDIAN ar9300_eeprom_template_swap(); ar9300_swap_eeprom(mptr); #endif /* * At this point, mptr points to the eeprom data structure - * in it's "default" state. If this is big endian, swap the + * in its "default" state. If this is big endian, swap the * data structures back to "little endian" form. */ if (ar9300_eeprom_restore_internal(ah, mptr, mdata_size) >= 0) { status = AH_TRUE; } #if AH_BYTE_ORDER == AH_BIG_ENDIAN /* Second Swap, back to Big Endian */ ar9300_eeprom_template_swap(); ar9300_swap_eeprom(mptr); #endif } ahp->ah_2g_paprd_rate_mask_ht40 = mptr->modal_header_2g.paprd_rate_mask_ht40; ahp->ah_2g_paprd_rate_mask_ht20 = mptr->modal_header_2g.paprd_rate_mask_ht20; ahp->ah_5g_paprd_rate_mask_ht40 = mptr->modal_header_5g.paprd_rate_mask_ht40; ahp->ah_5g_paprd_rate_mask_ht20 = mptr->modal_header_5g.paprd_rate_mask_ht20; return status; } int32_t ar9300_thermometer_get(struct ath_hal *ah) { struct ath_hal_9300 *ahp = AH9300(ah); int thermometer; thermometer = (ahp->ah_eeprom.base_eep_header.misc_configuration >> 1) & 0x3; thermometer--; return thermometer; } HAL_BOOL ar9300_thermometer_apply(struct ath_hal *ah) { int thermometer = ar9300_thermometer_get(ah); /* ch0_RXTX4 */ /*#define AR_PHY_65NM_CH0_RXTX4 AR_PHY_65NM(ch0_RXTX4)*/ #define AR_PHY_65NM_CH1_RXTX4 AR_PHY_65NM(ch1_RXTX4) #define AR_PHY_65NM_CH2_RXTX4 AR_PHY_65NM(ch2_RXTX4) /*#define AR_PHY_65NM_CH0_RXTX4_THERM_ON 0x10000000*/ /*#define AR_PHY_65NM_CH0_RXTX4_THERM_ON_S 28*/ #define AR_PHY_65NM_CH0_RXTX4_THERM_ON_OVR_S 29 #define AR_PHY_65NM_CH0_RXTX4_THERM_ON_OVR \ (0x1<ah_eeprom; tuning_caps_params = eep->base_eep_header.params_for_tuning_caps[0]; return tuning_caps_params; } /* * Read the tuning caps params from eeprom and set to correct register. * To regulation the frequency accuracy. */ HAL_BOOL ar9300_tuning_caps_apply(struct ath_hal *ah) { int tuning_caps_params; ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; tuning_caps_params = ar9300_tuning_caps_params_get(ah); if ((eep->base_eep_header.feature_enable & 0x40) >> 6) { tuning_caps_params &= 0x7f; if (AR_SREV_POSEIDON(ah) || AR_SREV_WASP(ah) || AR_SREV_HONEYBEE(ah)) { return true; } else if (AR_SREV_HORNET(ah)) { OS_REG_RMW_FIELD(ah, AR_HORNET_CH0_XTAL, AR_OSPREY_CHO_XTAL_CAPINDAC, tuning_caps_params); OS_REG_RMW_FIELD(ah, AR_HORNET_CH0_XTAL, AR_OSPREY_CHO_XTAL_CAPOUTDAC, tuning_caps_params); } else if (AR_SREV_SCORPION(ah)) { OS_REG_RMW_FIELD(ah, AR_SCORPION_CH0_XTAL, AR_OSPREY_CHO_XTAL_CAPINDAC, tuning_caps_params); OS_REG_RMW_FIELD(ah, AR_SCORPION_CH0_XTAL, AR_OSPREY_CHO_XTAL_CAPOUTDAC, tuning_caps_params); } else { OS_REG_RMW_FIELD(ah, AR_OSPREY_CH0_XTAL, AR_OSPREY_CHO_XTAL_CAPINDAC, tuning_caps_params); OS_REG_RMW_FIELD(ah, AR_OSPREY_CH0_XTAL, AR_OSPREY_CHO_XTAL_CAPOUTDAC, tuning_caps_params); } } return AH_TRUE; } /* * Read the tx_frame_to_xpa_on param from eeprom and apply the value to * correct register. */ HAL_BOOL ar9300_xpa_timing_control_apply(struct ath_hal *ah, HAL_BOOL is_2ghz) { u_int8_t xpa_timing_control; ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; if ((eep->base_eep_header.feature_enable & 0x80) >> 7) { if (AR_SREV_OSPREY(ah) || AR_SREV_AR9580(ah) || AR_SREV_WASP(ah) || AR_SREV_HONEYBEE(ah)) { if (is_2ghz) { xpa_timing_control = eep->modal_header_2g.tx_frame_to_xpa_on; OS_REG_RMW_FIELD(ah, AR_PHY_XPA_TIMING_CTL, AR_PHY_XPA_TIMING_CTL_FRAME_XPAB_ON, xpa_timing_control); } else { xpa_timing_control = eep->modal_header_5g.tx_frame_to_xpa_on; OS_REG_RMW_FIELD(ah, AR_PHY_XPA_TIMING_CTL, AR_PHY_XPA_TIMING_CTL_FRAME_XPAA_ON, xpa_timing_control); } } } return AH_TRUE; } /* * Read the xLNA_bias_strength param from eeprom and apply the value to * correct register. */ HAL_BOOL ar9300_x_lNA_bias_strength_apply(struct ath_hal *ah, HAL_BOOL is_2ghz) { u_int8_t x_lNABias; u_int32_t value = 0; ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; if ((eep->base_eep_header.misc_configuration & 0x40) >> 6) { if (AR_SREV_OSPREY(ah)) { if (is_2ghz) { x_lNABias = eep->modal_header_2g.xLNA_bias_strength; } else { x_lNABias = eep->modal_header_5g.xLNA_bias_strength; } value = x_lNABias & ( 0x03 ); // bit0,1 for chain0 OS_REG_RMW_FIELD(ah, AR_PHY_65NM_CH0_RXTX4, AR_PHY_65NM_RXTX4_XLNA_BIAS, value); value = (x_lNABias >> 2) & ( 0x03 ); // bit2,3 for chain1 OS_REG_RMW_FIELD(ah, AR_PHY_65NM_CH1_RXTX4, AR_PHY_65NM_RXTX4_XLNA_BIAS, value); value = (x_lNABias >> 4) & ( 0x03 ); // bit4,5 for chain2 OS_REG_RMW_FIELD(ah, AR_PHY_65NM_CH2_RXTX4, AR_PHY_65NM_RXTX4_XLNA_BIAS, value); } } return AH_TRUE; } /* * Read EEPROM header info and program the device for correct operation * given the channel value. */ HAL_BOOL ar9300_eeprom_set_board_values(struct ath_hal *ah, const struct ieee80211_channel *chan) { HAL_CHANNEL_INTERNAL *ichan = ath_hal_checkchannel(ah, chan); ar9300_xpa_bias_level_apply(ah, IEEE80211_IS_CHAN_2GHZ(chan)); ar9300_xpa_timing_control_apply(ah, IEEE80211_IS_CHAN_2GHZ(chan)); ar9300_ant_ctrl_apply(ah, IEEE80211_IS_CHAN_2GHZ(chan)); ar9300_drive_strength_apply(ah); ar9300_x_lNA_bias_strength_apply(ah, IEEE80211_IS_CHAN_2GHZ(chan)); /* wait for Poseidon internal regular turnning */ /* for Hornet we move it before initPLL to avoid an access issue */ /* Function not used when EMULATION. */ if (!AR_SREV_HORNET(ah) && !AR_SREV_WASP(ah) && !AR_SREV_HONEYBEE(ah)) { ar9300_internal_regulator_apply(ah); } ar9300_attenuation_apply(ah, ichan->channel); ar9300_quick_drop_apply(ah, ichan->channel); ar9300_thermometer_apply(ah); if(!AR_SREV_WASP(ah)) { ar9300_tuning_caps_apply(ah); } ar9300_tx_end_to_xpab_off_apply(ah, ichan->channel); return AH_TRUE; } u_int8_t * ar9300_eeprom_get_spur_chans_ptr(struct ath_hal *ah, HAL_BOOL is_2ghz) { ar9300_eeprom_t *eep = &AH9300(ah)->ah_eeprom; if (is_2ghz) { return &(eep->modal_header_2g.spur_chans[0]); } else { return &(eep->modal_header_5g.spur_chans[0]); } } static u_int8_t ar9300_eeprom_get_tx_gain_table_number_max(struct ath_hal *ah) { unsigned long tx_gain_table_max; tx_gain_table_max = OS_REG_READ_FIELD(ah, AR_PHY_TPC_7, AR_PHY_TPC_7_TX_GAIN_TABLE_MAX); return tx_gain_table_max; } u_int8_t ar9300_eeprom_tx_gain_table_index_max_apply(struct ath_hal *ah, u_int16_t channel) { unsigned int index; ar9300_eeprom_t *ahp_Eeprom; struct ath_hal_9300 *ahp = AH9300(ah); ahp_Eeprom = &ahp->ah_eeprom; if (ahp_Eeprom->base_ext1.misc_enable == 0) return AH_FALSE; if (channel < 4000) { index = ahp_Eeprom->modal_header_2g.tx_gain_cap; } else { index = ahp_Eeprom->modal_header_5g.tx_gain_cap; } OS_REG_RMW_FIELD(ah, AR_PHY_TPC_7, AR_PHY_TPC_7_TX_GAIN_TABLE_MAX, index); return AH_TRUE; } static u_int8_t ar9300_eeprom_get_pcdac_tx_gain_table_i(struct ath_hal *ah, int i, u_int8_t *pcdac) { unsigned long tx_gain; u_int8_t tx_gain_table_max; tx_gain_table_max = ar9300_eeprom_get_tx_gain_table_number_max(ah); if (i <= 0 || i > tx_gain_table_max) { *pcdac = 0; return AH_FALSE; } tx_gain = OS_REG_READ(ah, AR_PHY_TXGAIN_TAB(1) + i * 4); *pcdac = ((tx_gain >> 24) & 0xff); return AH_TRUE; } u_int8_t ar9300_eeprom_set_tx_gain_cap(struct ath_hal *ah, int *tx_gain_max) // pcdac read back from reg, read back value depends on reset 2GHz/5GHz ini // tx_gain_table, this function will be called twice after each // band's calibration. // after 2GHz cal, tx_gain_max[0] has 2GHz, calibration max txgain, // tx_gain_max[1]=-100 // after 5GHz cal, tx_gain_max[0],tx_gain_max[1] have calibration // value for both band // reset is on 5GHz, reg reading from tx_gain_table is for 5GHz, // so program can't recalculate 2g.tx_gain_cap at this point. { int i = 0, ig, im = 0; u_int8_t pcdac = 0; u_int8_t tx_gain_table_max; ar9300_eeprom_t *ahp_Eeprom; struct ath_hal_9300 *ahp = AH9300(ah); ahp_Eeprom = &ahp->ah_eeprom; if (ahp_Eeprom->base_ext1.misc_enable == 0) return AH_FALSE; tx_gain_table_max = ar9300_eeprom_get_tx_gain_table_number_max(ah); for (i = 0; i < 2; i++) { if (tx_gain_max[i]>-100) { // -100 didn't cal that band. if ( i== 0) { if (tx_gain_max[1]>-100) { continue; // both band are calibrated, skip 2GHz 2g.tx_gain_cap reset } } for (ig = 1; ig <= tx_gain_table_max; ig++) { if (ah != 0 && ah->ah_reset != 0) { ar9300_eeprom_get_pcdac_tx_gain_table_i(ah, ig, &pcdac); if (pcdac >= tx_gain_max[i]) break; } } if (ig+1 <= tx_gain_table_max) { if (pcdac == tx_gain_max[i]) im = ig; else im = ig + 1; if (i == 0) { ahp_Eeprom->modal_header_2g.tx_gain_cap = im; } else { ahp_Eeprom->modal_header_5g.tx_gain_cap = im; } } else { if (i == 0) { ahp_Eeprom->modal_header_2g.tx_gain_cap = ig; } else { ahp_Eeprom->modal_header_5g.tx_gain_cap = ig; } } } } return AH_TRUE; } Index: head/sys/contrib/vchiq/interface/vchiq_arm/vchiq_arm.c =================================================================== --- head/sys/contrib/vchiq/interface/vchiq_arm/vchiq_arm.c (revision 308456) +++ head/sys/contrib/vchiq/interface/vchiq_arm/vchiq_arm.c (revision 308457) @@ -1,2930 +1,2930 @@ /** * Copyright (c) 2014 Raspberry Pi (Trading) Ltd. All rights reserved. * Copyright (c) 2010-2012 Broadcom. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The names of the above-listed copyright holders may not be used * to endorse or promote products derived from this software without * specific prior written permission. * * ALTERNATIVELY, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2, as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "vchiq_core.h" #include "vchiq_ioctl.h" #include "vchiq_arm.h" #define DEVICE_NAME "vchiq" /* Override the default prefix, which would be vchiq_arm (from the filename) */ #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX DEVICE_NAME "." #define VCHIQ_MINOR 0 /* Some per-instance constants */ #define MAX_COMPLETIONS 128 #define MAX_SERVICES 64 #define MAX_ELEMENTS 8 #define MSG_QUEUE_SIZE 128 #define KEEPALIVE_VER 1 #define KEEPALIVE_VER_MIN KEEPALIVE_VER /* Run time control of log level, based on KERN_XXX level. */ int vchiq_arm_log_level = VCHIQ_LOG_DEFAULT; int vchiq_susp_log_level = VCHIQ_LOG_ERROR; #define SUSPEND_TIMER_TIMEOUT_MS 100 #define SUSPEND_RETRY_TIMER_TIMEOUT_MS 1000 #define VC_SUSPEND_NUM_OFFSET 3 /* number of values before idle which are -ve */ static const char *const suspend_state_names[] = { "VC_SUSPEND_FORCE_CANCELED", "VC_SUSPEND_REJECTED", "VC_SUSPEND_FAILED", "VC_SUSPEND_IDLE", "VC_SUSPEND_REQUESTED", "VC_SUSPEND_IN_PROGRESS", "VC_SUSPEND_SUSPENDED" }; #define VC_RESUME_NUM_OFFSET 1 /* number of values before idle which are -ve */ static const char *const resume_state_names[] = { "VC_RESUME_FAILED", "VC_RESUME_IDLE", "VC_RESUME_REQUESTED", "VC_RESUME_IN_PROGRESS", "VC_RESUME_RESUMED" }; /* The number of times we allow force suspend to timeout before actually ** _forcing_ suspend. This is to cater for SW which fails to release vchiq ** correctly - we don't want to prevent ARM suspend indefinitely in this case. */ #define FORCE_SUSPEND_FAIL_MAX 8 /* The time in ms allowed for videocore to go idle when force suspend has been * requested */ #define FORCE_SUSPEND_TIMEOUT_MS 200 static void suspend_timer_callback(unsigned long context); #ifdef notyet static int vchiq_proc_add_instance(VCHIQ_INSTANCE_T instance); static void vchiq_proc_remove_instance(VCHIQ_INSTANCE_T instance); #endif typedef struct user_service_struct { VCHIQ_SERVICE_T *service; void *userdata; VCHIQ_INSTANCE_T instance; char is_vchi; char dequeue_pending; char close_pending; int message_available_pos; int msg_insert; int msg_remove; struct semaphore insert_event; struct semaphore remove_event; struct semaphore close_event; VCHIQ_HEADER_T * msg_queue[MSG_QUEUE_SIZE]; } USER_SERVICE_T; struct bulk_waiter_node { struct bulk_waiter bulk_waiter; int pid; struct list_head list; }; struct vchiq_instance_struct { VCHIQ_STATE_T *state; VCHIQ_COMPLETION_DATA_T completions[MAX_COMPLETIONS]; int completion_insert; int completion_remove; struct semaphore insert_event; struct semaphore remove_event; struct mutex completion_mutex; int connected; int closing; int pid; int mark; int use_close_delivered; int trace; struct list_head bulk_waiter_list; struct mutex bulk_waiter_list_mutex; #ifdef notyet VCHIQ_DEBUGFS_NODE_T proc_entry; #endif }; typedef struct dump_context_struct { char __user *buf; size_t actual; size_t space; loff_t offset; } DUMP_CONTEXT_T; static struct cdev * vchiq_cdev; VCHIQ_STATE_T g_state; static DEFINE_SPINLOCK(msg_queue_spinlock); static const char *const ioctl_names[] = { "CONNECT", "SHUTDOWN", "CREATE_SERVICE", "REMOVE_SERVICE", "QUEUE_MESSAGE", "QUEUE_BULK_TRANSMIT", "QUEUE_BULK_RECEIVE", "AWAIT_COMPLETION", "DEQUEUE_MESSAGE", "GET_CLIENT_ID", "GET_CONFIG", "CLOSE_SERVICE", "USE_SERVICE", "RELEASE_SERVICE", "SET_SERVICE_OPTION", "DUMP_PHYS_MEM", "LIB_VERSION", "CLOSE_DELIVERED" }; vchiq_static_assert((sizeof(ioctl_names)/sizeof(ioctl_names[0])) == (VCHIQ_IOC_MAX + 1)); static eventhandler_tag vchiq_ehtag = NULL; static d_open_t vchiq_open; static d_close_t vchiq_close; static d_ioctl_t vchiq_ioctl; static struct cdevsw vchiq_cdevsw = { .d_version = D_VERSION, .d_ioctl = vchiq_ioctl, .d_open = vchiq_open, .d_close = vchiq_close, .d_name = DEVICE_NAME, }; #if 0 static void dump_phys_mem(void *virt_addr, uint32_t num_bytes); #endif /**************************************************************************** * * add_completion * ***************************************************************************/ static VCHIQ_STATUS_T add_completion(VCHIQ_INSTANCE_T instance, VCHIQ_REASON_T reason, VCHIQ_HEADER_T *header, USER_SERVICE_T *user_service, void *bulk_userdata) { VCHIQ_COMPLETION_DATA_T *completion; int insert; DEBUG_INITIALISE(g_state.local) insert = instance->completion_insert; while ((insert - instance->completion_remove) >= MAX_COMPLETIONS) { /* Out of space - wait for the client */ DEBUG_TRACE(SERVICE_CALLBACK_LINE); vchiq_log_trace(vchiq_arm_log_level, "add_completion - completion queue full"); DEBUG_COUNT(COMPLETION_QUEUE_FULL_COUNT); if (down_interruptible(&instance->remove_event) != 0) { vchiq_log_info(vchiq_arm_log_level, "service_callback interrupted"); return VCHIQ_RETRY; } if (instance->closing) { vchiq_log_info(vchiq_arm_log_level, "service_callback closing"); return VCHIQ_SUCCESS; } DEBUG_TRACE(SERVICE_CALLBACK_LINE); } completion = &instance->completions[insert & (MAX_COMPLETIONS - 1)]; completion->header = header; completion->reason = reason; /* N.B. service_userdata is updated while processing AWAIT_COMPLETION */ completion->service_userdata = user_service->service; completion->bulk_userdata = bulk_userdata; if (reason == VCHIQ_SERVICE_CLOSED) { /* Take an extra reference, to be held until this CLOSED notification is delivered. */ lock_service(user_service->service); if (instance->use_close_delivered) user_service->close_pending = 1; } /* A write barrier is needed here to ensure that the entire completion record is written out before the insert point. */ wmb(); if (reason == VCHIQ_MESSAGE_AVAILABLE) user_service->message_available_pos = insert; instance->completion_insert = ++insert; up(&instance->insert_event); return VCHIQ_SUCCESS; } /**************************************************************************** * * service_callback * ***************************************************************************/ static VCHIQ_STATUS_T service_callback(VCHIQ_REASON_T reason, VCHIQ_HEADER_T *header, VCHIQ_SERVICE_HANDLE_T handle, void *bulk_userdata) { /* How do we ensure the callback goes to the right client? ** The service_user data points to a USER_SERVICE_T record containing ** the original callback and the user state structure, which contains a ** circular buffer for completion records. */ USER_SERVICE_T *user_service; VCHIQ_SERVICE_T *service; VCHIQ_INSTANCE_T instance; int skip_completion = 0; DEBUG_INITIALISE(g_state.local) DEBUG_TRACE(SERVICE_CALLBACK_LINE); service = handle_to_service(handle); BUG_ON(!service); user_service = (USER_SERVICE_T *)service->base.userdata; instance = user_service->instance; if (!instance || instance->closing) return VCHIQ_SUCCESS; vchiq_log_trace(vchiq_arm_log_level, "service_callback - service %lx(%d,%p), reason %d, header %lx, " "instance %lx, bulk_userdata %lx", (unsigned long)user_service, service->localport, user_service->userdata, reason, (unsigned long)header, (unsigned long)instance, (unsigned long)bulk_userdata); if (header && user_service->is_vchi) { spin_lock(&msg_queue_spinlock); while (user_service->msg_insert == (user_service->msg_remove + MSG_QUEUE_SIZE)) { spin_unlock(&msg_queue_spinlock); DEBUG_TRACE(SERVICE_CALLBACK_LINE); DEBUG_COUNT(MSG_QUEUE_FULL_COUNT); vchiq_log_trace(vchiq_arm_log_level, "service_callback - msg queue full"); /* If there is no MESSAGE_AVAILABLE in the completion ** queue, add one */ if ((user_service->message_available_pos - instance->completion_remove) < 0) { VCHIQ_STATUS_T status; vchiq_log_info(vchiq_arm_log_level, "Inserting extra MESSAGE_AVAILABLE"); DEBUG_TRACE(SERVICE_CALLBACK_LINE); status = add_completion(instance, reason, NULL, user_service, bulk_userdata); if (status != VCHIQ_SUCCESS) { DEBUG_TRACE(SERVICE_CALLBACK_LINE); return status; } } DEBUG_TRACE(SERVICE_CALLBACK_LINE); if (down_interruptible(&user_service->remove_event) != 0) { vchiq_log_info(vchiq_arm_log_level, "service_callback interrupted"); DEBUG_TRACE(SERVICE_CALLBACK_LINE); return VCHIQ_RETRY; } else if (instance->closing) { vchiq_log_info(vchiq_arm_log_level, "service_callback closing"); DEBUG_TRACE(SERVICE_CALLBACK_LINE); return VCHIQ_ERROR; } DEBUG_TRACE(SERVICE_CALLBACK_LINE); spin_lock(&msg_queue_spinlock); } user_service->msg_queue[user_service->msg_insert & (MSG_QUEUE_SIZE - 1)] = header; user_service->msg_insert++; /* If there is a thread waiting in DEQUEUE_MESSAGE, or if ** there is a MESSAGE_AVAILABLE in the completion queue then ** bypass the completion queue. */ if (((user_service->message_available_pos - instance->completion_remove) >= 0) || user_service->dequeue_pending) { user_service->dequeue_pending = 0; skip_completion = 1; } spin_unlock(&msg_queue_spinlock); up(&user_service->insert_event); header = NULL; } if (skip_completion) { DEBUG_TRACE(SERVICE_CALLBACK_LINE); return VCHIQ_SUCCESS; } DEBUG_TRACE(SERVICE_CALLBACK_LINE); return add_completion(instance, reason, header, user_service, bulk_userdata); } /**************************************************************************** * * user_service_free * ***************************************************************************/ static void user_service_free(void *userdata) { USER_SERVICE_T *user_service = userdata; _sema_destroy(&user_service->insert_event); _sema_destroy(&user_service->remove_event); kfree(user_service); } /**************************************************************************** * * close_delivered * ***************************************************************************/ static void close_delivered(USER_SERVICE_T *user_service) { vchiq_log_info(vchiq_arm_log_level, "close_delivered(handle=%x)", user_service->service->handle); if (user_service->close_pending) { /* Allow the underlying service to be culled */ unlock_service(user_service->service); /* Wake the user-thread blocked in close_ or remove_service */ up(&user_service->close_event); user_service->close_pending = 0; } } /**************************************************************************** * * vchiq_ioctl * ***************************************************************************/ static int vchiq_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int fflag, struct thread *td) { VCHIQ_INSTANCE_T instance; VCHIQ_STATUS_T status = VCHIQ_SUCCESS; VCHIQ_SERVICE_T *service = NULL; int ret = 0; int i, rc; DEBUG_INITIALISE(g_state.local) if ((ret = devfs_get_cdevpriv((void**)&instance))) { printf("vchiq_ioctl: devfs_get_cdevpriv failed: error %d\n", ret); return (ret); } /* XXXBSD: HACK! */ #define _IOC_NR(x) ((x) & 0xff) #define _IOC_TYPE(x) IOCGROUP(x) vchiq_log_trace(vchiq_arm_log_level, "vchiq_ioctl - instance %x, cmd %s, arg %p", (unsigned int)instance, ((_IOC_TYPE(cmd) == VCHIQ_IOC_MAGIC) && (_IOC_NR(cmd) <= VCHIQ_IOC_MAX)) ? ioctl_names[_IOC_NR(cmd)] : "", arg); switch (cmd) { case VCHIQ_IOC_SHUTDOWN: if (!instance->connected) break; /* Remove all services */ i = 0; while ((service = next_service_by_instance(instance->state, instance, &i)) != NULL) { status = vchiq_remove_service(service->handle); unlock_service(service); if (status != VCHIQ_SUCCESS) break; } service = NULL; if (status == VCHIQ_SUCCESS) { /* Wake the completion thread and ask it to exit */ instance->closing = 1; up(&instance->insert_event); } break; case VCHIQ_IOC_CONNECT: if (instance->connected) { ret = -EINVAL; break; } rc = lmutex_lock_interruptible(&instance->state->mutex); if (rc != 0) { vchiq_log_error(vchiq_arm_log_level, "vchiq: connect: could not lock mutex for " "state %d: %d", instance->state->id, rc); ret = -EINTR; break; } status = vchiq_connect_internal(instance->state, instance); lmutex_unlock(&instance->state->mutex); if (status == VCHIQ_SUCCESS) instance->connected = 1; else vchiq_log_error(vchiq_arm_log_level, "vchiq: could not connect: %d", status); break; case VCHIQ_IOC_CREATE_SERVICE: { VCHIQ_CREATE_SERVICE_T args; USER_SERVICE_T *user_service = NULL; void *userdata; int srvstate; memcpy(&args, (const void*)arg, sizeof(args)); user_service = kmalloc(sizeof(USER_SERVICE_T), GFP_KERNEL); if (!user_service) { ret = -ENOMEM; break; } if (args.is_open) { if (!instance->connected) { ret = -ENOTCONN; kfree(user_service); break; } srvstate = VCHIQ_SRVSTATE_OPENING; } else { srvstate = instance->connected ? VCHIQ_SRVSTATE_LISTENING : VCHIQ_SRVSTATE_HIDDEN; } userdata = args.params.userdata; args.params.callback = service_callback; args.params.userdata = user_service; service = vchiq_add_service_internal( instance->state, &args.params, srvstate, instance, user_service_free); if (service != NULL) { user_service->service = service; user_service->userdata = userdata; user_service->instance = instance; user_service->is_vchi = (args.is_vchi != 0); user_service->dequeue_pending = 0; user_service->close_pending = 0; user_service->message_available_pos = instance->completion_remove - 1; user_service->msg_insert = 0; user_service->msg_remove = 0; _sema_init(&user_service->insert_event, 0); _sema_init(&user_service->remove_event, 0); _sema_init(&user_service->close_event, 0); if (args.is_open) { status = vchiq_open_service_internal (service, instance->pid); if (status != VCHIQ_SUCCESS) { vchiq_remove_service(service->handle); service = NULL; ret = (status == VCHIQ_RETRY) ? -EINTR : -EIO; break; } } #ifdef VCHIQ_IOCTL_DEBUG printf("%s: [CREATE SERVICE] handle = %08x\n", __func__, service->handle); #endif memcpy((void *) &(((VCHIQ_CREATE_SERVICE_T*) arg)->handle), (const void *)&service->handle, sizeof(service->handle)); service = NULL; } else { ret = -EEXIST; kfree(user_service); } } break; case VCHIQ_IOC_CLOSE_SERVICE: { VCHIQ_SERVICE_HANDLE_T handle; memcpy(&handle, (const void*)arg, sizeof(handle)); #ifdef VCHIQ_IOCTL_DEBUG printf("%s: [CLOSE SERVICE] handle = %08x\n", __func__, handle); #endif service = find_service_for_instance(instance, handle); if (service != NULL) { USER_SERVICE_T *user_service = (USER_SERVICE_T *)service->base.userdata; /* close_pending is false on first entry, and when the wait in vchiq_close_service has been interrupted. */ if (!user_service->close_pending) { status = vchiq_close_service(service->handle); if (status != VCHIQ_SUCCESS) break; } /* close_pending is true once the underlying service has been closed until the client library calls the CLOSE_DELIVERED ioctl, signalling close_event. */ if (user_service->close_pending && down_interruptible(&user_service->close_event)) status = VCHIQ_RETRY; } else ret = -EINVAL; } break; case VCHIQ_IOC_REMOVE_SERVICE: { VCHIQ_SERVICE_HANDLE_T handle; memcpy(&handle, (const void*)arg, sizeof(handle)); #ifdef VCHIQ_IOCTL_DEBUG printf("%s: [REMOVE SERVICE] handle = %08x\n", __func__, handle); #endif service = find_service_for_instance(instance, handle); if (service != NULL) { USER_SERVICE_T *user_service = (USER_SERVICE_T *)service->base.userdata; /* close_pending is false on first entry, and when the wait in vchiq_close_service has been interrupted. */ if (!user_service->close_pending) { status = vchiq_remove_service(service->handle); if (status != VCHIQ_SUCCESS) break; } /* close_pending is true once the underlying service has been closed until the client library calls the CLOSE_DELIVERED ioctl, signalling close_event. */ if (user_service->close_pending && down_interruptible(&user_service->close_event)) status = VCHIQ_RETRY; } else ret = -EINVAL; } break; case VCHIQ_IOC_USE_SERVICE: case VCHIQ_IOC_RELEASE_SERVICE: { VCHIQ_SERVICE_HANDLE_T handle; memcpy(&handle, (const void*)arg, sizeof(handle)); #ifdef VCHIQ_IOCTL_DEBUG printf("%s: [%s SERVICE] handle = %08x\n", __func__, cmd == VCHIQ_IOC_USE_SERVICE ? "USE" : "RELEASE", handle); #endif service = find_service_for_instance(instance, handle); if (service != NULL) { status = (cmd == VCHIQ_IOC_USE_SERVICE) ? vchiq_use_service_internal(service) : vchiq_release_service_internal(service); if (status != VCHIQ_SUCCESS) { vchiq_log_error(vchiq_susp_log_level, "%s: cmd %s returned error %d for " "service %c%c%c%c:%8x", __func__, (cmd == VCHIQ_IOC_USE_SERVICE) ? "VCHIQ_IOC_USE_SERVICE" : "VCHIQ_IOC_RELEASE_SERVICE", status, VCHIQ_FOURCC_AS_4CHARS( service->base.fourcc), service->client_id); ret = -EINVAL; } } else ret = -EINVAL; } break; case VCHIQ_IOC_QUEUE_MESSAGE: { VCHIQ_QUEUE_MESSAGE_T args; memcpy(&args, (const void*)arg, sizeof(args)); #ifdef VCHIQ_IOCTL_DEBUG printf("%s: [QUEUE MESSAGE] handle = %08x\n", __func__, args.handle); #endif service = find_service_for_instance(instance, args.handle); if ((service != NULL) && (args.count <= MAX_ELEMENTS)) { /* Copy elements into kernel space */ VCHIQ_ELEMENT_T elements[MAX_ELEMENTS]; if (copy_from_user(elements, args.elements, args.count * sizeof(VCHIQ_ELEMENT_T)) == 0) status = vchiq_queue_message (args.handle, elements, args.count); else ret = -EFAULT; } else { ret = -EINVAL; } } break; case VCHIQ_IOC_QUEUE_BULK_TRANSMIT: case VCHIQ_IOC_QUEUE_BULK_RECEIVE: { VCHIQ_QUEUE_BULK_TRANSFER_T args; struct bulk_waiter_node *waiter = NULL; VCHIQ_BULK_DIR_T dir = (cmd == VCHIQ_IOC_QUEUE_BULK_TRANSMIT) ? VCHIQ_BULK_TRANSMIT : VCHIQ_BULK_RECEIVE; memcpy(&args, (const void*)arg, sizeof(args)); service = find_service_for_instance(instance, args.handle); if (!service) { ret = -EINVAL; break; } if (args.mode == VCHIQ_BULK_MODE_BLOCKING) { waiter = kzalloc(sizeof(struct bulk_waiter_node), GFP_KERNEL); if (!waiter) { ret = -ENOMEM; break; } args.userdata = &waiter->bulk_waiter; } else if (args.mode == VCHIQ_BULK_MODE_WAITING) { struct list_head *pos; lmutex_lock(&instance->bulk_waiter_list_mutex); list_for_each(pos, &instance->bulk_waiter_list) { if (list_entry(pos, struct bulk_waiter_node, list)->pid == current->p_pid) { waiter = list_entry(pos, struct bulk_waiter_node, list); list_del(pos); break; } } lmutex_unlock(&instance->bulk_waiter_list_mutex); if (!waiter) { vchiq_log_error(vchiq_arm_log_level, "no bulk_waiter found for pid %d", current->p_pid); ret = -ESRCH; break; } vchiq_log_info(vchiq_arm_log_level, "found bulk_waiter %x for pid %d", (unsigned int)waiter, current->p_pid); args.userdata = &waiter->bulk_waiter; } status = vchiq_bulk_transfer (args.handle, VCHI_MEM_HANDLE_INVALID, args.data, args.size, args.userdata, args.mode, dir); if (!waiter) break; if ((status != VCHIQ_RETRY) || fatal_signal_pending(current) || !waiter->bulk_waiter.bulk) { if (waiter->bulk_waiter.bulk) { /* Cancel the signal when the transfer ** completes. */ spin_lock(&bulk_waiter_spinlock); waiter->bulk_waiter.bulk->userdata = NULL; spin_unlock(&bulk_waiter_spinlock); } _sema_destroy(&waiter->bulk_waiter.event); kfree(waiter); } else { const VCHIQ_BULK_MODE_T mode_waiting = VCHIQ_BULK_MODE_WAITING; waiter->pid = current->p_pid; lmutex_lock(&instance->bulk_waiter_list_mutex); list_add(&waiter->list, &instance->bulk_waiter_list); lmutex_unlock(&instance->bulk_waiter_list_mutex); vchiq_log_info(vchiq_arm_log_level, "saved bulk_waiter %x for pid %d", (unsigned int)waiter, current->p_pid); memcpy((void *) &(((VCHIQ_QUEUE_BULK_TRANSFER_T *) arg)->mode), (const void *)&mode_waiting, sizeof(mode_waiting)); } } break; case VCHIQ_IOC_AWAIT_COMPLETION: { VCHIQ_AWAIT_COMPLETION_T args; int count = 0; DEBUG_TRACE(AWAIT_COMPLETION_LINE); if (!instance->connected) { ret = -ENOTCONN; break; } memcpy(&args, (const void*)arg, sizeof(args)); lmutex_lock(&instance->completion_mutex); DEBUG_TRACE(AWAIT_COMPLETION_LINE); while ((instance->completion_remove == instance->completion_insert) && !instance->closing) { DEBUG_TRACE(AWAIT_COMPLETION_LINE); lmutex_unlock(&instance->completion_mutex); rc = down_interruptible(&instance->insert_event); lmutex_lock(&instance->completion_mutex); if (rc != 0) { DEBUG_TRACE(AWAIT_COMPLETION_LINE); vchiq_log_info(vchiq_arm_log_level, "AWAIT_COMPLETION interrupted"); ret = -EINTR; break; } } DEBUG_TRACE(AWAIT_COMPLETION_LINE); if (ret == 0) { int msgbufcount = args.msgbufcount; int remove; remove = instance->completion_remove; for (count = 0; count < args.count; count++) { VCHIQ_COMPLETION_DATA_T *completion; VCHIQ_SERVICE_T *service1; USER_SERVICE_T *user_service; VCHIQ_HEADER_T *header; if (remove == instance->completion_insert) break; completion = &instance->completions[ remove & (MAX_COMPLETIONS - 1)]; /* A read memory barrier is needed to prevent ** the prefetch of a stale completion record */ rmb(); service1 = completion->service_userdata; user_service = service1->base.userdata; completion->service_userdata = user_service->userdata; header = completion->header; if (header) { void __user *msgbuf; int msglen; msglen = header->size + sizeof(VCHIQ_HEADER_T); /* This must be a VCHIQ-style service */ if (args.msgbufsize < msglen) { vchiq_log_error( vchiq_arm_log_level, "header %x: msgbufsize" " %x < msglen %x", (unsigned int)header, args.msgbufsize, msglen); WARN(1, "invalid message " "size\n"); if (count == 0) ret = -EMSGSIZE; break; } if (msgbufcount <= 0) /* Stall here for lack of a ** buffer for the message. */ break; /* Get the pointer from user space */ msgbufcount--; if (copy_from_user(&msgbuf, (const void __user *) &args.msgbufs[msgbufcount], sizeof(msgbuf)) != 0) { if (count == 0) ret = -EFAULT; break; } /* Copy the message to user space */ if (copy_to_user(msgbuf, header, msglen) != 0) { if (count == 0) ret = -EFAULT; break; } /* Now it has been copied, the message ** can be released. */ vchiq_release_message(service1->handle, header); /* The completion must point to the ** msgbuf. */ completion->header = msgbuf; } if ((completion->reason == VCHIQ_SERVICE_CLOSED) && !instance->use_close_delivered) unlock_service(service1); if (copy_to_user((void __user *)( (size_t)args.buf + count * sizeof(VCHIQ_COMPLETION_DATA_T)), completion, sizeof(VCHIQ_COMPLETION_DATA_T)) != 0) { if (ret == 0) ret = -EFAULT; break; } /* Ensure that the above copy has completed ** before advancing the remove pointer. */ mb(); instance->completion_remove = ++remove; } if (msgbufcount != args.msgbufcount) { memcpy((void __user *) &((VCHIQ_AWAIT_COMPLETION_T *)arg)-> msgbufcount, &msgbufcount, sizeof(msgbufcount)); } if (count != args.count) { memcpy((void __user *) &((VCHIQ_AWAIT_COMPLETION_T *)arg)->count, &count, sizeof(count)); } } if (count != 0) up(&instance->remove_event); if ((ret == 0) && instance->closing) ret = -ENOTCONN; /* * XXXBSD: ioctl return codes are not negative as in linux, so * we can not indicate success with positive number of passed * messages */ if (ret > 0) ret = 0; lmutex_unlock(&instance->completion_mutex); DEBUG_TRACE(AWAIT_COMPLETION_LINE); } break; case VCHIQ_IOC_DEQUEUE_MESSAGE: { VCHIQ_DEQUEUE_MESSAGE_T args; USER_SERVICE_T *user_service; VCHIQ_HEADER_T *header; DEBUG_TRACE(DEQUEUE_MESSAGE_LINE); memcpy(&args, (const void*)arg, sizeof(args)); service = find_service_for_instance(instance, args.handle); if (!service) { ret = -EINVAL; break; } user_service = (USER_SERVICE_T *)service->base.userdata; if (user_service->is_vchi == 0) { ret = -EINVAL; break; } spin_lock(&msg_queue_spinlock); if (user_service->msg_remove == user_service->msg_insert) { if (!args.blocking) { spin_unlock(&msg_queue_spinlock); DEBUG_TRACE(DEQUEUE_MESSAGE_LINE); ret = -EWOULDBLOCK; break; } user_service->dequeue_pending = 1; do { spin_unlock(&msg_queue_spinlock); DEBUG_TRACE(DEQUEUE_MESSAGE_LINE); if (down_interruptible( &user_service->insert_event) != 0) { vchiq_log_info(vchiq_arm_log_level, "DEQUEUE_MESSAGE interrupted"); ret = -EINTR; break; } spin_lock(&msg_queue_spinlock); } while (user_service->msg_remove == user_service->msg_insert); if (ret) break; } BUG_ON((int)(user_service->msg_insert - user_service->msg_remove) < 0); header = user_service->msg_queue[user_service->msg_remove & (MSG_QUEUE_SIZE - 1)]; user_service->msg_remove++; spin_unlock(&msg_queue_spinlock); up(&user_service->remove_event); if (header == NULL) ret = -ENOTCONN; else if (header->size <= args.bufsize) { /* Copy to user space if msgbuf is not NULL */ if ((args.buf == NULL) || (copy_to_user((void __user *)args.buf, header->data, header->size) == 0)) { args.bufsize = header->size; memcpy((void *)arg, &args, sizeof(args)); vchiq_release_message( service->handle, header); } else ret = -EFAULT; } else { vchiq_log_error(vchiq_arm_log_level, "header %x: bufsize %x < size %x", (unsigned int)header, args.bufsize, header->size); WARN(1, "invalid size\n"); ret = -EMSGSIZE; } DEBUG_TRACE(DEQUEUE_MESSAGE_LINE); } break; case VCHIQ_IOC_GET_CLIENT_ID: { VCHIQ_SERVICE_HANDLE_T handle; memcpy(&handle, (const void*)arg, sizeof(handle)); ret = vchiq_get_client_id(handle); } break; case VCHIQ_IOC_GET_CONFIG: { VCHIQ_GET_CONFIG_T args; VCHIQ_CONFIG_T config; memcpy(&args, (const void*)arg, sizeof(args)); if (args.config_size > sizeof(config)) { ret = -EINVAL; break; } status = vchiq_get_config(instance, args.config_size, &config); if (status == VCHIQ_SUCCESS) { if (copy_to_user((void __user *)args.pconfig, &config, args.config_size) != 0) { ret = -EFAULT; break; } } } break; case VCHIQ_IOC_SET_SERVICE_OPTION: { VCHIQ_SET_SERVICE_OPTION_T args; memcpy(&args, (const void*)arg, sizeof(args)); service = find_service_for_instance(instance, args.handle); if (!service) { ret = -EINVAL; break; } status = vchiq_set_service_option( args.handle, args.option, args.value); } break; case VCHIQ_IOC_DUMP_PHYS_MEM: { VCHIQ_DUMP_MEM_T args; memcpy(&args, (const void*)arg, sizeof(args)); printf("IMPLEMENT ME: %s:%d\n", __FILE__, __LINE__); #if 0 dump_phys_mem(args.virt_addr, args.num_bytes); #endif } break; case VCHIQ_IOC_LIB_VERSION: { unsigned int lib_version = (unsigned int)arg; if (lib_version < VCHIQ_VERSION_MIN) ret = -EINVAL; else if (lib_version >= VCHIQ_VERSION_CLOSE_DELIVERED) instance->use_close_delivered = 1; } break; case VCHIQ_IOC_CLOSE_DELIVERED: { VCHIQ_SERVICE_HANDLE_T handle; memcpy(&handle, (const void*)arg, sizeof(handle)); service = find_closed_service_for_instance(instance, handle); if (service != NULL) { USER_SERVICE_T *user_service = (USER_SERVICE_T *)service->base.userdata; close_delivered(user_service); } else ret = -EINVAL; } break; default: ret = -ENOTTY; break; } if (service) unlock_service(service); if (ret == 0) { if (status == VCHIQ_ERROR) ret = -EIO; else if (status == VCHIQ_RETRY) ret = -EINTR; } if ((status == VCHIQ_SUCCESS) && (ret < 0) && (ret != -EINTR) && (ret != -EWOULDBLOCK)) vchiq_log_info(vchiq_arm_log_level, " ioctl instance %lx, cmd %s -> status %d, %d", (unsigned long)instance, (_IOC_NR(cmd) <= VCHIQ_IOC_MAX) ? ioctl_names[_IOC_NR(cmd)] : "", status, ret); else vchiq_log_trace(vchiq_arm_log_level, " ioctl instance %lx, cmd %s -> status %d, %d", (unsigned long)instance, (_IOC_NR(cmd) <= VCHIQ_IOC_MAX) ? ioctl_names[_IOC_NR(cmd)] : "", status, ret); /* XXXBSD: report BSD-style error to userland */ if (ret < 0) ret = -ret; return ret; } static void instance_dtr(void *data) { kfree(data); } /**************************************************************************** * * vchiq_open * ***************************************************************************/ static int vchiq_open(struct cdev *dev, int flags, int fmt __unused, struct thread *td) { vchiq_log_info(vchiq_arm_log_level, "vchiq_open"); /* XXXBSD: do we really need this check? */ if (1) { VCHIQ_STATE_T *state = vchiq_get_state(); VCHIQ_INSTANCE_T instance; if (!state) { vchiq_log_error(vchiq_arm_log_level, "vchiq has no connection to VideoCore"); return -ENOTCONN; } instance = kmalloc(sizeof(*instance), GFP_KERNEL); if (!instance) return -ENOMEM; instance->state = state; /* XXXBSD: PID or thread ID? */ instance->pid = td->td_proc->p_pid; #ifdef notyet ret = vchiq_proc_add_instance(instance); if (ret != 0) { kfree(instance); return ret; } #endif _sema_init(&instance->insert_event, 0); _sema_init(&instance->remove_event, 0); lmutex_init(&instance->completion_mutex); lmutex_init(&instance->bulk_waiter_list_mutex); INIT_LIST_HEAD(&instance->bulk_waiter_list); devfs_set_cdevpriv(instance, instance_dtr); } else { vchiq_log_error(vchiq_arm_log_level, "Unknown minor device"); return -ENXIO; } return 0; } /**************************************************************************** * * vchiq_release * ***************************************************************************/ static int vchiq_close(struct cdev *dev, int flags __unused, int fmt __unused, struct thread *td) { int ret = 0; if (1) { VCHIQ_INSTANCE_T instance; VCHIQ_STATE_T *state = vchiq_get_state(); VCHIQ_SERVICE_T *service; int i; if ((ret = devfs_get_cdevpriv((void**)&instance))) { printf("devfs_get_cdevpriv failed: error %d\n", ret); return (ret); } vchiq_log_info(vchiq_arm_log_level, "vchiq_release: instance=%lx", (unsigned long)instance); if (!state) { ret = -EPERM; goto out; } /* Ensure videocore is awake to allow termination. */ vchiq_use_internal(instance->state, NULL, USE_TYPE_VCHIQ); lmutex_lock(&instance->completion_mutex); /* Wake the completion thread and ask it to exit */ instance->closing = 1; up(&instance->insert_event); lmutex_unlock(&instance->completion_mutex); /* Wake the slot handler if the completion queue is full. */ up(&instance->remove_event); /* Mark all services for termination... */ i = 0; while ((service = next_service_by_instance(state, instance, &i)) != NULL) { USER_SERVICE_T *user_service = service->base.userdata; /* Wake the slot handler if the msg queue is full. */ up(&user_service->remove_event); vchiq_terminate_service_internal(service); unlock_service(service); } /* ...and wait for them to die */ i = 0; while ((service = next_service_by_instance(state, instance, &i)) != NULL) { USER_SERVICE_T *user_service = service->base.userdata; down(&service->remove_event); BUG_ON(service->srvstate != VCHIQ_SRVSTATE_FREE); spin_lock(&msg_queue_spinlock); while (user_service->msg_remove != user_service->msg_insert) { VCHIQ_HEADER_T *header = user_service-> msg_queue[user_service->msg_remove & (MSG_QUEUE_SIZE - 1)]; user_service->msg_remove++; spin_unlock(&msg_queue_spinlock); if (header) vchiq_release_message( service->handle, header); spin_lock(&msg_queue_spinlock); } spin_unlock(&msg_queue_spinlock); unlock_service(service); } /* Release any closed services */ while (instance->completion_remove != instance->completion_insert) { VCHIQ_COMPLETION_DATA_T *completion; VCHIQ_SERVICE_T *service1; completion = &instance->completions[ instance->completion_remove & (MAX_COMPLETIONS - 1)]; service1 = completion->service_userdata; if (completion->reason == VCHIQ_SERVICE_CLOSED) { USER_SERVICE_T *user_service = service->base.userdata; /* Wake any blocked user-thread */ if (instance->use_close_delivered) up(&user_service->close_event); unlock_service(service1); } instance->completion_remove++; } /* Release the PEER service count. */ vchiq_release_internal(instance->state, NULL); { struct list_head *pos, *next; list_for_each_safe(pos, next, &instance->bulk_waiter_list) { struct bulk_waiter_node *waiter; waiter = list_entry(pos, struct bulk_waiter_node, list); list_del(pos); vchiq_log_info(vchiq_arm_log_level, "bulk_waiter - cleaned up %x " "for pid %d", (unsigned int)waiter, waiter->pid); _sema_destroy(&waiter->bulk_waiter.event); kfree(waiter); } } } else { vchiq_log_error(vchiq_arm_log_level, "Unknown minor device"); ret = -ENXIO; } out: return ret; } /**************************************************************************** * * vchiq_dump * ***************************************************************************/ void vchiq_dump(void *dump_context, const char *str, int len) { DUMP_CONTEXT_T *context = (DUMP_CONTEXT_T *)dump_context; if (context->actual < context->space) { int copy_bytes; if (context->offset > 0) { int skip_bytes = min(len, (int)context->offset); str += skip_bytes; len -= skip_bytes; context->offset -= skip_bytes; if (context->offset > 0) return; } copy_bytes = min(len, (int)(context->space - context->actual)); if (copy_bytes == 0) return; memcpy(context->buf + context->actual, str, copy_bytes); context->actual += copy_bytes; len -= copy_bytes; /* If tne terminating NUL is included in the length, then it ** marks the end of a line and should be replaced with a ** carriage return. */ if ((len == 0) && (str[copy_bytes - 1] == '\0')) { char cr = '\n'; memcpy(context->buf + context->actual - 1, &cr, 1); } } } /**************************************************************************** * * vchiq_dump_platform_instance_state * ***************************************************************************/ void vchiq_dump_platform_instances(void *dump_context) { VCHIQ_STATE_T *state = vchiq_get_state(); char buf[80]; int len; int i; /* There is no list of instances, so instead scan all services, marking those that have been dumped. */ for (i = 0; i < state->unused_service; i++) { VCHIQ_SERVICE_T *service = state->services[i]; VCHIQ_INSTANCE_T instance; if (service && (service->base.callback == service_callback)) { instance = service->instance; if (instance) instance->mark = 0; } } for (i = 0; i < state->unused_service; i++) { VCHIQ_SERVICE_T *service = state->services[i]; VCHIQ_INSTANCE_T instance; if (service && (service->base.callback == service_callback)) { instance = service->instance; if (instance && !instance->mark) { len = snprintf(buf, sizeof(buf), "Instance %x: pid %d,%s completions " "%d/%d", (unsigned int)instance, instance->pid, instance->connected ? " connected, " : "", instance->completion_insert - instance->completion_remove, MAX_COMPLETIONS); vchiq_dump(dump_context, buf, len + 1); instance->mark = 1; } } } } /**************************************************************************** * * vchiq_dump_platform_service_state * ***************************************************************************/ void vchiq_dump_platform_service_state(void *dump_context, VCHIQ_SERVICE_T *service) { USER_SERVICE_T *user_service = (USER_SERVICE_T *)service->base.userdata; char buf[80]; int len; len = snprintf(buf, sizeof(buf), " instance %x", (unsigned int)service->instance); if ((service->base.callback == service_callback) && user_service->is_vchi) { len += snprintf(buf + len, sizeof(buf) - len, ", %d/%d messages", user_service->msg_insert - user_service->msg_remove, MSG_QUEUE_SIZE); if (user_service->dequeue_pending) len += snprintf(buf + len, sizeof(buf) - len, " (dequeue pending)"); } vchiq_dump(dump_context, buf, len + 1); } #ifdef notyet /**************************************************************************** * * dump_user_mem * ***************************************************************************/ static void dump_phys_mem(void *virt_addr, uint32_t num_bytes) { int rc; uint8_t *end_virt_addr = virt_addr + num_bytes; int num_pages; int offset; int end_offset; int page_idx; int prev_idx; struct page *page; struct page **pages; uint8_t *kmapped_virt_ptr; /* Align virtAddr and endVirtAddr to 16 byte boundaries. */ virt_addr = (void *)((unsigned long)virt_addr & ~0x0fuL); end_virt_addr = (void *)(((unsigned long)end_virt_addr + 15uL) & ~0x0fuL); offset = (int)(long)virt_addr & (PAGE_SIZE - 1); end_offset = (int)(long)end_virt_addr & (PAGE_SIZE - 1); num_pages = (offset + num_bytes + PAGE_SIZE - 1) / PAGE_SIZE; pages = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); if (pages == NULL) { vchiq_log_error(vchiq_arm_log_level, "Unable to allocation memory for %d pages\n", num_pages); return; } down_read(¤t->mm->mmap_sem); rc = get_user_pages(current, /* task */ current->mm, /* mm */ (unsigned long)virt_addr, /* start */ num_pages, /* len */ 0, /* write */ 0, /* force */ pages, /* pages (array of page pointers) */ NULL); /* vmas */ up_read(¤t->mm->mmap_sem); prev_idx = -1; page = NULL; while (offset < end_offset) { int page_offset = offset % PAGE_SIZE; page_idx = offset / PAGE_SIZE; if (page_idx != prev_idx) { if (page != NULL) kunmap(page); page = pages[page_idx]; kmapped_virt_ptr = kmap(page); prev_idx = page_idx; } if (vchiq_arm_log_level >= VCHIQ_LOG_TRACE) vchiq_log_dump_mem("ph", (uint32_t)(unsigned long)&kmapped_virt_ptr[ page_offset], &kmapped_virt_ptr[page_offset], 16); offset += 16; } if (page != NULL) kunmap(page); for (page_idx = 0; page_idx < num_pages; page_idx++) page_cache_release(pages[page_idx]); kfree(pages); } /**************************************************************************** * * vchiq_read * ***************************************************************************/ static ssize_t vchiq_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { DUMP_CONTEXT_T context; context.buf = buf; context.actual = 0; context.space = count; context.offset = *ppos; vchiq_dump_state(&context, &g_state); *ppos += context.actual; return context.actual; } #endif VCHIQ_STATE_T * vchiq_get_state(void) { if (g_state.remote == NULL) printk(KERN_ERR "%s: g_state.remote == NULL\n", __func__); else if (g_state.remote->initialised != 1) printk(KERN_NOTICE "%s: g_state.remote->initialised != 1 (%d)\n", __func__, g_state.remote->initialised); return ((g_state.remote != NULL) && (g_state.remote->initialised == 1)) ? &g_state : NULL; } /* * Autosuspend related functionality */ int vchiq_videocore_wanted(VCHIQ_STATE_T *state) { VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); if (!arm_state) /* autosuspend not supported - always return wanted */ return 1; else if (arm_state->blocked_count) return 1; else if (!arm_state->videocore_use_count) /* usage count zero - check for override unless we're forcing */ if (arm_state->resume_blocked) return 0; else return vchiq_platform_videocore_wanted(state); else /* non-zero usage count - videocore still required */ return 1; } static VCHIQ_STATUS_T vchiq_keepalive_vchiq_callback(VCHIQ_REASON_T reason, VCHIQ_HEADER_T *header, VCHIQ_SERVICE_HANDLE_T service_user, void *bulk_user) { vchiq_log_error(vchiq_susp_log_level, "%s callback reason %d", __func__, reason); return 0; } static int vchiq_keepalive_thread_func(void *v) { VCHIQ_STATE_T *state = (VCHIQ_STATE_T *) v; VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); VCHIQ_STATUS_T status; VCHIQ_INSTANCE_T instance; VCHIQ_SERVICE_HANDLE_T ka_handle; VCHIQ_SERVICE_PARAMS_T params = { .fourcc = VCHIQ_MAKE_FOURCC('K', 'E', 'E', 'P'), .callback = vchiq_keepalive_vchiq_callback, .version = KEEPALIVE_VER, .version_min = KEEPALIVE_VER_MIN }; status = vchiq_initialise(&instance); if (status != VCHIQ_SUCCESS) { vchiq_log_error(vchiq_susp_log_level, "%s vchiq_initialise failed %d", __func__, status); goto exit; } status = vchiq_connect(instance); if (status != VCHIQ_SUCCESS) { vchiq_log_error(vchiq_susp_log_level, "%s vchiq_connect failed %d", __func__, status); goto shutdown; } status = vchiq_add_service(instance, ¶ms, &ka_handle); if (status != VCHIQ_SUCCESS) { vchiq_log_error(vchiq_susp_log_level, "%s vchiq_open_service failed %d", __func__, status); goto shutdown; } while (1) { long rc = 0, uc = 0; if (wait_for_completion_interruptible(&arm_state->ka_evt) != 0) { vchiq_log_error(vchiq_susp_log_level, "%s interrupted", __func__); flush_signals(current); continue; } /* read and clear counters. Do release_count then use_count to * prevent getting more releases than uses */ rc = atomic_xchg(&arm_state->ka_release_count, 0); uc = atomic_xchg(&arm_state->ka_use_count, 0); /* Call use/release service the requisite number of times. * Process use before release so use counts don't go negative */ while (uc--) { atomic_inc(&arm_state->ka_use_ack_count); status = vchiq_use_service(ka_handle); if (status != VCHIQ_SUCCESS) { vchiq_log_error(vchiq_susp_log_level, "%s vchiq_use_service error %d", __func__, status); } } while (rc--) { status = vchiq_release_service(ka_handle); if (status != VCHIQ_SUCCESS) { vchiq_log_error(vchiq_susp_log_level, "%s vchiq_release_service error %d", __func__, status); } } } shutdown: vchiq_shutdown(instance); exit: return 0; } VCHIQ_STATUS_T vchiq_arm_init_state(VCHIQ_STATE_T *state, VCHIQ_ARM_STATE_T *arm_state) { VCHIQ_STATUS_T status = VCHIQ_SUCCESS; if (arm_state) { rwlock_init(&arm_state->susp_res_lock); init_completion(&arm_state->ka_evt); atomic_set(&arm_state->ka_use_count, 0); atomic_set(&arm_state->ka_use_ack_count, 0); atomic_set(&arm_state->ka_release_count, 0); init_completion(&arm_state->vc_suspend_complete); init_completion(&arm_state->vc_resume_complete); /* Initialise to 'done' state. We only want to block on resume * completion while videocore is suspended. */ set_resume_state(arm_state, VC_RESUME_RESUMED); init_completion(&arm_state->resume_blocker); /* Initialise to 'done' state. We only want to block on this * completion while resume is blocked */ complete_all(&arm_state->resume_blocker); init_completion(&arm_state->blocked_blocker); /* Initialise to 'done' state. We only want to block on this * completion while things are waiting on the resume blocker */ complete_all(&arm_state->blocked_blocker); arm_state->suspend_timer_timeout = SUSPEND_TIMER_TIMEOUT_MS; arm_state->suspend_timer_running = 0; init_timer(&arm_state->suspend_timer); arm_state->suspend_timer.data = (unsigned long)(state); arm_state->suspend_timer.function = suspend_timer_callback; arm_state->first_connect = 0; } return status; } /* ** Functions to modify the state variables; ** set_suspend_state ** set_resume_state ** ** There are more state variables than we might like, so ensure they remain in ** step. Suspend and resume state are maintained separately, since most of ** these state machines can operate independently. However, there are a few ** states where state transitions in one state machine cause a reset to the ** other state machine. In addition, there are some completion events which ** need to occur on state machine reset and end-state(s), so these are also ** dealt with in these functions. ** ** In all states we set the state variable according to the input, but in some ** cases we perform additional steps outlined below; ** ** VC_SUSPEND_IDLE - Initialise the suspend completion at the same time. ** The suspend completion is completed after any suspend ** attempt. When we reset the state machine we also reset ** the completion. This reset occurs when videocore is ** resumed, and also if we initiate suspend after a suspend ** failure. ** ** VC_SUSPEND_IN_PROGRESS - This state is considered the point of no return for ** suspend - ie from this point on we must try to suspend ** before resuming can occur. We therefore also reset the ** resume state machine to VC_RESUME_IDLE in this state. ** ** VC_SUSPEND_SUSPENDED - Suspend has completed successfully. Also call ** complete_all on the suspend completion to notify ** anything waiting for suspend to happen. ** ** VC_SUSPEND_REJECTED - Videocore rejected suspend. Videocore will also ** initiate resume, so no need to alter resume state. ** We call complete_all on the suspend completion to notify ** of suspend rejection. ** ** VC_SUSPEND_FAILED - We failed to initiate videocore suspend. We notify the ** suspend completion and reset the resume state machine. ** ** VC_RESUME_IDLE - Initialise the resume completion at the same time. The -** resume completion is in it's 'done' state whenever +** resume completion is in its 'done' state whenever ** videcore is running. Therfore, the VC_RESUME_IDLE state ** implies that videocore is suspended. ** Hence, any thread which needs to wait until videocore is ** running can wait on this completion - it will only block ** if videocore is suspended. ** ** VC_RESUME_RESUMED - Resume has completed successfully. Videocore is running. ** Call complete_all on the resume completion to unblock ** any threads waiting for resume. Also reset the suspend ** state machine to it's idle state. ** ** VC_RESUME_FAILED - Currently unused - no mechanism to fail resume exists. */ void set_suspend_state(VCHIQ_ARM_STATE_T *arm_state, enum vc_suspend_status new_state) { /* set the state in all cases */ arm_state->vc_suspend_state = new_state; /* state specific additional actions */ switch (new_state) { case VC_SUSPEND_FORCE_CANCELED: complete_all(&arm_state->vc_suspend_complete); break; case VC_SUSPEND_REJECTED: complete_all(&arm_state->vc_suspend_complete); break; case VC_SUSPEND_FAILED: complete_all(&arm_state->vc_suspend_complete); arm_state->vc_resume_state = VC_RESUME_RESUMED; complete_all(&arm_state->vc_resume_complete); break; case VC_SUSPEND_IDLE: /* TODO: reinit_completion */ INIT_COMPLETION(arm_state->vc_suspend_complete); break; case VC_SUSPEND_REQUESTED: break; case VC_SUSPEND_IN_PROGRESS: set_resume_state(arm_state, VC_RESUME_IDLE); break; case VC_SUSPEND_SUSPENDED: complete_all(&arm_state->vc_suspend_complete); break; default: BUG(); break; } } void set_resume_state(VCHIQ_ARM_STATE_T *arm_state, enum vc_resume_status new_state) { /* set the state in all cases */ arm_state->vc_resume_state = new_state; /* state specific additional actions */ switch (new_state) { case VC_RESUME_FAILED: break; case VC_RESUME_IDLE: /* TODO: reinit_completion */ INIT_COMPLETION(arm_state->vc_resume_complete); break; case VC_RESUME_REQUESTED: break; case VC_RESUME_IN_PROGRESS: break; case VC_RESUME_RESUMED: complete_all(&arm_state->vc_resume_complete); set_suspend_state(arm_state, VC_SUSPEND_IDLE); break; default: BUG(); break; } } /* should be called with the write lock held */ inline void start_suspend_timer(VCHIQ_ARM_STATE_T *arm_state) { del_timer(&arm_state->suspend_timer); arm_state->suspend_timer.expires = jiffies + msecs_to_jiffies(arm_state-> suspend_timer_timeout); add_timer(&arm_state->suspend_timer); arm_state->suspend_timer_running = 1; } /* should be called with the write lock held */ static inline void stop_suspend_timer(VCHIQ_ARM_STATE_T *arm_state) { if (arm_state->suspend_timer_running) { del_timer(&arm_state->suspend_timer); arm_state->suspend_timer_running = 0; } } static inline int need_resume(VCHIQ_STATE_T *state) { VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); return (arm_state->vc_suspend_state > VC_SUSPEND_IDLE) && (arm_state->vc_resume_state < VC_RESUME_REQUESTED) && vchiq_videocore_wanted(state); } static int block_resume(VCHIQ_ARM_STATE_T *arm_state) { int status = VCHIQ_SUCCESS; const unsigned long timeout_val = msecs_to_jiffies(FORCE_SUSPEND_TIMEOUT_MS); int resume_count = 0; /* Allow any threads which were blocked by the last force suspend to * complete if they haven't already. Only give this one shot; if * blocked_count is incremented after blocked_blocker is completed * (which only happens when blocked_count hits 0) then those threads * will have to wait until next time around */ if (arm_state->blocked_count) { /* TODO: reinit_completion */ INIT_COMPLETION(arm_state->blocked_blocker); write_unlock_bh(&arm_state->susp_res_lock); vchiq_log_info(vchiq_susp_log_level, "%s wait for previously " "blocked clients", __func__); if (wait_for_completion_interruptible_timeout( &arm_state->blocked_blocker, timeout_val) <= 0) { vchiq_log_error(vchiq_susp_log_level, "%s wait for " "previously blocked clients failed" , __func__); status = VCHIQ_ERROR; write_lock_bh(&arm_state->susp_res_lock); goto out; } vchiq_log_info(vchiq_susp_log_level, "%s previously blocked " "clients resumed", __func__); write_lock_bh(&arm_state->susp_res_lock); } /* We need to wait for resume to complete if it's in process */ while (arm_state->vc_resume_state != VC_RESUME_RESUMED && arm_state->vc_resume_state > VC_RESUME_IDLE) { if (resume_count > 1) { status = VCHIQ_ERROR; vchiq_log_error(vchiq_susp_log_level, "%s waited too " "many times for resume" , __func__); goto out; } write_unlock_bh(&arm_state->susp_res_lock); vchiq_log_info(vchiq_susp_log_level, "%s wait for resume", __func__); if (wait_for_completion_interruptible_timeout( &arm_state->vc_resume_complete, timeout_val) <= 0) { vchiq_log_error(vchiq_susp_log_level, "%s wait for " "resume failed (%s)", __func__, resume_state_names[arm_state->vc_resume_state + VC_RESUME_NUM_OFFSET]); status = VCHIQ_ERROR; write_lock_bh(&arm_state->susp_res_lock); goto out; } vchiq_log_info(vchiq_susp_log_level, "%s resumed", __func__); write_lock_bh(&arm_state->susp_res_lock); resume_count++; } /* TODO: reinit_completion */ INIT_COMPLETION(arm_state->resume_blocker); arm_state->resume_blocked = 1; out: return status; } static inline void unblock_resume(VCHIQ_ARM_STATE_T *arm_state) { complete_all(&arm_state->resume_blocker); arm_state->resume_blocked = 0; } /* Initiate suspend via slot handler. Should be called with the write lock * held */ VCHIQ_STATUS_T vchiq_arm_vcsuspend(VCHIQ_STATE_T *state) { VCHIQ_STATUS_T status = VCHIQ_ERROR; VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); if (!arm_state) goto out; vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); status = VCHIQ_SUCCESS; switch (arm_state->vc_suspend_state) { case VC_SUSPEND_REQUESTED: vchiq_log_info(vchiq_susp_log_level, "%s: suspend already " "requested", __func__); break; case VC_SUSPEND_IN_PROGRESS: vchiq_log_info(vchiq_susp_log_level, "%s: suspend already in " "progress", __func__); break; default: /* We don't expect to be in other states, so log but continue * anyway */ vchiq_log_error(vchiq_susp_log_level, "%s unexpected suspend state %s", __func__, suspend_state_names[arm_state->vc_suspend_state + VC_SUSPEND_NUM_OFFSET]); /* fall through */ case VC_SUSPEND_REJECTED: case VC_SUSPEND_FAILED: /* Ensure any idle state actions have been run */ set_suspend_state(arm_state, VC_SUSPEND_IDLE); /* fall through */ case VC_SUSPEND_IDLE: vchiq_log_info(vchiq_susp_log_level, "%s: suspending", __func__); set_suspend_state(arm_state, VC_SUSPEND_REQUESTED); /* kick the slot handler thread to initiate suspend */ request_poll(state, NULL, 0); break; } out: vchiq_log_trace(vchiq_susp_log_level, "%s exit %d", __func__, status); return status; } void vchiq_platform_check_suspend(VCHIQ_STATE_T *state) { VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); int susp = 0; if (!arm_state) goto out; vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); write_lock_bh(&arm_state->susp_res_lock); if (arm_state->vc_suspend_state == VC_SUSPEND_REQUESTED && arm_state->vc_resume_state == VC_RESUME_RESUMED) { set_suspend_state(arm_state, VC_SUSPEND_IN_PROGRESS); susp = 1; } write_unlock_bh(&arm_state->susp_res_lock); if (susp) vchiq_platform_suspend(state); out: vchiq_log_trace(vchiq_susp_log_level, "%s exit", __func__); return; } static void output_timeout_error(VCHIQ_STATE_T *state) { VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); char service_err[50] = ""; int vc_use_count = arm_state->videocore_use_count; int active_services = state->unused_service; int i; if (!arm_state->videocore_use_count) { snprintf(service_err, 50, " Videocore usecount is 0"); goto output_msg; } for (i = 0; i < active_services; i++) { VCHIQ_SERVICE_T *service_ptr = state->services[i]; if (service_ptr && service_ptr->service_use_count && (service_ptr->srvstate != VCHIQ_SRVSTATE_FREE)) { snprintf(service_err, 50, " %c%c%c%c(%8x) service has " "use count %d%s", VCHIQ_FOURCC_AS_4CHARS( service_ptr->base.fourcc), service_ptr->client_id, service_ptr->service_use_count, service_ptr->service_use_count == vc_use_count ? "" : " (+ more)"); break; } } output_msg: vchiq_log_error(vchiq_susp_log_level, "timed out waiting for vc suspend (%d).%s", arm_state->autosuspend_override, service_err); } /* Try to get videocore into suspended state, regardless of autosuspend state. ** We don't actually force suspend, since videocore may get into a bad state ** if we force suspend at a bad time. Instead, we wait for autosuspend to ** determine a good point to suspend. If this doesn't happen within 100ms we ** report failure. ** ** Returns VCHIQ_SUCCESS if videocore suspended successfully, VCHIQ_RETRY if ** videocore failed to suspend in time or VCHIQ_ERROR if interrupted. */ VCHIQ_STATUS_T vchiq_arm_force_suspend(VCHIQ_STATE_T *state) { VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); VCHIQ_STATUS_T status = VCHIQ_ERROR; long rc = 0; int repeat = -1; if (!arm_state) goto out; vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); write_lock_bh(&arm_state->susp_res_lock); status = block_resume(arm_state); if (status != VCHIQ_SUCCESS) goto unlock; if (arm_state->vc_suspend_state == VC_SUSPEND_SUSPENDED) { /* Already suspended - just block resume and exit */ vchiq_log_info(vchiq_susp_log_level, "%s already suspended", __func__); status = VCHIQ_SUCCESS; goto unlock; } else if (arm_state->vc_suspend_state <= VC_SUSPEND_IDLE) { /* initiate suspend immediately in the case that we're waiting * for the timeout */ stop_suspend_timer(arm_state); if (!vchiq_videocore_wanted(state)) { vchiq_log_info(vchiq_susp_log_level, "%s videocore " "idle, initiating suspend", __func__); status = vchiq_arm_vcsuspend(state); } else if (arm_state->autosuspend_override < FORCE_SUSPEND_FAIL_MAX) { vchiq_log_info(vchiq_susp_log_level, "%s letting " "videocore go idle", __func__); status = VCHIQ_SUCCESS; } else { vchiq_log_warning(vchiq_susp_log_level, "%s failed too " "many times - attempting suspend", __func__); status = vchiq_arm_vcsuspend(state); } } else { vchiq_log_info(vchiq_susp_log_level, "%s videocore suspend " "in progress - wait for completion", __func__); status = VCHIQ_SUCCESS; } /* Wait for suspend to happen due to system idle (not forced..) */ if (status != VCHIQ_SUCCESS) goto unblock_resume; do { write_unlock_bh(&arm_state->susp_res_lock); rc = wait_for_completion_interruptible_timeout( &arm_state->vc_suspend_complete, msecs_to_jiffies(FORCE_SUSPEND_TIMEOUT_MS)); write_lock_bh(&arm_state->susp_res_lock); if (rc < 0) { vchiq_log_warning(vchiq_susp_log_level, "%s " "interrupted waiting for suspend", __func__); status = VCHIQ_ERROR; goto unblock_resume; } else if (rc == 0) { if (arm_state->vc_suspend_state > VC_SUSPEND_IDLE) { /* Repeat timeout once if in progress */ if (repeat < 0) { repeat = 1; continue; } } arm_state->autosuspend_override++; output_timeout_error(state); status = VCHIQ_RETRY; goto unblock_resume; } } while (0 < (repeat--)); /* Check and report state in case we need to abort ARM suspend */ if (arm_state->vc_suspend_state != VC_SUSPEND_SUSPENDED) { status = VCHIQ_RETRY; vchiq_log_error(vchiq_susp_log_level, "%s videocore suspend failed (state %s)", __func__, suspend_state_names[arm_state->vc_suspend_state + VC_SUSPEND_NUM_OFFSET]); /* Reset the state only if it's still in an error state. * Something could have already initiated another suspend. */ if (arm_state->vc_suspend_state < VC_SUSPEND_IDLE) set_suspend_state(arm_state, VC_SUSPEND_IDLE); goto unblock_resume; } /* successfully suspended - unlock and exit */ goto unlock; unblock_resume: /* all error states need to unblock resume before exit */ unblock_resume(arm_state); unlock: write_unlock_bh(&arm_state->susp_res_lock); out: vchiq_log_trace(vchiq_susp_log_level, "%s exit %d", __func__, status); return status; } void vchiq_check_suspend(VCHIQ_STATE_T *state) { VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); if (!arm_state) goto out; vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); write_lock_bh(&arm_state->susp_res_lock); if (arm_state->vc_suspend_state != VC_SUSPEND_SUSPENDED && arm_state->first_connect && !vchiq_videocore_wanted(state)) { vchiq_arm_vcsuspend(state); } write_unlock_bh(&arm_state->susp_res_lock); out: vchiq_log_trace(vchiq_susp_log_level, "%s exit", __func__); return; } int vchiq_arm_allow_resume(VCHIQ_STATE_T *state) { VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); int resume = 0; int ret = -1; if (!arm_state) goto out; vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); write_lock_bh(&arm_state->susp_res_lock); unblock_resume(arm_state); resume = vchiq_check_resume(state); write_unlock_bh(&arm_state->susp_res_lock); if (resume) { if (wait_for_completion_interruptible( &arm_state->vc_resume_complete) < 0) { vchiq_log_error(vchiq_susp_log_level, "%s interrupted", __func__); /* failed, cannot accurately derive suspend * state, so exit early. */ goto out; } } read_lock_bh(&arm_state->susp_res_lock); if (arm_state->vc_suspend_state == VC_SUSPEND_SUSPENDED) { vchiq_log_info(vchiq_susp_log_level, "%s: Videocore remains suspended", __func__); } else { vchiq_log_info(vchiq_susp_log_level, "%s: Videocore resumed", __func__); ret = 0; } read_unlock_bh(&arm_state->susp_res_lock); out: vchiq_log_trace(vchiq_susp_log_level, "%s exit %d", __func__, ret); return ret; } /* This function should be called with the write lock held */ int vchiq_check_resume(VCHIQ_STATE_T *state) { VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); int resume = 0; if (!arm_state) goto out; vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); if (need_resume(state)) { set_resume_state(arm_state, VC_RESUME_REQUESTED); request_poll(state, NULL, 0); resume = 1; } out: vchiq_log_trace(vchiq_susp_log_level, "%s exit", __func__); return resume; } #ifdef notyet void vchiq_platform_check_resume(VCHIQ_STATE_T *state) { VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); int res = 0; if (!arm_state) goto out; vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); write_lock_bh(&arm_state->susp_res_lock); if (arm_state->wake_address == 0) { vchiq_log_info(vchiq_susp_log_level, "%s: already awake", __func__); goto unlock; } if (arm_state->vc_resume_state == VC_RESUME_IN_PROGRESS) { vchiq_log_info(vchiq_susp_log_level, "%s: already resuming", __func__); goto unlock; } if (arm_state->vc_resume_state == VC_RESUME_REQUESTED) { set_resume_state(arm_state, VC_RESUME_IN_PROGRESS); res = 1; } else vchiq_log_trace(vchiq_susp_log_level, "%s: not resuming (resume state %s)", __func__, resume_state_names[arm_state->vc_resume_state + VC_RESUME_NUM_OFFSET]); unlock: write_unlock_bh(&arm_state->susp_res_lock); if (res) vchiq_platform_resume(state); out: vchiq_log_trace(vchiq_susp_log_level, "%s exit", __func__); return; } #endif VCHIQ_STATUS_T vchiq_use_internal(VCHIQ_STATE_T *state, VCHIQ_SERVICE_T *service, enum USE_TYPE_E use_type) { VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); VCHIQ_STATUS_T ret = VCHIQ_SUCCESS; char entity[16]; int *entity_uc; int local_uc, local_entity_uc; if (!arm_state) goto out; vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); if (use_type == USE_TYPE_VCHIQ) { snprintf(entity, sizeof(entity), "VCHIQ: "); entity_uc = &arm_state->peer_use_count; } else if (service) { snprintf(entity, sizeof(entity), "%c%c%c%c:%8x", VCHIQ_FOURCC_AS_4CHARS(service->base.fourcc), service->client_id); entity_uc = &service->service_use_count; } else { vchiq_log_error(vchiq_susp_log_level, "%s null service " "ptr", __func__); ret = VCHIQ_ERROR; goto out; } write_lock_bh(&arm_state->susp_res_lock); while (arm_state->resume_blocked) { /* If we call 'use' while force suspend is waiting for suspend, * then we're about to block the thread which the force is * waiting to complete, so we're bound to just time out. In this * case, set the suspend state such that the wait will be * canceled, so we can complete as quickly as possible. */ if (arm_state->resume_blocked && arm_state->vc_suspend_state == VC_SUSPEND_IDLE) { set_suspend_state(arm_state, VC_SUSPEND_FORCE_CANCELED); break; } /* If suspend is already in progress then we need to block */ if (!try_wait_for_completion(&arm_state->resume_blocker)) { /* Indicate that there are threads waiting on the resume * blocker. These need to be allowed to complete before * a _second_ call to force suspend can complete, * otherwise low priority threads might never actually * continue */ arm_state->blocked_count++; write_unlock_bh(&arm_state->susp_res_lock); vchiq_log_info(vchiq_susp_log_level, "%s %s resume " "blocked - waiting...", __func__, entity); if (wait_for_completion_killable( &arm_state->resume_blocker) != 0) { vchiq_log_error(vchiq_susp_log_level, "%s %s " "wait for resume blocker interrupted", __func__, entity); ret = VCHIQ_ERROR; write_lock_bh(&arm_state->susp_res_lock); arm_state->blocked_count--; write_unlock_bh(&arm_state->susp_res_lock); goto out; } vchiq_log_info(vchiq_susp_log_level, "%s %s resume " "unblocked", __func__, entity); write_lock_bh(&arm_state->susp_res_lock); if (--arm_state->blocked_count == 0) complete_all(&arm_state->blocked_blocker); } } stop_suspend_timer(arm_state); local_uc = ++arm_state->videocore_use_count; local_entity_uc = ++(*entity_uc); /* If there's a pending request which hasn't yet been serviced then * just clear it. If we're past VC_SUSPEND_REQUESTED state then * vc_resume_complete will block until we either resume or fail to * suspend */ if (arm_state->vc_suspend_state <= VC_SUSPEND_REQUESTED) set_suspend_state(arm_state, VC_SUSPEND_IDLE); if ((use_type != USE_TYPE_SERVICE_NO_RESUME) && need_resume(state)) { set_resume_state(arm_state, VC_RESUME_REQUESTED); vchiq_log_info(vchiq_susp_log_level, "%s %s count %d, state count %d", __func__, entity, local_entity_uc, local_uc); request_poll(state, NULL, 0); } else vchiq_log_trace(vchiq_susp_log_level, "%s %s count %d, state count %d", __func__, entity, *entity_uc, local_uc); write_unlock_bh(&arm_state->susp_res_lock); /* Completion is in a done state when we're not suspended, so this won't * block for the non-suspended case. */ if (!try_wait_for_completion(&arm_state->vc_resume_complete)) { vchiq_log_info(vchiq_susp_log_level, "%s %s wait for resume", __func__, entity); if (wait_for_completion_killable( &arm_state->vc_resume_complete) != 0) { vchiq_log_error(vchiq_susp_log_level, "%s %s wait for " "resume interrupted", __func__, entity); ret = VCHIQ_ERROR; goto out; } vchiq_log_info(vchiq_susp_log_level, "%s %s resumed", __func__, entity); } if (ret == VCHIQ_SUCCESS) { VCHIQ_STATUS_T status = VCHIQ_SUCCESS; long ack_cnt = atomic_xchg(&arm_state->ka_use_ack_count, 0); while (ack_cnt && (status == VCHIQ_SUCCESS)) { /* Send the use notify to videocore */ status = vchiq_send_remote_use_active(state); if (status == VCHIQ_SUCCESS) ack_cnt--; else atomic_add(ack_cnt, &arm_state->ka_use_ack_count); } } out: vchiq_log_trace(vchiq_susp_log_level, "%s exit %d", __func__, ret); return ret; } VCHIQ_STATUS_T vchiq_release_internal(VCHIQ_STATE_T *state, VCHIQ_SERVICE_T *service) { VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); VCHIQ_STATUS_T ret = VCHIQ_SUCCESS; char entity[16]; int *entity_uc; if (!arm_state) goto out; vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); if (service) { snprintf(entity, sizeof(entity), "%c%c%c%c:%8x", VCHIQ_FOURCC_AS_4CHARS(service->base.fourcc), service->client_id); entity_uc = &service->service_use_count; } else { snprintf(entity, sizeof(entity), "PEER: "); entity_uc = &arm_state->peer_use_count; } write_lock_bh(&arm_state->susp_res_lock); if (!arm_state->videocore_use_count || !(*entity_uc)) { /* Don't use BUG_ON - don't allow user thread to crash kernel */ WARN_ON(!arm_state->videocore_use_count); WARN_ON(!(*entity_uc)); ret = VCHIQ_ERROR; goto unlock; } --arm_state->videocore_use_count; --(*entity_uc); if (!vchiq_videocore_wanted(state)) { if (vchiq_platform_use_suspend_timer() && !arm_state->resume_blocked) { /* Only use the timer if we're not trying to force * suspend (=> resume_blocked) */ start_suspend_timer(arm_state); } else { vchiq_log_info(vchiq_susp_log_level, "%s %s count %d, state count %d - suspending", __func__, entity, *entity_uc, arm_state->videocore_use_count); vchiq_arm_vcsuspend(state); } } else vchiq_log_trace(vchiq_susp_log_level, "%s %s count %d, state count %d", __func__, entity, *entity_uc, arm_state->videocore_use_count); unlock: write_unlock_bh(&arm_state->susp_res_lock); out: vchiq_log_trace(vchiq_susp_log_level, "%s exit %d", __func__, ret); return ret; } void vchiq_on_remote_use(VCHIQ_STATE_T *state) { VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); atomic_inc(&arm_state->ka_use_count); complete(&arm_state->ka_evt); } void vchiq_on_remote_release(VCHIQ_STATE_T *state) { VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); atomic_inc(&arm_state->ka_release_count); complete(&arm_state->ka_evt); } VCHIQ_STATUS_T vchiq_use_service_internal(VCHIQ_SERVICE_T *service) { return vchiq_use_internal(service->state, service, USE_TYPE_SERVICE); } VCHIQ_STATUS_T vchiq_release_service_internal(VCHIQ_SERVICE_T *service) { return vchiq_release_internal(service->state, service); } static void suspend_timer_callback(unsigned long context) { VCHIQ_STATE_T *state = (VCHIQ_STATE_T *)context; VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); if (!arm_state) goto out; vchiq_log_info(vchiq_susp_log_level, "%s - suspend timer expired - check suspend", __func__); vchiq_check_suspend(state); out: return; } VCHIQ_STATUS_T vchiq_use_service_no_resume(VCHIQ_SERVICE_HANDLE_T handle) { VCHIQ_STATUS_T ret = VCHIQ_ERROR; VCHIQ_SERVICE_T *service = find_service_by_handle(handle); if (service) { ret = vchiq_use_internal(service->state, service, USE_TYPE_SERVICE_NO_RESUME); unlock_service(service); } return ret; } VCHIQ_STATUS_T vchiq_use_service(VCHIQ_SERVICE_HANDLE_T handle) { VCHIQ_STATUS_T ret = VCHIQ_ERROR; VCHIQ_SERVICE_T *service = find_service_by_handle(handle); if (service) { ret = vchiq_use_internal(service->state, service, USE_TYPE_SERVICE); unlock_service(service); } return ret; } VCHIQ_STATUS_T vchiq_release_service(VCHIQ_SERVICE_HANDLE_T handle) { VCHIQ_STATUS_T ret = VCHIQ_ERROR; VCHIQ_SERVICE_T *service = find_service_by_handle(handle); if (service) { ret = vchiq_release_internal(service->state, service); unlock_service(service); } return ret; } void vchiq_dump_service_use_state(VCHIQ_STATE_T *state) { VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); int i, j = 0; /* Only dump 64 services */ static const int local_max_services = 64; /* If there's more than 64 services, only dump ones with * non-zero counts */ int only_nonzero = 0; static const char *nz = "<-- preventing suspend"; enum vc_suspend_status vc_suspend_state; enum vc_resume_status vc_resume_state; int peer_count; int vc_use_count; int active_services; struct service_data_struct { int fourcc; int clientid; int use_count; } service_data[local_max_services]; if (!arm_state) return; read_lock_bh(&arm_state->susp_res_lock); vc_suspend_state = arm_state->vc_suspend_state; vc_resume_state = arm_state->vc_resume_state; peer_count = arm_state->peer_use_count; vc_use_count = arm_state->videocore_use_count; active_services = state->unused_service; if (active_services > local_max_services) only_nonzero = 1; for (i = 0; (i < active_services) && (j < local_max_services); i++) { VCHIQ_SERVICE_T *service_ptr = state->services[i]; if (!service_ptr) continue; if (only_nonzero && !service_ptr->service_use_count) continue; if (service_ptr->srvstate != VCHIQ_SRVSTATE_FREE) { service_data[j].fourcc = service_ptr->base.fourcc; service_data[j].clientid = service_ptr->client_id; service_data[j++].use_count = service_ptr-> service_use_count; } } read_unlock_bh(&arm_state->susp_res_lock); vchiq_log_warning(vchiq_susp_log_level, "-- Videcore suspend state: %s --", suspend_state_names[vc_suspend_state + VC_SUSPEND_NUM_OFFSET]); vchiq_log_warning(vchiq_susp_log_level, "-- Videcore resume state: %s --", resume_state_names[vc_resume_state + VC_RESUME_NUM_OFFSET]); if (only_nonzero) vchiq_log_warning(vchiq_susp_log_level, "Too many active " "services (%d). Only dumping up to first %d services " "with non-zero use-count", active_services, local_max_services); for (i = 0; i < j; i++) { vchiq_log_warning(vchiq_susp_log_level, "----- %c%c%c%c:%d service count %d %s", VCHIQ_FOURCC_AS_4CHARS(service_data[i].fourcc), service_data[i].clientid, service_data[i].use_count, service_data[i].use_count ? nz : ""); } vchiq_log_warning(vchiq_susp_log_level, "----- VCHIQ use count count %d", peer_count); vchiq_log_warning(vchiq_susp_log_level, "--- Overall vchiq instance use count %d", vc_use_count); vchiq_dump_platform_use_state(state); } VCHIQ_STATUS_T vchiq_check_service(VCHIQ_SERVICE_T *service) { VCHIQ_ARM_STATE_T *arm_state; VCHIQ_STATUS_T ret = VCHIQ_ERROR; if (!service || !service->state) goto out; vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); arm_state = vchiq_platform_get_arm_state(service->state); read_lock_bh(&arm_state->susp_res_lock); if (service->service_use_count) ret = VCHIQ_SUCCESS; read_unlock_bh(&arm_state->susp_res_lock); if (ret == VCHIQ_ERROR) { vchiq_log_error(vchiq_susp_log_level, "%s ERROR - %c%c%c%c:%8x service count %d, " "state count %d, videocore suspend state %s", __func__, VCHIQ_FOURCC_AS_4CHARS(service->base.fourcc), service->client_id, service->service_use_count, arm_state->videocore_use_count, suspend_state_names[arm_state->vc_suspend_state + VC_SUSPEND_NUM_OFFSET]); vchiq_dump_service_use_state(service->state); } out: return ret; } /* stub functions */ void vchiq_on_remote_use_active(VCHIQ_STATE_T *state) { (void)state; } void vchiq_platform_conn_state_changed(VCHIQ_STATE_T *state, VCHIQ_CONNSTATE_T oldstate, VCHIQ_CONNSTATE_T newstate) { VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); vchiq_log_info(vchiq_susp_log_level, "%d: %s->%s", state->id, get_conn_state_name(oldstate), get_conn_state_name(newstate)); if (state->conn_state == VCHIQ_CONNSTATE_CONNECTED) { write_lock_bh(&arm_state->susp_res_lock); if (!arm_state->first_connect) { char threadname[10]; arm_state->first_connect = 1; write_unlock_bh(&arm_state->susp_res_lock); snprintf(threadname, sizeof(threadname), "VCHIQka-%d", state->id); arm_state->ka_thread = vchiq_thread_create( &vchiq_keepalive_thread_func, (void *)state, threadname); if (arm_state->ka_thread == NULL) { vchiq_log_error(vchiq_susp_log_level, "vchiq: FATAL: couldn't create thread %s", threadname); } else { wake_up_process(arm_state->ka_thread); } } else write_unlock_bh(&arm_state->susp_res_lock); } } /**************************************************************************** * * vchiq_init - called when the module is loaded. * ***************************************************************************/ int __init vchiq_init(void); int __init vchiq_init(void) { int err; #ifdef notyet /* create proc entries */ err = vchiq_proc_init(); if (err != 0) goto failed_proc_init; #endif vchiq_cdev = make_dev(&vchiq_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "vchiq"); if (!vchiq_cdev) { printf("Failed to create /dev/vchiq"); return (-ENXIO); } spin_lock_init(&msg_queue_spinlock); err = vchiq_platform_init(&g_state); if (err != 0) goto failed_platform_init; vchiq_log_info(vchiq_arm_log_level, "vchiq: initialised - version %d (min %d)", VCHIQ_VERSION, VCHIQ_VERSION_MIN); return 0; failed_platform_init: if (vchiq_cdev) { destroy_dev(vchiq_cdev); vchiq_cdev = NULL; } vchiq_log_warning(vchiq_arm_log_level, "could not load vchiq"); return err; } #ifdef notyet static int vchiq_instance_get_use_count(VCHIQ_INSTANCE_T instance) { VCHIQ_SERVICE_T *service; int use_count = 0, i; i = 0; while ((service = next_service_by_instance(instance->state, instance, &i)) != NULL) { use_count += service->service_use_count; unlock_service(service); } return use_count; } /* read the per-process use-count */ static int proc_read_use_count(char *page, char **start, off_t off, int count, int *eof, void *data) { VCHIQ_INSTANCE_T instance = data; int len, use_count; use_count = vchiq_instance_get_use_count(instance); len = snprintf(page+off, count, "%d\n", use_count); return len; } /* add an instance (process) to the proc entries */ static int vchiq_proc_add_instance(VCHIQ_INSTANCE_T instance) { char pidstr[32]; struct proc_dir_entry *top, *use_count; struct proc_dir_entry *clients = vchiq_clients_top(); int pid = instance->pid; snprintf(pidstr, sizeof(pidstr), "%d", pid); top = proc_mkdir(pidstr, clients); if (!top) goto fail_top; use_count = create_proc_read_entry("use_count", 0444, top, proc_read_use_count, instance); if (!use_count) goto fail_use_count; instance->proc_entry = top; return 0; fail_use_count: remove_proc_entry(top->name, clients); fail_top: return -ENOMEM; } static void vchiq_proc_remove_instance(VCHIQ_INSTANCE_T instance) { struct proc_dir_entry *clients = vchiq_clients_top(); remove_proc_entry("use_count", instance->proc_entry); remove_proc_entry(instance->proc_entry->name, clients); } #endif /**************************************************************************** * * vchiq_exit - called when the module is unloaded. * ***************************************************************************/ void vchiq_exit(void); void vchiq_exit(void) { if (vchiq_ehtag == NULL) EVENTHANDLER_DEREGISTER(dev_clone, vchiq_ehtag); vchiq_ehtag = NULL; vchiq_platform_exit(&g_state); if (vchiq_cdev) { destroy_dev(vchiq_cdev); vchiq_cdev = NULL; } } Index: head/sys/dev/aic7xxx/aic7xxx.seq =================================================================== --- head/sys/dev/aic7xxx/aic7xxx.seq (revision 308456) +++ head/sys/dev/aic7xxx/aic7xxx.seq (revision 308457) @@ -1,2403 +1,2403 @@ /*- * Adaptec 274x/284x/294x device driver firmware for Linux and FreeBSD. * * Copyright (c) 1994-2001 Justin T. Gibbs. * Copyright (c) 2000-2001 Adaptec Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * 3. Neither the names of the above-listed copyright holders nor the names * of any contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * $FreeBSD$ */ VERSION = "$Id: //depot/aic7xxx/aic7xxx/aic7xxx.seq#58 $" PATCH_ARG_LIST = "struct ahc_softc *ahc" PREFIX = "ahc_" #include "aic7xxx.reg" #include "scsi_message.h" /* * A few words on the waiting SCB list: * After starting the selection hardware, we check for reconnecting targets * as well as for our selection to complete just in case the reselection wins * bus arbitration. The problem with this is that we must keep track of the * SCB that we've already pulled from the QINFIFO and started the selection * on just in case the reselection wins so that we can retry the selection at * a later time. This problem cannot be resolved by holding a single entry * in scratch ram since a reconnecting target can request sense and this will * create yet another SCB waiting for selection. The solution used here is to * use byte 27 of the SCB as a psuedo-next pointer and to thread a list * of SCBs that are awaiting selection. Since 0-0xfe are valid SCB indexes, * SCB_LIST_NULL is 0xff which is out of range. An entry is also added to * this list every time a request sense occurs or after completing a non-tagged * command for which a second SCB has been queued. The sequencer will * automatically consume the entries. */ bus_free_sel: /* * Turn off the selection hardware. We need to reset the * selection request in order to perform a new selection. */ and SCSISEQ, TEMODE|ENSELI|ENRSELI|ENAUTOATNP; and SIMODE1, ~ENBUSFREE; poll_for_work: call clear_target_state; and SXFRCTL0, ~SPIOEN; if ((ahc->features & AHC_ULTRA2) != 0) { clr SCSIBUSL; } test SCSISEQ, ENSELO jnz poll_for_selection; if ((ahc->features & AHC_TWIN) != 0) { xor SBLKCTL,SELBUSB; /* Toggle to the other bus */ test SCSISEQ, ENSELO jnz poll_for_selection; } BEGIN_CRITICAL; cmp WAITING_SCBH,SCB_LIST_NULL jne start_waiting; END_CRITICAL; poll_for_work_loop: if ((ahc->features & AHC_TWIN) != 0) { xor SBLKCTL,SELBUSB; /* Toggle to the other bus */ } test SSTAT0, SELDO|SELDI jnz selection; test_queue: /* Has the driver posted any work for us? */ BEGIN_CRITICAL; if ((ahc->features & AHC_QUEUE_REGS) != 0) { test QOFF_CTLSTA, SCB_AVAIL jz poll_for_work_loop; } else { mov A, QINPOS; cmp KERNEL_QINPOS, A je poll_for_work_loop; } mov ARG_1, NEXT_QUEUED_SCB; /* * We have at least one queued SCB now and we don't have any * SCBs in the list of SCBs awaiting selection. Allocate a * card SCB for the host's SCB and get to work on it. */ if ((ahc->flags & AHC_PAGESCBS) != 0) { mov ALLZEROS call get_free_or_disc_scb; } else { /* In the non-paging case, the SCBID == hardware SCB index */ mov SCBPTR, ARG_1; } or SEQ_FLAGS2, SCB_DMA; END_CRITICAL; dma_queued_scb: /* * DMA the SCB from host ram into the current SCB location. */ mvi DMAPARAMS, HDMAEN|DIRECTION|FIFORESET; mov ARG_1 call dma_scb; /* * Check one last time to see if this SCB was canceled * before we completed the DMA operation. If it was, * the QINFIFO next pointer will not match our saved * value. */ mov A, ARG_1; BEGIN_CRITICAL; cmp NEXT_QUEUED_SCB, A jne abort_qinscb; if ((ahc->flags & AHC_SEQUENCER_DEBUG) != 0) { cmp SCB_TAG, A je . + 2; mvi SCB_MISMATCH call set_seqint; } mov NEXT_QUEUED_SCB, SCB_NEXT; mov SCB_NEXT,WAITING_SCBH; mov WAITING_SCBH, SCBPTR; if ((ahc->features & AHC_QUEUE_REGS) != 0) { mov NONE, SNSCB_QOFF; } else { inc QINPOS; } and SEQ_FLAGS2, ~SCB_DMA; start_waiting: /* * Start the first entry on the waiting SCB list. */ mov SCBPTR, WAITING_SCBH; call start_selection; END_CRITICAL; poll_for_selection: /* * Twin channel devices cannot handle things like SELTO * interrupts on the "background" channel. So, while * selecting, keep polling the current channel until * either a selection or reselection occurs. */ test SSTAT0, SELDO|SELDI jz poll_for_selection; selection: /* * We aren't expecting a bus free, so interrupt * the kernel driver if it happens. */ mvi CLRSINT1,CLRBUSFREE; if ((ahc->features & AHC_DT) == 0) { or SIMODE1, ENBUSFREE; } /* * Guard against a bus free after (re)selection * but prior to enabling the busfree interrupt. SELDI * and SELDO will be cleared in that case. */ test SSTAT0, SELDI|SELDO jz bus_free_sel; test SSTAT0,SELDO jnz select_out; select_in: if ((ahc->flags & AHC_TARGETROLE) != 0) { if ((ahc->flags & AHC_INITIATORROLE) != 0) { test SSTAT0, TARGET jz initiator_reselect; } mvi CLRSINT0, CLRSELDI; /* * We've just been selected. Assert BSY and * setup the phase for receiving messages * from the target. */ mvi SCSISIGO, P_MESGOUT|BSYO; /* * Setup the DMA for sending the identify and * command information. */ mvi SEQ_FLAGS, CMDPHASE_PENDING; mov A, TQINPOS; if ((ahc->features & AHC_CMD_CHAN) != 0) { mvi DINDEX, CCHADDR; mvi SHARED_DATA_ADDR call set_32byte_addr; mvi CCSCBCTL, CCSCBRESET; } else { mvi DINDEX, HADDR; mvi SHARED_DATA_ADDR call set_32byte_addr; mvi DFCNTRL, FIFORESET; } /* Initiator that selected us */ and SAVED_SCSIID, SELID_MASK, SELID; /* The Target ID we were selected at */ if ((ahc->features & AHC_MULTI_TID) != 0) { and A, OID, TARGIDIN; } else if ((ahc->features & AHC_ULTRA2) != 0) { and A, OID, SCSIID_ULTRA2; } else { and A, OID, SCSIID; } or SAVED_SCSIID, A; if ((ahc->features & AHC_TWIN) != 0) { test SBLKCTL, SELBUSB jz . + 2; or SAVED_SCSIID, TWIN_CHNLB; } if ((ahc->features & AHC_CMD_CHAN) != 0) { mov CCSCBRAM, SAVED_SCSIID; } else { mov DFDAT, SAVED_SCSIID; } /* * If ATN isn't asserted, the target isn't interested * in talking to us. Go directly to bus free. * XXX SCSI-1 may require us to assume lun 0 if * ATN is false. */ test SCSISIGI, ATNI jz target_busfree; /* * Watch ATN closely now as we pull in messages from the * initiator. We follow the guidlines from section 6.5 * of the SCSI-2 spec for what messages are allowed when. */ call target_inb; /* * Our first message must be one of IDENTIFY, ABORT, or * BUS_DEVICE_RESET. */ test DINDEX, MSG_IDENTIFYFLAG jz host_target_message_loop; /* Store for host */ if ((ahc->features & AHC_CMD_CHAN) != 0) { mov CCSCBRAM, DINDEX; } else { mov DFDAT, DINDEX; } and SAVED_LUN, MSG_IDENTIFY_LUNMASK, DINDEX; /* Remember for disconnection decision */ test DINDEX, MSG_IDENTIFY_DISCFLAG jnz . + 2; /* XXX Honor per target settings too */ or SEQ_FLAGS, NO_DISCONNECT; test SCSISIGI, ATNI jz ident_messages_done; call target_inb; /* * If this is a tagged request, the tagged message must * immediately follow the identify. We test for a valid * tag message by seeing if it is >= MSG_SIMPLE_Q_TAG and * < MSG_IGN_WIDE_RESIDUE. */ add A, -MSG_SIMPLE_Q_TAG, DINDEX; jnc ident_messages_done_msg_pending; add A, -MSG_IGN_WIDE_RESIDUE, DINDEX; jc ident_messages_done_msg_pending; /* Store for host */ if ((ahc->features & AHC_CMD_CHAN) != 0) { mov CCSCBRAM, DINDEX; } else { mov DFDAT, DINDEX; } /* * If the initiator doesn't feel like providing a tag number, * we've got a failed selection and must transition to bus * free. */ test SCSISIGI, ATNI jz target_busfree; /* * Store the tag for the host. */ call target_inb; if ((ahc->features & AHC_CMD_CHAN) != 0) { mov CCSCBRAM, DINDEX; } else { mov DFDAT, DINDEX; } mov INITIATOR_TAG, DINDEX; or SEQ_FLAGS, TARGET_CMD_IS_TAGGED; ident_messages_done: /* Terminate the ident list */ if ((ahc->features & AHC_CMD_CHAN) != 0) { mvi CCSCBRAM, SCB_LIST_NULL; } else { mvi DFDAT, SCB_LIST_NULL; } or SEQ_FLAGS, TARG_CMD_PENDING; test SEQ_FLAGS2, TARGET_MSG_PENDING jnz target_mesgout_pending; test SCSISIGI, ATNI jnz target_mesgout_continue; jmp target_ITloop; ident_messages_done_msg_pending: or SEQ_FLAGS2, TARGET_MSG_PENDING; jmp ident_messages_done; /* * Pushed message loop to allow the kernel to * run it's own target mode message state engine. */ host_target_message_loop: mvi HOST_MSG_LOOP call set_seqint; cmp RETURN_1, EXIT_MSG_LOOP je target_ITloop; test SSTAT0, SPIORDY jz .; jmp host_target_message_loop; } if ((ahc->flags & AHC_INITIATORROLE) != 0) { /* * Reselection has been initiated by a target. Make a note that we've been * reselected, but haven't seen an IDENTIFY message from the target yet. */ initiator_reselect: /* XXX test for and handle ONE BIT condition */ or SXFRCTL0, SPIOEN|CLRSTCNT|CLRCHN; and SAVED_SCSIID, SELID_MASK, SELID; if ((ahc->features & AHC_ULTRA2) != 0) { and A, OID, SCSIID_ULTRA2; } else { and A, OID, SCSIID; } or SAVED_SCSIID, A; if ((ahc->features & AHC_TWIN) != 0) { test SBLKCTL, SELBUSB jz . + 2; or SAVED_SCSIID, TWIN_CHNLB; } mvi CLRSINT0, CLRSELDI; jmp ITloop; } abort_qinscb: call add_scb_to_free_list; jmp poll_for_work_loop; BEGIN_CRITICAL; start_selection: /* * If bus reset interrupts have been disabled (from a previous * reset), re-enable them now. Resets are only of interest * when we have outstanding transactions, so we can safely * defer re-enabling the interrupt until, as an initiator, * we start sending out transactions again. */ test SIMODE1, ENSCSIRST jnz . + 3; mvi CLRSINT1, CLRSCSIRSTI; or SIMODE1, ENSCSIRST; if ((ahc->features & AHC_TWIN) != 0) { and SINDEX,~SELBUSB,SBLKCTL;/* Clear channel select bit */ test SCB_SCSIID, TWIN_CHNLB jz . + 2; or SINDEX, SELBUSB; mov SBLKCTL,SINDEX; /* select channel */ } initialize_scsiid: if ((ahc->features & AHC_ULTRA2) != 0) { mov SCSIID_ULTRA2, SCB_SCSIID; } else if ((ahc->features & AHC_TWIN) != 0) { and SCSIID, TWIN_TID|OID, SCB_SCSIID; } else { mov SCSIID, SCB_SCSIID; } if ((ahc->flags & AHC_TARGETROLE) != 0) { mov SINDEX, SCSISEQ_TEMPLATE; test SCB_CONTROL, TARGET_SCB jz . + 2; or SINDEX, TEMODE; mov SCSISEQ, SINDEX ret; } else { mov SCSISEQ, SCSISEQ_TEMPLATE ret; } END_CRITICAL; /* * Initialize transfer settings with SCB provided settings. */ set_transfer_settings: if ((ahc->features & AHC_ULTRA) != 0) { test SCB_CONTROL, ULTRAENB jz . + 2; or SXFRCTL0, FAST20; } /* * Initialize SCSIRATE with the appropriate value for this target. */ if ((ahc->features & AHC_ULTRA2) != 0) { bmov SCSIRATE, SCB_SCSIRATE, 2 ret; } else { mov SCSIRATE, SCB_SCSIRATE ret; } if ((ahc->flags & AHC_TARGETROLE) != 0) { /* * We carefully toggle SPIOEN to allow us to return the * message byte we receive so it can be checked prior to * driving REQ on the bus for the next byte. */ target_inb: /* * Drive REQ on the bus by enabling SCSI PIO. */ or SXFRCTL0, SPIOEN; /* Wait for the byte */ test SSTAT0, SPIORDY jz .; /* Prevent our read from triggering another REQ */ and SXFRCTL0, ~SPIOEN; /* Save latched contents */ mov DINDEX, SCSIDATL ret; } /* * After the selection, remove this SCB from the "waiting SCB" * list. This is achieved by simply moving our "next" pointer into * WAITING_SCBH. Our next pointer will be set to null the next time this * SCB is used, so don't bother with it now. */ select_out: /* Turn off the selection hardware */ and SCSISEQ, TEMODE|ENSELI|ENRSELI|ENAUTOATNP, SCSISEQ; mov SCBPTR, WAITING_SCBH; mov WAITING_SCBH,SCB_NEXT; mov SAVED_SCSIID, SCB_SCSIID; and SAVED_LUN, LID, SCB_LUN; call set_transfer_settings; if ((ahc->flags & AHC_TARGETROLE) != 0) { test SSTAT0, TARGET jz initiator_select; or SXFRCTL0, CLRSTCNT|CLRCHN; /* * Put tag in connonical location since not * all connections have an SCB. */ mov INITIATOR_TAG, SCB_TARGET_ITAG; /* * We've just re-selected an initiator. * Assert BSY and setup the phase for * sending our identify messages. */ mvi P_MESGIN|BSYO call change_phase; mvi CLRSINT0, CLRSELDO; /* * Start out with a simple identify message. */ or SAVED_LUN, MSG_IDENTIFYFLAG call target_outb; /* * If we are the result of a tagged command, send * a simple Q tag and the tag id. */ test SCB_CONTROL, TAG_ENB jz . + 3; mvi MSG_SIMPLE_Q_TAG call target_outb; mov SCB_TARGET_ITAG call target_outb; target_synccmd: /* * Now determine what phases the host wants us * to go through. */ mov SEQ_FLAGS, SCB_TARGET_PHASES; test SCB_CONTROL, MK_MESSAGE jz target_ITloop; mvi P_MESGIN|BSYO call change_phase; jmp host_target_message_loop; target_ITloop: /* * Start honoring ATN signals now that * we properly identified ourselves. */ test SCSISIGI, ATNI jnz target_mesgout; test SEQ_FLAGS, CMDPHASE_PENDING jnz target_cmdphase; test SEQ_FLAGS, DPHASE_PENDING jnz target_dphase; test SEQ_FLAGS, SPHASE_PENDING jnz target_sphase; /* * No more work to do. Either disconnect or not depending * on the state of NO_DISCONNECT. */ test SEQ_FLAGS, NO_DISCONNECT jz target_disconnect; mvi TARG_IMMEDIATE_SCB, SCB_LIST_NULL; call complete_target_cmd; if ((ahc->flags & AHC_PAGESCBS) != 0) { mov ALLZEROS call get_free_or_disc_scb; } cmp TARG_IMMEDIATE_SCB, SCB_LIST_NULL je .; mvi DMAPARAMS, HDMAEN|DIRECTION|FIFORESET; mov TARG_IMMEDIATE_SCB call dma_scb; call set_transfer_settings; or SXFRCTL0, CLRSTCNT|CLRCHN; jmp target_synccmd; target_mesgout: mvi SCSISIGO, P_MESGOUT|BSYO; target_mesgout_continue: call target_inb; target_mesgout_pending: and SEQ_FLAGS2, ~TARGET_MSG_PENDING; /* Local Processing goes here... */ jmp host_target_message_loop; target_disconnect: mvi P_MESGIN|BSYO call change_phase; test SEQ_FLAGS, DPHASE jz . + 2; mvi MSG_SAVEDATAPOINTER call target_outb; mvi MSG_DISCONNECT call target_outb; target_busfree_wait: /* Wait for preceding I/O session to complete. */ test SCSISIGI, ACKI jnz .; target_busfree: and SIMODE1, ~ENBUSFREE; if ((ahc->features & AHC_ULTRA2) != 0) { clr SCSIBUSL; } clr SCSISIGO; mvi LASTPHASE, P_BUSFREE; call complete_target_cmd; jmp poll_for_work; target_cmdphase: /* * The target has dropped ATN (doesn't want to abort or BDR) * and we believe this selection to be valid. If the ring * buffer for new commands is full, return busy or queue full. */ if ((ahc->features & AHC_HS_MAILBOX) != 0) { and A, HOST_TQINPOS, HS_MAILBOX; } else { mov A, KERNEL_TQINPOS; } cmp TQINPOS, A jne tqinfifo_has_space; mvi P_STATUS|BSYO call change_phase; test SEQ_FLAGS, TARGET_CMD_IS_TAGGED jz . + 3; mvi STATUS_QUEUE_FULL call target_outb; jmp target_busfree_wait; mvi STATUS_BUSY call target_outb; jmp target_busfree_wait; tqinfifo_has_space: mvi P_COMMAND|BSYO call change_phase; call target_inb; mov A, DINDEX; /* Store for host */ if ((ahc->features & AHC_CMD_CHAN) != 0) { mov CCSCBRAM, A; } else { mov DFDAT, A; } /* * Determine the number of bytes to read * based on the command group code via table lookup. * We reuse the first 8 bytes of the TARG_SCSIRATE * BIOS array for this table. Count is one less than * the total for the command since we've already fetched * the first byte. */ shr A, CMD_GROUP_CODE_SHIFT; add SINDEX, CMDSIZE_TABLE, A; mov A, SINDIR; test A, 0xFF jz command_phase_done; or SXFRCTL0, SPIOEN; command_loop: test SSTAT0, SPIORDY jz .; cmp A, 1 jne . + 2; and SXFRCTL0, ~SPIOEN; /* Last Byte */ if ((ahc->features & AHC_CMD_CHAN) != 0) { mov CCSCBRAM, SCSIDATL; } else { mov DFDAT, SCSIDATL; } dec A; test A, 0xFF jnz command_loop; command_phase_done: and SEQ_FLAGS, ~CMDPHASE_PENDING; jmp target_ITloop; target_dphase: /* * Data phases on the bus are from the * perspective of the initiator. The dma * code looks at LASTPHASE to determine the * data direction of the DMA. Toggle it for * target transfers. */ xor LASTPHASE, IOI, SCB_TARGET_DATA_DIR; or SCB_TARGET_DATA_DIR, BSYO call change_phase; jmp p_data; target_sphase: mvi P_STATUS|BSYO call change_phase; mvi LASTPHASE, P_STATUS; mov SCB_SCSI_STATUS call target_outb; /* XXX Watch for ATN or parity errors??? */ mvi SCSISIGO, P_MESGIN|BSYO; /* MSG_CMDCMPLT is 0, but we can't do an immediate of 0 */ mov ALLZEROS call target_outb; jmp target_busfree_wait; complete_target_cmd: test SEQ_FLAGS, TARG_CMD_PENDING jnz . + 2; mov SCB_TAG jmp complete_post; if ((ahc->features & AHC_CMD_CHAN) != 0) { /* Set the valid byte */ mvi CCSCBADDR, 24; mov CCSCBRAM, ALLONES; mvi CCHCNT, 28; or CCSCBCTL, CCSCBEN|CCSCBRESET; test CCSCBCTL, CCSCBDONE jz .; clr CCSCBCTL; } else { /* Set the valid byte */ or DFCNTRL, FIFORESET; mvi DFWADDR, 3; /* Third 64bit word or byte 24 */ mov DFDAT, ALLONES; mvi 28 call set_hcnt; or DFCNTRL, HDMAEN|FIFOFLUSH; call dma_finish; } inc TQINPOS; mvi INTSTAT,CMDCMPLT ret; } if ((ahc->flags & AHC_INITIATORROLE) != 0) { initiator_select: or SXFRCTL0, SPIOEN|CLRSTCNT|CLRCHN; /* * As soon as we get a successful selection, the target * should go into the message out phase since we have ATN * asserted. */ mvi MSG_OUT, MSG_IDENTIFYFLAG; mvi SEQ_FLAGS, NO_CDB_SENT; mvi CLRSINT0, CLRSELDO; /* * Main loop for information transfer phases. Wait for the * target to assert REQ before checking MSG, C/D and I/O for * the bus phase. */ mesgin_phasemis: ITloop: call phase_lock; mov A, LASTPHASE; test A, ~P_DATAIN jz p_data; cmp A,P_COMMAND je p_command; cmp A,P_MESGOUT je p_mesgout; cmp A,P_STATUS je p_status; cmp A,P_MESGIN je p_mesgin; mvi BAD_PHASE call set_seqint; jmp ITloop; /* Try reading the bus again. */ await_busfree: and SIMODE1, ~ENBUSFREE; mov NONE, SCSIDATL; /* Ack the last byte */ if ((ahc->features & AHC_ULTRA2) != 0) { clr SCSIBUSL; /* Prevent bit leakage durint SELTO */ } and SXFRCTL0, ~SPIOEN; mvi SEQ_FLAGS, NOT_IDENTIFIED|NO_CDB_SENT; test SSTAT1,REQINIT|BUSFREE jz .; test SSTAT1, BUSFREE jnz poll_for_work; mvi MISSED_BUSFREE call set_seqint; } clear_target_state: /* * We assume that the kernel driver may reset us * at any time, even in the middle of a DMA, so * clear DFCNTRL too. */ clr DFCNTRL; or SXFRCTL0, CLRSTCNT|CLRCHN; /* * We don't know the target we will connect to, * so default to narrow transfers to avoid * parity problems. */ if ((ahc->features & AHC_ULTRA2) != 0) { bmov SCSIRATE, ALLZEROS, 2; } else { clr SCSIRATE; if ((ahc->features & AHC_ULTRA) != 0) { and SXFRCTL0, ~(FAST20); } } mvi LASTPHASE, P_BUSFREE; /* clear target specific flags */ mvi SEQ_FLAGS, NOT_IDENTIFIED|NO_CDB_SENT ret; sg_advance: clr A; /* add sizeof(struct scatter) */ add SCB_RESIDUAL_SGPTR[0],SG_SIZEOF; adc SCB_RESIDUAL_SGPTR[1],A; adc SCB_RESIDUAL_SGPTR[2],A; adc SCB_RESIDUAL_SGPTR[3],A ret; if ((ahc->features & AHC_CMD_CHAN) != 0) { disable_ccsgen: test CCSGCTL, CCSGEN jz return; test CCSGCTL, CCSGDONE jz .; disable_ccsgen_fetch_done: clr CCSGCTL; test CCSGCTL, CCSGEN jnz .; ret; idle_loop: /* * Do we need any more segments for this transfer? */ test SCB_RESIDUAL_DATACNT[3], SG_LAST_SEG jnz return; /* Did we just finish fetching segs? */ cmp CCSGCTL, CCSGEN|CCSGDONE je idle_sgfetch_complete; /* Are we actively fetching segments? */ test CCSGCTL, CCSGEN jnz return; /* * Do we have any prefetch left??? */ cmp CCSGADDR, SG_PREFETCH_CNT jne idle_sg_avail; /* * Need to fetch segments, but we can only do that * if the command channel is completely idle. Make * sure we don't have an SCB prefetch going on. */ test CCSCBCTL, CCSCBEN jnz return; /* * We fetch a "cacheline aligned" and sized amount of data * so we don't end up referencing a non-existant page. * Cacheline aligned is in quotes because the kernel will * set the prefetch amount to a reasonable level if the * cacheline size is unknown. */ mvi CCHCNT, SG_PREFETCH_CNT; and CCHADDR[0], SG_PREFETCH_ALIGN_MASK, SCB_RESIDUAL_SGPTR; bmov CCHADDR[1], SCB_RESIDUAL_SGPTR[1], 3; mvi CCSGCTL, CCSGEN|CCSGRESET ret; idle_sgfetch_complete: call disable_ccsgen_fetch_done; and CCSGADDR, SG_PREFETCH_ADDR_MASK, SCB_RESIDUAL_SGPTR; idle_sg_avail: if ((ahc->features & AHC_ULTRA2) != 0) { /* Does the hardware have space for another SG entry? */ test DFSTATUS, PRELOAD_AVAIL jz return; bmov HADDR, CCSGRAM, 7; bmov SCB_RESIDUAL_DATACNT[3], CCSGRAM, 1; if ((ahc->flags & AHC_39BIT_ADDRESSING) != 0) { mov SCB_RESIDUAL_DATACNT[3] call set_hhaddr; } call sg_advance; mov SINDEX, SCB_RESIDUAL_SGPTR[0]; test SCB_RESIDUAL_DATACNT[3], SG_LAST_SEG jz . + 2; or SINDEX, LAST_SEG; mov SG_CACHE_PRE, SINDEX; /* Load the segment */ or DFCNTRL, PRELOADEN; } ret; } if ((ahc->bugs & AHC_PCI_MWI_BUG) != 0 && ahc->pci_cachesize != 0) { /* * Calculate the trailing portion of this S/G segment that cannot * be transferred using memory write and invalidate PCI transactions. * XXX Can we optimize this for PCI writes only??? */ calc_mwi_residual: /* * If the ending address is on a cacheline boundary, * there is no need for an extra segment. */ mov A, HCNT[0]; add A, A, HADDR[0]; and A, CACHESIZE_MASK; test A, 0xFF jz return; /* * If the transfer is less than a cachline, * there is no need for an extra segment. */ test HCNT[1], 0xFF jnz calc_mwi_residual_final; test HCNT[2], 0xFF jnz calc_mwi_residual_final; add NONE, INVERTED_CACHESIZE_MASK, HCNT[0]; jnc return; calc_mwi_residual_final: mov MWI_RESIDUAL, A; not A; inc A; add HCNT[0], A; adc HCNT[1], -1; adc HCNT[2], -1 ret; } p_data: test SEQ_FLAGS,NOT_IDENTIFIED|NO_CDB_SENT jz p_data_allowed; mvi PROTO_VIOLATION call set_seqint; p_data_allowed: if ((ahc->features & AHC_ULTRA2) != 0) { mvi DMAPARAMS, PRELOADEN|SCSIEN|HDMAEN; } else { mvi DMAPARAMS, WIDEODD|SCSIEN|SDMAEN|HDMAEN|FIFORESET; } test LASTPHASE, IOI jnz . + 2; or DMAPARAMS, DIRECTION; if ((ahc->features & AHC_CMD_CHAN) != 0) { /* We don't have any valid S/G elements */ mvi CCSGADDR, SG_PREFETCH_CNT; } test SEQ_FLAGS, DPHASE jz data_phase_initialize; /* * If we re-enter the data phase after going through another * phase, our transfer location has almost certainly been * corrupted by the interveining, non-data, transfers. Ask * the host driver to fix us up based on the transfer residual. */ mvi PDATA_REINIT call set_seqint; jmp data_phase_loop; data_phase_initialize: /* We have seen a data phase for the first time */ or SEQ_FLAGS, DPHASE; /* * Initialize the DMA address and counter from the SCB. * Also set SCB_RESIDUAL_SGPTR, including the LAST_SEG * flag in the highest byte of the data count. We cannot * modify the saved values in the SCB until we see a save * data pointers message. */ if ((ahc->flags & AHC_39BIT_ADDRESSING) != 0) { /* The lowest address byte must be loaded last. */ mov SCB_DATACNT[3] call set_hhaddr; } if ((ahc->features & AHC_CMD_CHAN) != 0) { bmov HADDR, SCB_DATAPTR, 7; bmov SCB_RESIDUAL_DATACNT[3], SCB_DATACNT[3], 5; } else { mvi DINDEX, HADDR; mvi SCB_DATAPTR call bcopy_7; mvi DINDEX, SCB_RESIDUAL_DATACNT + 3; mvi SCB_DATACNT + 3 call bcopy_5; } if ((ahc->bugs & AHC_PCI_MWI_BUG) != 0 && ahc->pci_cachesize != 0) { call calc_mwi_residual; } and SCB_RESIDUAL_SGPTR[0], ~SG_FULL_RESID; if ((ahc->features & AHC_ULTRA2) == 0) { if ((ahc->features & AHC_CMD_CHAN) != 0) { bmov STCNT, HCNT, 3; } else { call set_stcnt_from_hcnt; } } data_phase_loop: /* Guard against overruns */ test SCB_RESIDUAL_SGPTR[0], SG_LIST_NULL jz data_phase_inbounds; /* * Turn on `Bit Bucket' mode, wait until the target takes * us to another phase, and then notify the host. */ and DMAPARAMS, DIRECTION; mov DFCNTRL, DMAPARAMS; or SXFRCTL1,BITBUCKET; if ((ahc->features & AHC_DT) == 0) { test SSTAT1,PHASEMIS jz .; } else { test SCSIPHASE, DATA_PHASE_MASK jnz .; } and SXFRCTL1, ~BITBUCKET; mvi DATA_OVERRUN call set_seqint; jmp ITloop; data_phase_inbounds: if ((ahc->features & AHC_ULTRA2) != 0) { mov SINDEX, SCB_RESIDUAL_SGPTR[0]; test SCB_RESIDUAL_DATACNT[3], SG_LAST_SEG jz . + 2; or SINDEX, LAST_SEG; mov SG_CACHE_PRE, SINDEX; mov DFCNTRL, DMAPARAMS; ultra2_dma_loop: call idle_loop; /* * The transfer is complete if either the last segment * completes or the target changes phase. */ test SG_CACHE_SHADOW, LAST_SEG_DONE jnz ultra2_dmafinish; if ((ahc->features & AHC_DT) == 0) { if ((ahc->flags & AHC_TARGETROLE) != 0) { /* * As a target, we control the phases, * so ignore PHASEMIS. */ test SSTAT0, TARGET jnz ultra2_dma_loop; } if ((ahc->flags & AHC_INITIATORROLE) != 0) { test SSTAT1,PHASEMIS jz ultra2_dma_loop; } } else { test DFCNTRL, SCSIEN jnz ultra2_dma_loop; } ultra2_dmafinish: /* * The transfer has terminated either due to a phase * change, and/or the completion of the last segment. * We have two goals here. Do as much other work * as possible while the data fifo drains on a read * and respond as quickly as possible to the standard * messages (save data pointers/disconnect and command * complete) that usually follow a data phase. */ if ((ahc->bugs & AHC_AUTOFLUSH_BUG) != 0) { /* * On chips with broken auto-flush, start * the flushing process now. We'll poke * the chip from time to time to keep the * flush process going as we complete the * data phase. */ or DFCNTRL, FIFOFLUSH; } /* * We assume that, even though data may still be * transferring to the host, that the SCSI side of * the DMA engine is now in a static state. This * allows us to update our notion of where we are * in this transfer. * * If, by chance, we stopped before being able * to fetch additional segments for this transfer, * yet the last S/G was completely exhausted, * call our idle loop until it is able to load * another segment. This will allow us to immediately * pickup on the next segment on the next data phase. * * If we happened to stop on the last segment, then * our residual information is still correct from * the idle loop and there is no need to perform * any fixups. */ ultra2_ensure_sg: test SG_CACHE_SHADOW, LAST_SEG jz ultra2_shvalid; /* Record if we've consumed all S/G entries */ test SSTAT2, SHVALID jnz residuals_correct; or SCB_RESIDUAL_SGPTR[0], SG_LIST_NULL; jmp residuals_correct; ultra2_shvalid: test SSTAT2, SHVALID jnz sgptr_fixup; call idle_loop; jmp ultra2_ensure_sg; sgptr_fixup: /* * Fixup the residual next S/G pointer. The S/G preload * feature of the chip allows us to load two elements * in addition to the currently active element. We * store the bottom byte of the next S/G pointer in * the SG_CACEPTR register so we can restore the * correct value when the DMA completes. If the next * sg ptr value has advanced to the point where higher * bytes in the address have been affected, fix them * too. */ test SG_CACHE_SHADOW, 0x80 jz sgptr_fixup_done; test SCB_RESIDUAL_SGPTR[0], 0x80 jnz sgptr_fixup_done; add SCB_RESIDUAL_SGPTR[1], -1; adc SCB_RESIDUAL_SGPTR[2], -1; adc SCB_RESIDUAL_SGPTR[3], -1; sgptr_fixup_done: and SCB_RESIDUAL_SGPTR[0], SG_ADDR_MASK, SG_CACHE_SHADOW; /* We are not the last seg */ and SCB_RESIDUAL_DATACNT[3], ~SG_LAST_SEG; residuals_correct: /* * Go ahead and shut down the DMA engine now. * In the future, we'll want to handle end of * transfer messages prior to doing this, but this * requires similar restructuring for pre-ULTRA2 * controllers. */ test DMAPARAMS, DIRECTION jnz ultra2_fifoempty; ultra2_fifoflush: if ((ahc->features & AHC_DT) == 0) { if ((ahc->bugs & AHC_AUTOFLUSH_BUG) != 0) { /* * On Rev A of the aic7890, the autoflush * feature doesn't function correctly. * Perform an explicit manual flush. During * a manual flush, the FIFOEMP bit becomes * true every time the PCI FIFO empties * regardless of the state of the SCSI FIFO. * It can take up to 4 clock cycles for the * SCSI FIFO to get data into the PCI FIFO * and for FIFOEMP to de-assert. Here we * guard against this condition by making * sure the FIFOEMP bit stays on for 5 full * clock cycles. */ or DFCNTRL, FIFOFLUSH; test DFSTATUS, FIFOEMP jz ultra2_fifoflush; test DFSTATUS, FIFOEMP jz ultra2_fifoflush; test DFSTATUS, FIFOEMP jz ultra2_fifoflush; test DFSTATUS, FIFOEMP jz ultra2_fifoflush; } test DFSTATUS, FIFOEMP jz ultra2_fifoflush; } else { /* * We enable the auto-ack feature on DT capable * controllers. This means that the controller may * have already transferred some overrun bytes into * the data FIFO and acked them on the bus. The only * way to detect this situation is to wait for * LAST_SEG_DONE to come true on a completed transfer * and then test to see if the data FIFO is non-empty. */ test SCB_RESIDUAL_SGPTR[0], SG_LIST_NULL jz ultra2_wait_fifoemp; test SG_CACHE_SHADOW, LAST_SEG_DONE jz .; /* * FIFOEMP can lag LAST_SEG_DONE. Wait a few * clocks before calling this an overrun. */ test DFSTATUS, FIFOEMP jnz ultra2_fifoempty; test DFSTATUS, FIFOEMP jnz ultra2_fifoempty; test DFSTATUS, FIFOEMP jnz ultra2_fifoempty; /* Overrun */ jmp data_phase_loop; ultra2_wait_fifoemp: test DFSTATUS, FIFOEMP jz .; } ultra2_fifoempty: /* Don't clobber an inprogress host data transfer */ test DFSTATUS, MREQPEND jnz ultra2_fifoempty; ultra2_dmahalt: and DFCNTRL, ~(SCSIEN|HDMAEN); test DFCNTRL, SCSIEN|HDMAEN jnz .; if ((ahc->flags & AHC_39BIT_ADDRESSING) != 0) { /* * Keep HHADDR cleared for future, 32bit addressed * only, DMA operations. * * Due to bayonette style S/G handling, our residual * data must be "fixed up" once the transfer is halted. * Here we fixup the HSHADDR stored in the high byte * of the residual data cnt. By postponing the fixup, * we can batch the clearing of HADDR with the fixup. * If we halted on the last segment, the residual is * already correct. If we are not on the last * segment, copy the high address directly from HSHADDR. * We don't need to worry about maintaining the * SG_LAST_SEG flag as it will always be false in the * case where an update is required. */ or DSCOMMAND1, HADDLDSEL0; test SG_CACHE_SHADOW, LAST_SEG jnz . + 2; mov SCB_RESIDUAL_DATACNT[3], SHADDR; clr HADDR; and DSCOMMAND1, ~HADDLDSEL0; } } else { /* If we are the last SG block, tell the hardware. */ if ((ahc->bugs & AHC_PCI_MWI_BUG) != 0 && ahc->pci_cachesize != 0) { test MWI_RESIDUAL, 0xFF jnz dma_mid_sg; } test SCB_RESIDUAL_DATACNT[3], SG_LAST_SEG jz dma_mid_sg; if ((ahc->flags & AHC_TARGETROLE) != 0) { test SSTAT0, TARGET jz dma_last_sg; if ((ahc->bugs & AHC_TMODE_WIDEODD_BUG) != 0) { test DMAPARAMS, DIRECTION jz dma_mid_sg; } } dma_last_sg: and DMAPARAMS, ~WIDEODD; dma_mid_sg: /* Start DMA data transfer. */ mov DFCNTRL, DMAPARAMS; dma_loop: if ((ahc->features & AHC_CMD_CHAN) != 0) { call idle_loop; } test SSTAT0,DMADONE jnz dma_dmadone; test SSTAT1,PHASEMIS jz dma_loop; /* ie. underrun */ dma_phasemis: /* * We will be "done" DMAing when the transfer count goes to * zero, or the target changes the phase (in light of this, * it makes sense that the DMA circuitry doesn't ACK when * PHASEMIS is active). If we are doing a SCSI->Host transfer, * the data FIFO should be flushed auto-magically on STCNT=0 * or a phase change, so just wait for FIFO empty status. */ dma_checkfifo: test DFCNTRL,DIRECTION jnz dma_fifoempty; dma_fifoflush: test DFSTATUS,FIFOEMP jz dma_fifoflush; dma_fifoempty: /* Don't clobber an inprogress host data transfer */ test DFSTATUS, MREQPEND jnz dma_fifoempty; /* * Now shut off the DMA and make sure that the DMA * hardware has actually stopped. Touching the DMA * counters, etc. while a DMA is active will result * in an ILLSADDR exception. */ dma_dmadone: and DFCNTRL, ~(SCSIEN|SDMAEN|HDMAEN); dma_halt: /* * Some revisions of the aic78XX have a problem where, if the * data fifo is full, but the PCI input latch is not empty, * HDMAEN cannot be cleared. The fix used here is to drain * the prefetched but unused data from the data fifo until * there is space for the input latch to drain. */ if ((ahc->bugs & AHC_PCI_2_1_RETRY_BUG) != 0) { mov NONE, DFDAT; } test DFCNTRL, (SCSIEN|SDMAEN|HDMAEN) jnz dma_halt; /* See if we have completed this last segment */ test STCNT[0], 0xff jnz data_phase_finish; test STCNT[1], 0xff jnz data_phase_finish; test STCNT[2], 0xff jnz data_phase_finish; /* * Advance the scatter-gather pointers if needed */ if ((ahc->bugs & AHC_PCI_MWI_BUG) != 0 && ahc->pci_cachesize != 0) { test MWI_RESIDUAL, 0xFF jz no_mwi_resid; /* * Reload HADDR from SHADDR and setup the * count to be the size of our residual. */ if ((ahc->features & AHC_CMD_CHAN) != 0) { bmov HADDR, SHADDR, 4; mov HCNT, MWI_RESIDUAL; bmov HCNT[1], ALLZEROS, 2; } else { mvi DINDEX, HADDR; mvi SHADDR call bcopy_4; mov MWI_RESIDUAL call set_hcnt; } clr MWI_RESIDUAL; jmp sg_load_done; no_mwi_resid: } test SCB_RESIDUAL_DATACNT[3], SG_LAST_SEG jz sg_load; or SCB_RESIDUAL_SGPTR[0], SG_LIST_NULL; jmp data_phase_finish; sg_load: /* * Load the next SG element's data address and length * into the DMA engine. If we don't have hardware * to perform a prefetch, we'll have to fetch the * segment from host memory first. */ if ((ahc->features & AHC_CMD_CHAN) != 0) { /* Wait for the idle loop to complete */ test CCSGCTL, CCSGEN jz . + 3; call idle_loop; test CCSGCTL, CCSGEN jnz . - 1; bmov HADDR, CCSGRAM, 7; /* * Workaround for flaky external SCB RAM * on certain aic7895 setups. It seems * unable to handle direct transfers from * S/G ram to certain SCB locations. */ mov SINDEX, CCSGRAM; mov SCB_RESIDUAL_DATACNT[3], SINDEX; } else { if ((ahc->flags & AHC_39BIT_ADDRESSING) != 0) { mov ALLZEROS call set_hhaddr; } mvi DINDEX, HADDR; mvi SCB_RESIDUAL_SGPTR call bcopy_4; mvi SG_SIZEOF call set_hcnt; or DFCNTRL, HDMAEN|DIRECTION|FIFORESET; call dma_finish; mvi DINDEX, HADDR; call dfdat_in_7; mov SCB_RESIDUAL_DATACNT[3], DFDAT; } if ((ahc->flags & AHC_39BIT_ADDRESSING) != 0) { mov SCB_RESIDUAL_DATACNT[3] call set_hhaddr; /* * The lowest address byte must be loaded * last as it triggers the computation of * some items in the PCI block. The ULTRA2 * chips do this on PRELOAD. */ mov HADDR, HADDR; } if ((ahc->bugs & AHC_PCI_MWI_BUG) != 0 && ahc->pci_cachesize != 0) { call calc_mwi_residual; } /* Point to the new next sg in memory */ call sg_advance; sg_load_done: if ((ahc->features & AHC_CMD_CHAN) != 0) { bmov STCNT, HCNT, 3; } else { call set_stcnt_from_hcnt; } if ((ahc->flags & AHC_TARGETROLE) != 0) { test SSTAT0, TARGET jnz data_phase_loop; } } data_phase_finish: /* * If the target has left us in data phase, loop through * the dma code again. In the case of ULTRA2 adapters, * we should only loop if there is a data overrun. For * all other adapters, we'll loop after each S/G element * is loaded as well as if there is an overrun. */ if ((ahc->flags & AHC_TARGETROLE) != 0) { test SSTAT0, TARGET jnz data_phase_done; } if ((ahc->flags & AHC_INITIATORROLE) != 0) { test SSTAT1, REQINIT jz .; if ((ahc->features & AHC_DT) == 0) { test SSTAT1,PHASEMIS jz data_phase_loop; } else { test SCSIPHASE, DATA_PHASE_MASK jnz data_phase_loop; } } data_phase_done: /* * After a DMA finishes, save the SG and STCNT residuals back into * the SCB. We use STCNT instead of HCNT, since it's a reflection * of how many bytes were transferred on the SCSI (as opposed to the * host) bus. */ if ((ahc->features & AHC_CMD_CHAN) != 0) { /* Kill off any pending prefetch */ call disable_ccsgen; } if ((ahc->features & AHC_ULTRA2) == 0) { /* * Clear the high address byte so that all other DMA * operations, which use 32bit addressing, can assume * HHADDR is 0. */ if ((ahc->flags & AHC_39BIT_ADDRESSING) != 0) { mov ALLZEROS call set_hhaddr; } } /* * Update our residual information before the information is * lost by some other type of SCSI I/O (e.g. PIO). If we have * transferred all data, no update is needed. * */ test SCB_RESIDUAL_SGPTR, SG_LIST_NULL jnz residual_update_done; if ((ahc->bugs & AHC_PCI_MWI_BUG) != 0 && ahc->pci_cachesize != 0) { if ((ahc->features & AHC_CMD_CHAN) != 0) { test MWI_RESIDUAL, 0xFF jz bmov_resid; } mov A, MWI_RESIDUAL; add SCB_RESIDUAL_DATACNT[0], A, STCNT[0]; clr A; adc SCB_RESIDUAL_DATACNT[1], A, STCNT[1]; adc SCB_RESIDUAL_DATACNT[2], A, STCNT[2]; clr MWI_RESIDUAL; if ((ahc->features & AHC_CMD_CHAN) != 0) { jmp . + 2; bmov_resid: bmov SCB_RESIDUAL_DATACNT, STCNT, 3; } } else if ((ahc->features & AHC_CMD_CHAN) != 0) { bmov SCB_RESIDUAL_DATACNT, STCNT, 3; } else { mov SCB_RESIDUAL_DATACNT[0], STCNT[0]; mov SCB_RESIDUAL_DATACNT[1], STCNT[1]; mov SCB_RESIDUAL_DATACNT[2], STCNT[2]; } residual_update_done: /* * Since we've been through a data phase, the SCB_RESID* fields * are now initialized. Clear the full residual flag. */ and SCB_SGPTR[0], ~SG_FULL_RESID; if ((ahc->features & AHC_ULTRA2) != 0) { /* Clear the channel in case we return to data phase later */ or SXFRCTL0, CLRSTCNT|CLRCHN; or SXFRCTL0, CLRSTCNT|CLRCHN; } if ((ahc->flags & AHC_TARGETROLE) != 0) { test SEQ_FLAGS, DPHASE_PENDING jz ITloop; and SEQ_FLAGS, ~DPHASE_PENDING; /* * For data-in phases, wait for any pending acks from the * initiator before changing phase. We only need to * send Ignore Wide Residue messages for data-in phases. */ test DFCNTRL, DIRECTION jz target_ITloop; test SSTAT1, REQINIT jnz .; test SCB_LUN, SCB_XFERLEN_ODD jz target_ITloop; test SCSIRATE, WIDEXFER jz target_ITloop; /* * Issue an Ignore Wide Residue Message. */ mvi P_MESGIN|BSYO call change_phase; mvi MSG_IGN_WIDE_RESIDUE call target_outb; mvi 1 call target_outb; jmp target_ITloop; } else { jmp ITloop; } if ((ahc->flags & AHC_INITIATORROLE) != 0) { /* * Command phase. Set up the DMA registers and let 'er rip. */ p_command: test SEQ_FLAGS, NOT_IDENTIFIED jz p_command_okay; mvi PROTO_VIOLATION call set_seqint; p_command_okay: if ((ahc->features & AHC_ULTRA2) != 0) { bmov HCNT[0], SCB_CDB_LEN, 1; bmov HCNT[1], ALLZEROS, 2; mvi SG_CACHE_PRE, LAST_SEG; } else if ((ahc->features & AHC_CMD_CHAN) != 0) { bmov STCNT[0], SCB_CDB_LEN, 1; bmov STCNT[1], ALLZEROS, 2; } else { mov STCNT[0], SCB_CDB_LEN; clr STCNT[1]; clr STCNT[2]; } add NONE, -13, SCB_CDB_LEN; mvi SCB_CDB_STORE jnc p_command_embedded; p_command_from_host: if ((ahc->features & AHC_ULTRA2) != 0) { bmov HADDR[0], SCB_CDB_PTR, 4; mvi DFCNTRL, (PRELOADEN|SCSIEN|HDMAEN|DIRECTION); } else { if ((ahc->features & AHC_CMD_CHAN) != 0) { bmov HADDR[0], SCB_CDB_PTR, 4; bmov HCNT, STCNT, 3; } else { mvi DINDEX, HADDR; mvi SCB_CDB_PTR call bcopy_4; mov SCB_CDB_LEN call set_hcnt; } mvi DFCNTRL, (SCSIEN|SDMAEN|HDMAEN|DIRECTION|FIFORESET); } jmp p_command_xfer; p_command_embedded: /* * The data fifo seems to require 4 byte aligned * transfers from the sequencer. Force this to * be the case by clearing HADDR[0] even though * we aren't going to touch host memory. */ clr HADDR[0]; if ((ahc->features & AHC_ULTRA2) != 0) { mvi DFCNTRL, (PRELOADEN|SCSIEN|DIRECTION); bmov DFDAT, SCB_CDB_STORE, 12; } else if ((ahc->features & AHC_CMD_CHAN) != 0) { if ((ahc->flags & AHC_SCB_BTT) != 0) { /* * On the 7895 the data FIFO will * get corrupted if you try to dump * data from external SCB memory into * the FIFO while it is enabled. So, * fill the fifo and then enable SCSI * transfers. */ mvi DFCNTRL, (DIRECTION|FIFORESET); } else { mvi DFCNTRL, (SCSIEN|SDMAEN|DIRECTION|FIFORESET); } bmov DFDAT, SCB_CDB_STORE, 12; if ((ahc->flags & AHC_SCB_BTT) != 0) { mvi DFCNTRL, (SCSIEN|SDMAEN|DIRECTION|FIFOFLUSH); } else { or DFCNTRL, FIFOFLUSH; } } else { mvi DFCNTRL, (SCSIEN|SDMAEN|DIRECTION|FIFORESET); call copy_to_fifo_6; call copy_to_fifo_6; or DFCNTRL, FIFOFLUSH; } p_command_xfer: and SEQ_FLAGS, ~NO_CDB_SENT; if ((ahc->features & AHC_DT) == 0) { test SSTAT0, SDONE jnz . + 2; test SSTAT1, PHASEMIS jz . - 1; /* - * Wait for our ACK to go-away on it's own + * Wait for our ACK to go-away on its own * instead of being killed by SCSIEN getting cleared. */ test SCSISIGI, ACKI jnz .; } else { test DFCNTRL, SCSIEN jnz .; } test SSTAT0, SDONE jnz p_command_successful; /* * Don't allow a data phase if the command * was not fully transferred. */ or SEQ_FLAGS, NO_CDB_SENT; p_command_successful: and DFCNTRL, ~(SCSIEN|SDMAEN|HDMAEN); test DFCNTRL, (SCSIEN|SDMAEN|HDMAEN) jnz .; jmp ITloop; /* * Status phase. Wait for the data byte to appear, then read it * and store it into the SCB. */ p_status: test SEQ_FLAGS, NOT_IDENTIFIED jnz mesgin_proto_violation; p_status_okay: mov SCB_SCSI_STATUS, SCSIDATL; or SCB_CONTROL, STATUS_RCVD; jmp ITloop; /* * Message out phase. If MSG_OUT is MSG_IDENTIFYFLAG, build a full * indentify message sequence and send it to the target. The host may * override this behavior by setting the MK_MESSAGE bit in the SCB * control byte. This will cause us to interrupt the host and allow * it to handle the message phase completely on its own. If the bit * associated with this target is set, we will also interrupt the host, * thereby allowing it to send a message on the next selection regardless * of the transaction being sent. * * If MSG_OUT is == HOST_MSG, also interrupt the host and take a message. * This is done to allow the host to send messages outside of an identify * sequence while protecting the seqencer from testing the MK_MESSAGE bit * on an SCB that might not be for the current nexus. (For example, a * BDR message in response to a bad reselection would leave us pointed to * an SCB that doesn't have anything to do with the current target). * * Otherwise, treat MSG_OUT as a 1 byte message to send (abort, abort tag, * bus device reset). * * When there are no messages to send, MSG_OUT should be set to MSG_NOOP, * in case the target decides to put us in this phase for some strange * reason. */ p_mesgout_retry: /* Turn on ATN for the retry */ if ((ahc->features & AHC_DT) == 0) { or SCSISIGO, ATNO, LASTPHASE; } else { mvi SCSISIGO, ATNO; } p_mesgout: mov SINDEX, MSG_OUT; cmp SINDEX, MSG_IDENTIFYFLAG jne p_mesgout_from_host; test SCB_CONTROL,MK_MESSAGE jnz host_message_loop; p_mesgout_identify: or SINDEX, MSG_IDENTIFYFLAG|DISCENB, SAVED_LUN; test SCB_CONTROL, DISCENB jnz . + 2; and SINDEX, ~DISCENB; /* * Send a tag message if TAG_ENB is set in the SCB control block. * Use SCB_TAG (the position in the kernel's SCB array) as the tag value. */ p_mesgout_tag: test SCB_CONTROL,TAG_ENB jz p_mesgout_onebyte; mov SCSIDATL, SINDEX; /* Send the identify message */ call phase_lock; cmp LASTPHASE, P_MESGOUT jne p_mesgout_done; and SCSIDATL,TAG_ENB|SCB_TAG_TYPE,SCB_CONTROL; call phase_lock; cmp LASTPHASE, P_MESGOUT jne p_mesgout_done; mov SCB_TAG jmp p_mesgout_onebyte; /* * Interrupt the driver, and allow it to handle this message * phase and any required retries. */ p_mesgout_from_host: cmp SINDEX, HOST_MSG jne p_mesgout_onebyte; jmp host_message_loop; p_mesgout_onebyte: mvi CLRSINT1, CLRATNO; mov SCSIDATL, SINDEX; /* * If the next bus phase after ATN drops is message out, it means * that the target is requesting that the last message(s) be resent. */ call phase_lock; cmp LASTPHASE, P_MESGOUT je p_mesgout_retry; p_mesgout_done: mvi CLRSINT1,CLRATNO; /* Be sure to turn ATNO off */ mov LAST_MSG, MSG_OUT; mvi MSG_OUT, MSG_NOOP; /* No message left */ jmp ITloop; /* * Message in phase. Bytes are read using Automatic PIO mode. */ p_mesgin: mvi ACCUM call inb_first; /* read the 1st message byte */ test A,MSG_IDENTIFYFLAG jnz mesgin_identify; cmp A,MSG_DISCONNECT je mesgin_disconnect; cmp A,MSG_SAVEDATAPOINTER je mesgin_sdptrs; cmp ALLZEROS,A je mesgin_complete; cmp A,MSG_RESTOREPOINTERS je mesgin_rdptrs; cmp A,MSG_IGN_WIDE_RESIDUE je mesgin_ign_wide_residue; cmp A,MSG_NOOP je mesgin_done; /* * Pushed message loop to allow the kernel to * run it's own message state engine. To avoid an * extra nop instruction after signaling the kernel, * we perform the phase_lock before checking to see * if we should exit the loop and skip the phase_lock * in the ITloop. Performing back to back phase_locks * shouldn't hurt, but why do it twice... */ host_message_loop: mvi HOST_MSG_LOOP call set_seqint; call phase_lock; cmp RETURN_1, EXIT_MSG_LOOP je ITloop + 1; jmp host_message_loop; mesgin_ign_wide_residue: if ((ahc->features & AHC_WIDE) != 0) { test SCSIRATE, WIDEXFER jz mesgin_reject; /* Pull the residue byte */ mvi ARG_1 call inb_next; cmp ARG_1, 0x01 jne mesgin_reject; test SCB_RESIDUAL_SGPTR[0], SG_LIST_NULL jz . + 2; test SCB_LUN, SCB_XFERLEN_ODD jnz mesgin_done; mvi IGN_WIDE_RES call set_seqint; jmp mesgin_done; } mesgin_proto_violation: mvi PROTO_VIOLATION call set_seqint; jmp mesgin_done; mesgin_reject: mvi MSG_MESSAGE_REJECT call mk_mesg; mesgin_done: mov NONE,SCSIDATL; /*dummy read from latch to ACK*/ jmp ITloop; /* * We received a "command complete" message. Put the SCB_TAG into the QOUTFIFO, * and trigger a completion interrupt. Before doing so, check to see if there * is a residual or the status byte is something other than STATUS_GOOD (0). * In either of these conditions, we upload the SCB back to the host so it can * process this information. In the case of a non zero status byte, we * additionally interrupt the kernel driver synchronously, allowing it to * decide if sense should be retrieved. If the kernel driver wishes to request * sense, it will fill the kernel SCB with a request sense command, requeue * it to the QINFIFO and tell us not to post to the QOUTFIFO by setting * RETURN_1 to SEND_SENSE. */ mesgin_complete: /* * If ATN is raised, we still want to give the target a message. * Perhaps there was a parity error on this last message byte. * Either way, the target should take us to message out phase * and then attempt to complete the command again. We should use a * critical section here to guard against a timeout triggering * for this command and setting ATN while we are still processing * the completion. test SCSISIGI, ATNI jnz mesgin_done; */ /* * If we are identified and have successfully sent the CDB, * any status will do. Optimize this fast path. */ test SCB_CONTROL, STATUS_RCVD jz mesgin_proto_violation; test SEQ_FLAGS, NOT_IDENTIFIED|NO_CDB_SENT jz complete_accepted; /* * If the target never sent an identify message but instead went * to mesgin to give an invalid message, let the host abort us. */ test SEQ_FLAGS, NOT_IDENTIFIED jnz mesgin_proto_violation; /* * If we recevied good status but never successfully sent the * cdb, abort the command. */ test SCB_SCSI_STATUS,0xff jnz complete_accepted; test SEQ_FLAGS, NO_CDB_SENT jnz mesgin_proto_violation; complete_accepted: /* * See if we attempted to deliver a message but the target ingnored us. */ test SCB_CONTROL, MK_MESSAGE jz . + 2; mvi MKMSG_FAILED call set_seqint; /* * Check for residuals */ test SCB_SGPTR, SG_LIST_NULL jnz check_status;/* No xfer */ test SCB_SGPTR, SG_FULL_RESID jnz upload_scb;/* Never xfered */ test SCB_RESIDUAL_SGPTR, SG_LIST_NULL jz upload_scb; check_status: test SCB_SCSI_STATUS,0xff jz complete; /* Good Status? */ upload_scb: or SCB_SGPTR, SG_RESID_VALID; mvi DMAPARAMS, FIFORESET; mov SCB_TAG call dma_scb; test SCB_SCSI_STATUS, 0xff jz complete; /* Just a residual? */ mvi BAD_STATUS call set_seqint; /* let driver know */ cmp RETURN_1, SEND_SENSE jne complete; call add_scb_to_free_list; jmp await_busfree; complete: mov SCB_TAG call complete_post; jmp await_busfree; } complete_post: /* Post the SCBID in SINDEX and issue an interrupt */ call add_scb_to_free_list; mov ARG_1, SINDEX; if ((ahc->features & AHC_QUEUE_REGS) != 0) { mov A, SDSCB_QOFF; } else { mov A, QOUTPOS; } mvi QOUTFIFO_OFFSET call post_byte_setup; mov ARG_1 call post_byte; if ((ahc->features & AHC_QUEUE_REGS) == 0) { inc QOUTPOS; } mvi INTSTAT,CMDCMPLT ret; if ((ahc->flags & AHC_INITIATORROLE) != 0) { /* * Is it a disconnect message? Set a flag in the SCB to remind us * and await the bus going free. If this is an untagged transaction * store the SCB id for it in our untagged target table for lookup on * a reselction. */ mesgin_disconnect: /* * If ATN is raised, we still want to give the target a message. * Perhaps there was a parity error on this last message byte * or we want to abort this command. Either way, the target * should take us to message out phase and then attempt to * disconnect again. * XXX - Wait for more testing. test SCSISIGI, ATNI jnz mesgin_done; */ test SEQ_FLAGS, NOT_IDENTIFIED|NO_CDB_SENT jnz mesgin_proto_violation; or SCB_CONTROL,DISCONNECTED; if ((ahc->flags & AHC_PAGESCBS) != 0) { call add_scb_to_disc_list; } test SCB_CONTROL, TAG_ENB jnz await_busfree; mov ARG_1, SCB_TAG; and SAVED_LUN, LID, SCB_LUN; mov SCB_SCSIID call set_busy_target; jmp await_busfree; /* * Save data pointers message: * Copying RAM values back to SCB, for Save Data Pointers message, but * only if we've actually been into a data phase to change them. This * protects against bogus data in scratch ram and the residual counts * since they are only initialized when we go into data_in or data_out. * Ack the message as soon as possible. For chips without S/G pipelining, * we can only ack the message after SHADDR has been saved. On these * chips, SHADDR increments with every bus transaction, even PIO. */ mesgin_sdptrs: if ((ahc->features & AHC_ULTRA2) != 0) { mov NONE,SCSIDATL; /*dummy read from latch to ACK*/ test SEQ_FLAGS, DPHASE jz ITloop; } else { test SEQ_FLAGS, DPHASE jz mesgin_done; } /* * If we are asked to save our position at the end of the * transfer, just mark us at the end rather than perform a * full save. */ test SCB_RESIDUAL_SGPTR[0], SG_LIST_NULL jz mesgin_sdptrs_full; or SCB_SGPTR, SG_LIST_NULL; if ((ahc->features & AHC_ULTRA2) != 0) { jmp ITloop; } else { jmp mesgin_done; } mesgin_sdptrs_full: /* * The SCB_SGPTR becomes the next one we'll download, * and the SCB_DATAPTR becomes the current SHADDR. * Use the residual number since STCNT is corrupted by * any message transfer. */ if ((ahc->features & AHC_CMD_CHAN) != 0) { bmov SCB_DATAPTR, SHADDR, 4; if ((ahc->features & AHC_ULTRA2) == 0) { mov NONE,SCSIDATL; /*dummy read from latch to ACK*/ } bmov SCB_DATACNT, SCB_RESIDUAL_DATACNT, 8; } else { mvi DINDEX, SCB_DATAPTR; mvi SHADDR call bcopy_4; mov NONE,SCSIDATL; /*dummy read from latch to ACK*/ mvi SCB_RESIDUAL_DATACNT call bcopy_8; } jmp ITloop; /* * Restore pointers message? Data pointers are recopied from the * SCB anytime we enter a data phase for the first time, so all * we need to do is clear the DPHASE flag and let the data phase * code do the rest. We also reset/reallocate the FIFO to make * sure we have a clean start for the next data or command phase. */ mesgin_rdptrs: and SEQ_FLAGS, ~DPHASE; /* * We'll reload them * the next time through * the dataphase. */ or SXFRCTL0, CLRSTCNT|CLRCHN; jmp mesgin_done; /* * Index into our Busy Target table. SINDEX and DINDEX are modified * upon return. SCBPTR may be modified by this action. */ set_busy_target: shr DINDEX, 4, SINDEX; if ((ahc->flags & AHC_SCB_BTT) != 0) { mov SCBPTR, SAVED_LUN; add DINDEX, SCB_64_BTT; } else { add DINDEX, BUSY_TARGETS; } mov DINDIR, ARG_1 ret; /* * Identify message? For a reconnecting target, this tells us the lun * that the reconnection is for - find the correct SCB and switch to it, * clearing the "disconnected" bit so we don't "find" it by accident later. */ mesgin_identify: /* * Determine whether a target is using tagged or non-tagged * transactions by first looking at the transaction stored in * the busy target array. If there is no untagged transaction * for this target or the transaction is for a different lun, then * this must be a tagged transaction. */ shr SINDEX, 4, SAVED_SCSIID; and SAVED_LUN, MSG_IDENTIFY_LUNMASK, A; if ((ahc->flags & AHC_SCB_BTT) != 0) { add SINDEX, SCB_64_BTT; mov SCBPTR, SAVED_LUN; if ((ahc->flags & AHC_SEQUENCER_DEBUG) != 0) { add NONE, -SCB_64_BTT, SINDEX; jc . + 2; mvi INTSTAT, OUT_OF_RANGE; nop; add NONE, -(SCB_64_BTT + 16), SINDEX; jnc . + 2; mvi INTSTAT, OUT_OF_RANGE; nop; } } else { add SINDEX, BUSY_TARGETS; if ((ahc->flags & AHC_SEQUENCER_DEBUG) != 0) { add NONE, -BUSY_TARGETS, SINDEX; jc . + 2; mvi INTSTAT, OUT_OF_RANGE; nop; add NONE, -(BUSY_TARGETS + 16), SINDEX; jnc . + 2; mvi INTSTAT, OUT_OF_RANGE; nop; } } mov ARG_1, SINDIR; cmp ARG_1, SCB_LIST_NULL je snoop_tag; if ((ahc->flags & AHC_PAGESCBS) != 0) { mov ARG_1 call findSCB; } else { mov SCBPTR, ARG_1; } if ((ahc->flags & AHC_SCB_BTT) != 0) { jmp setup_SCB_id_lun_okay; } else { /* * We only allow one untagged command per-target * at a time. So, if the lun doesn't match, look * for a tag message. */ and A, LID, SCB_LUN; cmp SAVED_LUN, A je setup_SCB_id_lun_okay; if ((ahc->flags & AHC_PAGESCBS) != 0) { /* * findSCB removes the SCB from the * disconnected list, so we must replace * it there should this SCB be for another * lun. */ call cleanup_scb; } } /* * Here we "snoop" the bus looking for a SIMPLE QUEUE TAG message. * If we get one, we use the tag returned to find the proper * SCB. With SCB paging, we must search for non-tagged * transactions since the SCB may exist in any slot. If we're not * using SCB paging, we can use the tag as the direct index to the * SCB. */ snoop_tag: if ((ahc->flags & AHC_SEQUENCER_DEBUG) != 0) { or SEQ_FLAGS, 0x80; } mov NONE,SCSIDATL; /* ACK Identify MSG */ call phase_lock; if ((ahc->flags & AHC_SEQUENCER_DEBUG) != 0) { or SEQ_FLAGS, 0x1; } cmp LASTPHASE, P_MESGIN jne not_found; if ((ahc->flags & AHC_SEQUENCER_DEBUG) != 0) { or SEQ_FLAGS, 0x2; } cmp SCSIBUSL,MSG_SIMPLE_Q_TAG jne not_found; get_tag: if ((ahc->flags & AHC_PAGESCBS) != 0) { mvi ARG_1 call inb_next; /* tag value */ mov ARG_1 call findSCB; } else { mvi ARG_1 call inb_next; /* tag value */ mov SCBPTR, ARG_1; } /* * Ensure that the SCB the tag points to is for * an SCB transaction to the reconnecting target. */ setup_SCB: if ((ahc->flags & AHC_SEQUENCER_DEBUG) != 0) { or SEQ_FLAGS, 0x4; } mov A, SCB_SCSIID; cmp SAVED_SCSIID, A jne not_found_cleanup_scb; if ((ahc->flags & AHC_SEQUENCER_DEBUG) != 0) { or SEQ_FLAGS, 0x8; } setup_SCB_id_okay: and A, LID, SCB_LUN; cmp SAVED_LUN, A jne not_found_cleanup_scb; setup_SCB_id_lun_okay: if ((ahc->flags & AHC_SEQUENCER_DEBUG) != 0) { or SEQ_FLAGS, 0x10; } test SCB_CONTROL,DISCONNECTED jz not_found_cleanup_scb; and SCB_CONTROL,~DISCONNECTED; test SCB_CONTROL, TAG_ENB jnz setup_SCB_tagged; if ((ahc->flags & AHC_SCB_BTT) != 0) { mov A, SCBPTR; } mvi ARG_1, SCB_LIST_NULL; mov SAVED_SCSIID call set_busy_target; if ((ahc->flags & AHC_SCB_BTT) != 0) { mov SCBPTR, A; } setup_SCB_tagged: clr SEQ_FLAGS; /* make note of IDENTIFY */ call set_transfer_settings; /* See if the host wants to send a message upon reconnection */ test SCB_CONTROL, MK_MESSAGE jz mesgin_done; mvi HOST_MSG call mk_mesg; jmp mesgin_done; not_found_cleanup_scb: if ((ahc->flags & AHC_PAGESCBS) != 0) { call cleanup_scb; } not_found: mvi NO_MATCH call set_seqint; jmp mesgin_done; mk_mesg: if ((ahc->features & AHC_DT) == 0) { or SCSISIGO, ATNO, LASTPHASE; } else { mvi SCSISIGO, ATNO; } mov MSG_OUT,SINDEX ret; /* * Functions to read data in Automatic PIO mode. * * According to Adaptec's documentation, an ACK is not sent on input from * the target until SCSIDATL is read from. So we wait until SCSIDATL is * latched (the usual way), then read the data byte directly off the bus * using SCSIBUSL. When we have pulled the ATN line, or we just want to * acknowledge the byte, then we do a dummy read from SCISDATL. The SCSI * spec guarantees that the target will hold the data byte on the bus until * we send our ACK. * * The assumption here is that these are called in a particular sequence, * and that REQ is already set when inb_first is called. inb_{first,next} * use the same calling convention as inb. */ inb_next_wait_perr: mvi PERR_DETECTED call set_seqint; jmp inb_next_wait; inb_next: mov NONE,SCSIDATL; /*dummy read from latch to ACK*/ inb_next_wait: /* * If there is a parity error, wait for the kernel to * see the interrupt and prepare our message response * before continuing. */ test SSTAT1, REQINIT jz inb_next_wait; test SSTAT1, SCSIPERR jnz inb_next_wait_perr; inb_next_check_phase: and LASTPHASE, PHASE_MASK, SCSISIGI; cmp LASTPHASE, P_MESGIN jne mesgin_phasemis; inb_first: mov DINDEX,SINDEX; mov DINDIR,SCSIBUSL ret; /*read byte directly from bus*/ inb_last: mov NONE,SCSIDATL ret; /*dummy read from latch to ACK*/ } if ((ahc->flags & AHC_TARGETROLE) != 0) { /* * Change to a new phase. If we are changing the state of the I/O signal, * from out to in, wait an additional data release delay before continuing. */ change_phase: /* Wait for preceding I/O session to complete. */ test SCSISIGI, ACKI jnz .; /* Change the phase */ and DINDEX, IOI, SCSISIGI; mov SCSISIGO, SINDEX; and A, IOI, SINDEX; /* * If the data direction has changed, from * out (initiator driving) to in (target driving), * we must wait at least a data release delay plus * the normal bus settle delay. [SCSI III SPI 10.11.0] */ cmp DINDEX, A je change_phase_wait; test SINDEX, IOI jz change_phase_wait; call change_phase_wait; change_phase_wait: nop; nop; nop; nop ret; /* * Send a byte to an initiator in Automatic PIO mode. */ target_outb: or SXFRCTL0, SPIOEN; test SSTAT0, SPIORDY jz .; mov SCSIDATL, SINDEX; test SSTAT0, SPIORDY jz .; and SXFRCTL0, ~SPIOEN ret; } /* * Locate a disconnected SCB by SCBID. Upon return, SCBPTR and SINDEX will * be set to the position of the SCB. If the SCB cannot be found locally, * it will be paged in from host memory. RETURN_2 stores the address of the * preceding SCB in the disconnected list which can be used to speed up * removal of the found SCB from the disconnected list. */ if ((ahc->flags & AHC_PAGESCBS) != 0) { BEGIN_CRITICAL; findSCB: mov A, SINDEX; /* Tag passed in SINDEX */ cmp DISCONNECTED_SCBH, SCB_LIST_NULL je findSCB_notFound; mov SCBPTR, DISCONNECTED_SCBH; /* Initialize SCBPTR */ mvi ARG_2, SCB_LIST_NULL; /* Head of list */ jmp findSCB_loop; findSCB_next: cmp SCB_NEXT, SCB_LIST_NULL je findSCB_notFound; mov ARG_2, SCBPTR; mov SCBPTR,SCB_NEXT; findSCB_loop: cmp SCB_TAG, A jne findSCB_next; rem_scb_from_disc_list: cmp ARG_2, SCB_LIST_NULL je rHead; mov DINDEX, SCB_NEXT; mov SINDEX, SCBPTR; mov SCBPTR, ARG_2; mov SCB_NEXT, DINDEX; mov SCBPTR, SINDEX ret; rHead: mov DISCONNECTED_SCBH,SCB_NEXT ret; END_CRITICAL; findSCB_notFound: /* * We didn't find it. Page in the SCB. */ mov ARG_1, A; /* Save tag */ mov ALLZEROS call get_free_or_disc_scb; mvi DMAPARAMS, HDMAEN|DIRECTION|FIFORESET; mov ARG_1 jmp dma_scb; } /* * Prepare the hardware to post a byte to host memory given an * index of (A + (256 * SINDEX)) and a base address of SHARED_DATA_ADDR. */ post_byte_setup: mov ARG_2, SINDEX; if ((ahc->features & AHC_CMD_CHAN) != 0) { mvi DINDEX, CCHADDR; mvi SHARED_DATA_ADDR call set_1byte_addr; mvi CCHCNT, 1; mvi CCSCBCTL, CCSCBRESET ret; } else { mvi DINDEX, HADDR; mvi SHARED_DATA_ADDR call set_1byte_addr; mvi 1 call set_hcnt; mvi DFCNTRL, FIFORESET ret; } post_byte: if ((ahc->features & AHC_CMD_CHAN) != 0) { bmov CCSCBRAM, SINDEX, 1; or CCSCBCTL, CCSCBEN|CCSCBRESET; test CCSCBCTL, CCSCBDONE jz .; clr CCSCBCTL ret; } else { mov DFDAT, SINDEX; or DFCNTRL, HDMAEN|FIFOFLUSH; jmp dma_finish; } phase_lock_perr: mvi PERR_DETECTED call set_seqint; phase_lock: /* * If there is a parity error, wait for the kernel to * see the interrupt and prepare our message response * before continuing. */ test SSTAT1, REQINIT jz phase_lock; test SSTAT1, SCSIPERR jnz phase_lock_perr; phase_lock_latch_phase: if ((ahc->features & AHC_DT) == 0) { and SCSISIGO, PHASE_MASK, SCSISIGI; } and LASTPHASE, PHASE_MASK, SCSISIGI ret; if ((ahc->features & AHC_CMD_CHAN) == 0) { set_hcnt: mov HCNT[0], SINDEX; clear_hcnt: clr HCNT[1]; clr HCNT[2] ret; set_stcnt_from_hcnt: mov STCNT[0], HCNT[0]; mov STCNT[1], HCNT[1]; mov STCNT[2], HCNT[2] ret; bcopy_8: mov DINDIR, SINDIR; bcopy_7: mov DINDIR, SINDIR; mov DINDIR, SINDIR; bcopy_5: mov DINDIR, SINDIR; bcopy_4: mov DINDIR, SINDIR; bcopy_3: mov DINDIR, SINDIR; mov DINDIR, SINDIR; mov DINDIR, SINDIR ret; } if ((ahc->flags & AHC_TARGETROLE) != 0) { /* * Setup addr assuming that A is an index into * an array of 32byte objects, SINDEX contains * the base address of that array, and DINDEX * contains the base address of the location * to store the indexed address. */ set_32byte_addr: shr ARG_2, 3, A; shl A, 5; jmp set_1byte_addr; } /* * Setup addr assuming that A is an index into * an array of 64byte objects, SINDEX contains * the base address of that array, and DINDEX * contains the base address of the location * to store the indexed address. */ set_64byte_addr: shr ARG_2, 2, A; shl A, 6; /* * Setup addr assuming that A + (ARG_2 * 256) is an * index into an array of 1byte objects, SINDEX contains * the base address of that array, and DINDEX contains * the base address of the location to store the computed * address. */ set_1byte_addr: add DINDIR, A, SINDIR; mov A, ARG_2; adc DINDIR, A, SINDIR; clr A; adc DINDIR, A, SINDIR; adc DINDIR, A, SINDIR ret; /* * Either post or fetch an SCB from host memory based on the * DIRECTION bit in DMAPARAMS. The host SCB index is in SINDEX. */ dma_scb: mov A, SINDEX; if ((ahc->features & AHC_CMD_CHAN) != 0) { mvi DINDEX, CCHADDR; mvi HSCB_ADDR call set_64byte_addr; mov CCSCBPTR, SCBPTR; test DMAPARAMS, DIRECTION jz dma_scb_tohost; if ((ahc->flags & AHC_SCB_BTT) != 0) { mvi CCHCNT, SCB_DOWNLOAD_SIZE_64; } else { mvi CCHCNT, SCB_DOWNLOAD_SIZE; } mvi CCSCBCTL, CCARREN|CCSCBEN|CCSCBDIR|CCSCBRESET; cmp CCSCBCTL, CCSCBDONE|ARRDONE|CCARREN|CCSCBEN|CCSCBDIR jne .; jmp dma_scb_finish; dma_scb_tohost: mvi CCHCNT, SCB_UPLOAD_SIZE; if ((ahc->features & AHC_ULTRA2) == 0) { mvi CCSCBCTL, CCSCBRESET; bmov CCSCBRAM, SCB_BASE, SCB_UPLOAD_SIZE; or CCSCBCTL, CCSCBEN|CCSCBRESET; test CCSCBCTL, CCSCBDONE jz .; } else if ((ahc->bugs & AHC_SCBCHAN_UPLOAD_BUG) != 0) { mvi CCSCBCTL, CCARREN|CCSCBRESET; cmp CCSCBCTL, ARRDONE|CCARREN jne .; mvi CCHCNT, SCB_UPLOAD_SIZE; mvi CCSCBCTL, CCSCBEN|CCSCBRESET; cmp CCSCBCTL, CCSCBDONE|CCSCBEN jne .; } else { mvi CCSCBCTL, CCARREN|CCSCBEN|CCSCBRESET; cmp CCSCBCTL, CCSCBDONE|ARRDONE|CCARREN|CCSCBEN jne .; } dma_scb_finish: clr CCSCBCTL; test CCSCBCTL, CCARREN|CCSCBEN jnz .; ret; } else { mvi DINDEX, HADDR; mvi HSCB_ADDR call set_64byte_addr; mvi SCB_DOWNLOAD_SIZE call set_hcnt; mov DFCNTRL, DMAPARAMS; test DMAPARAMS, DIRECTION jnz dma_scb_fromhost; /* Fill it with the SCB data */ copy_scb_tofifo: mvi SINDEX, SCB_BASE; add A, SCB_DOWNLOAD_SIZE, SINDEX; copy_scb_tofifo_loop: call copy_to_fifo_8; cmp SINDEX, A jne copy_scb_tofifo_loop; or DFCNTRL, HDMAEN|FIFOFLUSH; jmp dma_finish; dma_scb_fromhost: mvi DINDEX, SCB_BASE; if ((ahc->bugs & AHC_PCI_2_1_RETRY_BUG) != 0) { /* * The PCI module will only issue a PCI * retry if the data FIFO is empty. If the * host disconnects in the middle of a * transfer, we must empty the fifo of all * available data to force the chip to * continue the transfer. This does not * happen for SCSI transfers as the SCSI module * will drain the FIFO as data are made available. * When the hang occurs, we know that a multiple * of 8 bytes is in the FIFO because the PCI * module has an 8 byte input latch that only * dumps to the FIFO when HCNT == 0 or the * latch is full. */ clr A; /* Wait for at least 8 bytes of data to arrive. */ dma_scb_hang_fifo: test DFSTATUS, FIFOQWDEMP jnz dma_scb_hang_fifo; dma_scb_hang_wait: test DFSTATUS, MREQPEND jnz dma_scb_hang_wait; test DFSTATUS, HDONE jnz dma_scb_hang_dma_done; test DFSTATUS, HDONE jnz dma_scb_hang_dma_done; test DFSTATUS, HDONE jnz dma_scb_hang_dma_done; /* * The PCI module no longer intends to perform * a PCI transaction. Drain the fifo. */ dma_scb_hang_dma_drain_fifo: not A, HCNT; add A, SCB_DOWNLOAD_SIZE+SCB_BASE+1; and A, ~0x7; mov DINDIR,DFDAT; cmp DINDEX, A jne . - 1; cmp DINDEX, SCB_DOWNLOAD_SIZE+SCB_BASE je dma_finish_nowait; /* Restore A as the lines left to transfer. */ add A, -SCB_BASE, DINDEX; shr A, 3; jmp dma_scb_hang_fifo; dma_scb_hang_dma_done: and DFCNTRL, ~HDMAEN; test DFCNTRL, HDMAEN jnz .; add SEQADDR0, A; } else { call dma_finish; } call dfdat_in_8; call dfdat_in_8; call dfdat_in_8; dfdat_in_8: mov DINDIR,DFDAT; dfdat_in_7: mov DINDIR,DFDAT; mov DINDIR,DFDAT; mov DINDIR,DFDAT; mov DINDIR,DFDAT; mov DINDIR,DFDAT; dfdat_in_2: mov DINDIR,DFDAT; mov DINDIR,DFDAT ret; } copy_to_fifo_8: mov DFDAT,SINDIR; mov DFDAT,SINDIR; copy_to_fifo_6: mov DFDAT,SINDIR; copy_to_fifo_5: mov DFDAT,SINDIR; copy_to_fifo_4: mov DFDAT,SINDIR; mov DFDAT,SINDIR; mov DFDAT,SINDIR; mov DFDAT,SINDIR ret; /* * Wait for DMA from host memory to data FIFO to complete, then disable * DMA and wait for it to acknowledge that it's off. */ dma_finish: test DFSTATUS,HDONE jz dma_finish; dma_finish_nowait: /* Turn off DMA */ and DFCNTRL, ~HDMAEN; test DFCNTRL, HDMAEN jnz .; ret; /* * Restore an SCB that failed to match an incoming reselection * to the correct/safe state. If the SCB is for a disconnected * transaction, it must be returned to the disconnected list. * If it is not in the disconnected state, it must be free. */ cleanup_scb: if ((ahc->flags & AHC_PAGESCBS) != 0) { test SCB_CONTROL,DISCONNECTED jnz add_scb_to_disc_list; } add_scb_to_free_list: if ((ahc->flags & AHC_PAGESCBS) != 0) { BEGIN_CRITICAL; mov SCB_NEXT, FREE_SCBH; mvi SCB_TAG, SCB_LIST_NULL; mov FREE_SCBH, SCBPTR ret; END_CRITICAL; } else { mvi SCB_TAG, SCB_LIST_NULL ret; } if ((ahc->flags & AHC_39BIT_ADDRESSING) != 0) { set_hhaddr: or DSCOMMAND1, HADDLDSEL0; and HADDR, SG_HIGH_ADDR_BITS, SINDEX; and DSCOMMAND1, ~HADDLDSEL0 ret; } if ((ahc->flags & AHC_PAGESCBS) != 0) { get_free_or_disc_scb: BEGIN_CRITICAL; cmp FREE_SCBH, SCB_LIST_NULL jne dequeue_free_scb; cmp DISCONNECTED_SCBH, SCB_LIST_NULL jne dequeue_disc_scb; return_error: mvi NO_FREE_SCB call set_seqint; mvi SINDEX, SCB_LIST_NULL ret; dequeue_disc_scb: mov SCBPTR, DISCONNECTED_SCBH; mov DISCONNECTED_SCBH, SCB_NEXT; END_CRITICAL; mvi DMAPARAMS, FIFORESET; mov SCB_TAG jmp dma_scb; BEGIN_CRITICAL; dequeue_free_scb: mov SCBPTR, FREE_SCBH; mov FREE_SCBH, SCB_NEXT ret; END_CRITICAL; add_scb_to_disc_list: /* * Link this SCB into the DISCONNECTED list. This list holds the * candidates for paging out an SCB if one is needed for a new command. * Modifying the disconnected list is a critical(pause dissabled) section. */ BEGIN_CRITICAL; mov SCB_NEXT, DISCONNECTED_SCBH; mov DISCONNECTED_SCBH, SCBPTR ret; END_CRITICAL; } set_seqint: mov INTSTAT, SINDEX; nop; return: ret; Index: head/sys/dev/ath/if_ath_tdma.c =================================================================== --- head/sys/dev/ath/if_ath_tdma.c (revision 308456) +++ head/sys/dev/ath/if_ath_tdma.c (revision 308457) @@ -1,688 +1,688 @@ /*- * Copyright (c) 2002-2009 Sam Leffler, Errno Consulting * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any * redistribution must be conditioned upon including a substantially * similar Disclaimer requirement for further binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGES. */ #include __FBSDID("$FreeBSD$"); /* * Driver for the Atheros Wireless LAN controller. * * This software is derived from work of Atsushi Onoe; his contribution * is greatly appreciated. */ #include "opt_inet.h" #include "opt_ath.h" /* * This is needed for register operations which are performed * by the driver - eg, calls to ath_hal_gettsf32(). * * It's also required for any AH_DEBUG checks in here, eg the * module dependencies. */ #include "opt_ah.h" #include "opt_wlan.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for mp_ncpus */ #include #include #include #include #include #include #include #include #include #include #include #ifdef IEEE80211_SUPPORT_SUPERG #include #endif #ifdef IEEE80211_SUPPORT_TDMA #include #endif #include #ifdef INET #include #include #endif #include #include /* XXX for softled */ #include #include #include #include #include #include #include #include #include #include #include #ifdef ATH_TX99_DIAG #include #endif #ifdef ATH_DEBUG_ALQ #include #endif #ifdef IEEE80211_SUPPORT_TDMA #include static void ath_tdma_settimers(struct ath_softc *sc, u_int32_t nexttbtt, u_int32_t bintval); static void ath_tdma_bintvalsetup(struct ath_softc *sc, const struct ieee80211_tdma_state *tdma); #endif /* IEEE80211_SUPPORT_TDMA */ #ifdef IEEE80211_SUPPORT_TDMA static void ath_tdma_settimers(struct ath_softc *sc, u_int32_t nexttbtt, u_int32_t bintval) { struct ath_hal *ah = sc->sc_ah; HAL_BEACON_TIMERS bt; bt.bt_intval = bintval | HAL_BEACON_ENA; bt.bt_nexttbtt = nexttbtt; bt.bt_nextdba = (nexttbtt<<3) - sc->sc_tdmadbaprep; bt.bt_nextswba = (nexttbtt<<3) - sc->sc_tdmaswbaprep; bt.bt_nextatim = nexttbtt+1; /* Enables TBTT, DBA, SWBA timers by default */ bt.bt_flags = 0; #if 0 DPRINTF(sc, ATH_DEBUG_TDMA_TIMER, "%s: intval=%d (0x%08x) nexttbtt=%u (0x%08x), nextdba=%u (0x%08x), nextswba=%u (0x%08x),nextatim=%u (0x%08x)\n", __func__, bt.bt_intval, bt.bt_intval, bt.bt_nexttbtt, bt.bt_nexttbtt, bt.bt_nextdba, bt.bt_nextdba, bt.bt_nextswba, bt.bt_nextswba, bt.bt_nextatim, bt.bt_nextatim); #endif #ifdef ATH_DEBUG_ALQ if (if_ath_alq_checkdebug(&sc->sc_alq, ATH_ALQ_TDMA_TIMER_SET)) { struct if_ath_alq_tdma_timer_set t; t.bt_intval = htobe32(bt.bt_intval); t.bt_nexttbtt = htobe32(bt.bt_nexttbtt); t.bt_nextdba = htobe32(bt.bt_nextdba); t.bt_nextswba = htobe32(bt.bt_nextswba); t.bt_nextatim = htobe32(bt.bt_nextatim); t.bt_flags = htobe32(bt.bt_flags); t.sc_tdmadbaprep = htobe32(sc->sc_tdmadbaprep); t.sc_tdmaswbaprep = htobe32(sc->sc_tdmaswbaprep); if_ath_alq_post(&sc->sc_alq, ATH_ALQ_TDMA_TIMER_SET, sizeof(t), (char *) &t); } #endif DPRINTF(sc, ATH_DEBUG_TDMA_TIMER, "%s: nexttbtt=%u (0x%08x), nexttbtt tsf=%lld (0x%08llx)\n", __func__, bt.bt_nexttbtt, bt.bt_nexttbtt, (long long) ( ((u_int64_t) (bt.bt_nexttbtt)) << 10), (long long) ( ((u_int64_t) (bt.bt_nexttbtt)) << 10)); ath_hal_beaconsettimers(ah, &bt); } /* * Calculate the beacon interval. This is periodic in the * superframe for the bss. We assume each station is configured * identically wrt transmit rate so the guard time we calculate * above will be the same on all stations. Note we need to * factor in the xmit time because the hardware will schedule * a frame for transmit if the start of the frame is within * the burst time. When we get hardware that properly kills * frames in the PCU we can reduce/eliminate the guard time. * * Roundup to 1024 is so we have 1 TU buffer in the guard time * to deal with the granularity of the nexttbtt timer. 11n MAC's * with 1us timer granularity should allow us to reduce/eliminate * this. */ static void ath_tdma_bintvalsetup(struct ath_softc *sc, const struct ieee80211_tdma_state *tdma) { /* copy from vap state (XXX check all vaps have same value?) */ sc->sc_tdmaslotlen = tdma->tdma_slotlen; sc->sc_tdmabintval = roundup((sc->sc_tdmaslotlen+sc->sc_tdmaguard) * tdma->tdma_slotcnt, 1024); sc->sc_tdmabintval >>= 10; /* TSF -> TU */ if (sc->sc_tdmabintval & 1) sc->sc_tdmabintval++; if (tdma->tdma_slot == 0) { /* * Only slot 0 beacons; other slots respond. */ sc->sc_imask |= HAL_INT_SWBA; sc->sc_tdmaswba = 0; /* beacon immediately */ } else { /* XXX all vaps must be slot 0 or slot !0 */ sc->sc_imask &= ~HAL_INT_SWBA; } } /* * Max 802.11 overhead. This assumes no 4-address frames and * the encapsulation done by ieee80211_encap (llc). We also * include potential crypto overhead. */ #define IEEE80211_MAXOVERHEAD \ (sizeof(struct ieee80211_qosframe) \ + sizeof(struct llc) \ + IEEE80211_ADDR_LEN \ + IEEE80211_WEP_IVLEN \ + IEEE80211_WEP_KIDLEN \ + IEEE80211_WEP_CRCLEN \ + IEEE80211_WEP_MICLEN \ + IEEE80211_CRC_LEN) /* * Setup initially for tdma operation. Start the beacon * timers and enable SWBA if we are slot 0. Otherwise * we wait for slot 0 to arrive so we can sync up before * starting to transmit. */ void ath_tdma_config(struct ath_softc *sc, struct ieee80211vap *vap) { struct ath_hal *ah = sc->sc_ah; struct ieee80211com *ic = &sc->sc_ic; const struct ieee80211_txparam *tp; const struct ieee80211_tdma_state *tdma = NULL; int rix; if (vap == NULL) { vap = TAILQ_FIRST(&ic->ic_vaps); /* XXX */ if (vap == NULL) { device_printf(sc->sc_dev, "%s: no vaps?\n", __func__); return; } } /* XXX should take a locked ref to iv_bss */ tp = vap->iv_bss->ni_txparms; /* * Calculate the guard time for each slot. This is the * time to send a maximal-size frame according to the * fixed/lowest transmit rate. Note that the interface * mtu does not include the 802.11 overhead so we must * tack that on (ath_hal_computetxtime includes the - * preamble and plcp in it's calculation). + * preamble and plcp in its calculation). */ tdma = vap->iv_tdma; if (tp->ucastrate != IEEE80211_FIXED_RATE_NONE) rix = ath_tx_findrix(sc, tp->ucastrate); else rix = ath_tx_findrix(sc, tp->mcastrate); /* * If the chip supports enforcing TxOP on transmission, * we can just delete the guard window. It isn't at all required. */ if (sc->sc_hasenforcetxop) { sc->sc_tdmaguard = 0; } else { /* XXX short preamble assumed */ /* XXX non-11n rate assumed */ sc->sc_tdmaguard = ath_hal_computetxtime(ah, sc->sc_currates, vap->iv_ifp->if_mtu + IEEE80211_MAXOVERHEAD, rix, AH_TRUE, AH_TRUE); } ath_hal_intrset(ah, 0); ath_beaconq_config(sc); /* setup h/w beacon q */ if (sc->sc_setcca) ath_hal_setcca(ah, AH_FALSE); /* disable CCA */ ath_tdma_bintvalsetup(sc, tdma); /* calculate beacon interval */ ath_tdma_settimers(sc, sc->sc_tdmabintval, sc->sc_tdmabintval | HAL_BEACON_RESET_TSF); sc->sc_syncbeacon = 0; sc->sc_avgtsfdeltap = TDMA_DUMMY_MARKER; sc->sc_avgtsfdeltam = TDMA_DUMMY_MARKER; ath_hal_intrset(ah, sc->sc_imask); DPRINTF(sc, ATH_DEBUG_TDMA, "%s: slot %u len %uus cnt %u " "bsched %u guard %uus bintval %u TU dba prep %u\n", __func__, tdma->tdma_slot, tdma->tdma_slotlen, tdma->tdma_slotcnt, tdma->tdma_bintval, sc->sc_tdmaguard, sc->sc_tdmabintval, sc->sc_tdmadbaprep); #ifdef ATH_DEBUG_ALQ if (if_ath_alq_checkdebug(&sc->sc_alq, ATH_ALQ_TDMA_TIMER_CONFIG)) { struct if_ath_alq_tdma_timer_config t; t.tdma_slot = htobe32(tdma->tdma_slot); t.tdma_slotlen = htobe32(tdma->tdma_slotlen); t.tdma_slotcnt = htobe32(tdma->tdma_slotcnt); t.tdma_bintval = htobe32(tdma->tdma_bintval); t.tdma_guard = htobe32(sc->sc_tdmaguard); t.tdma_scbintval = htobe32(sc->sc_tdmabintval); t.tdma_dbaprep = htobe32(sc->sc_tdmadbaprep); if_ath_alq_post(&sc->sc_alq, ATH_ALQ_TDMA_TIMER_CONFIG, sizeof(t), (char *) &t); } #endif /* ATH_DEBUG_ALQ */ } /* * Update tdma operation. Called from the 802.11 layer * when a beacon is received from the TDMA station operating * in the slot immediately preceding us in the bss. Use * the rx timestamp for the beacon frame to update our * beacon timers so we follow their schedule. Note that * by using the rx timestamp we implicitly include the * propagation delay in our schedule. * * XXX TODO: since the changes for the AR5416 and later chips * involved changing the TSF/TU calculations, we need to make * sure that various calculations wrap consistently. * * A lot of the problems stemmed from the calculations wrapping * at 65,535 TU. Since a lot of the math is still being done in * TU, please audit it to ensure that when the TU values programmed * into the timers wrap at (2^31)-1 TSF, all the various terms * wrap consistently. */ void ath_tdma_update(struct ieee80211_node *ni, const struct ieee80211_tdma_param *tdma, int changed) { #define TSF_TO_TU(_h,_l) \ ((((u_int32_t)(_h)) << 22) | (((u_int32_t)(_l)) >> 10)) #define TU_TO_TSF(_tu) (((u_int64_t)(_tu)) << 10) struct ieee80211vap *vap = ni->ni_vap; struct ieee80211com *ic = ni->ni_ic; struct ath_softc *sc = ic->ic_softc; struct ath_hal *ah = sc->sc_ah; const HAL_RATE_TABLE *rt = sc->sc_currates; u_int64_t tsf, rstamp, nextslot, nexttbtt, nexttbtt_full; u_int32_t txtime, nextslottu; int32_t tudelta, tsfdelta; const struct ath_rx_status *rs; int rix; sc->sc_stats.ast_tdma_update++; /* * Check for and adopt configuration changes. */ if (changed != 0) { const struct ieee80211_tdma_state *ts = vap->iv_tdma; ath_tdma_bintvalsetup(sc, ts); if (changed & TDMA_UPDATE_SLOTLEN) ath_wme_update(ic); DPRINTF(sc, ATH_DEBUG_TDMA, "%s: adopt slot %u slotcnt %u slotlen %u us " "bintval %u TU\n", __func__, ts->tdma_slot, ts->tdma_slotcnt, ts->tdma_slotlen, sc->sc_tdmabintval); /* XXX right? */ ath_hal_intrset(ah, sc->sc_imask); /* NB: beacon timers programmed below */ } /* extend rx timestamp to 64 bits */ rs = sc->sc_lastrs; tsf = ath_hal_gettsf64(ah); rstamp = ath_extend_tsf(sc, rs->rs_tstamp, tsf); /* * The rx timestamp is set by the hardware on completing * reception (at the point where the rx descriptor is DMA'd * to the host). To find the start of our next slot we * must adjust this time by the time required to send * the packet just received. */ rix = rt->rateCodeToIndex[rs->rs_rate]; /* * To calculate the packet duration for legacy rates, we * only need the rix and preamble. * * For 11n non-aggregate frames, we also need the channel * width and short/long guard interval. * * For 11n aggregate frames, the required hacks are a little * more subtle. You need to figure out the frame duration * for each frame, including the delimiters. However, when * a frame isn't received successfully, we won't hear it * (unless you enable reception of CRC errored frames), so * your duration calculation is going to be off. * * However, we can assume that the beacon frames won't be * transmitted as aggregate frames, so we should be okay. * Just add a check to ensure that we aren't handed something * bad. * * For ath_hal_pkt_txtime() - for 11n rates, shortPreamble is * actually short guard interval. For legacy rates, * it's short preamble. */ txtime = ath_hal_pkt_txtime(ah, rt, rs->rs_datalen, rix, !! (rs->rs_flags & HAL_RX_2040), (rix & 0x80) ? (! (rs->rs_flags & HAL_RX_GI)) : rt->info[rix].shortPreamble, AH_TRUE); /* NB: << 9 is to cvt to TU and /2 */ nextslot = (rstamp - txtime) + (sc->sc_tdmabintval << 9); /* * For 802.11n chips: nextslottu needs to be the full TSF space, * not just 0..65535 TU. */ nextslottu = TSF_TO_TU(nextslot>>32, nextslot); /* * Retrieve the hardware NextTBTT in usecs * and calculate the difference between what the * other station thinks and what we have programmed. This * lets us figure how to adjust our timers to match. The * adjustments are done by pulling the TSF forward and possibly * rewriting the beacon timers. */ /* * The logic here assumes the nexttbtt counter is in TSF * but the prr-11n NICs are in TU. The HAL shifts them * to TSF but there's two important differences: * * + The TU->TSF values have 0's for the low 9 bits, and * + The counter wraps at TU_TO_TSF(HAL_BEACON_PERIOD + 1) for * the pre-11n NICs, but not for the 11n NICs. * * So for now, just make sure the nexttbtt value we get * matches the second issue or once nexttbtt exceeds this * value, tsfdelta ends up becoming very negative and all * of the adjustments get very messed up. */ /* * We need to track the full nexttbtt rather than having it * truncated at HAL_BEACON_PERIOD, as programming the * nexttbtt (and related) registers for the 11n chips is * actually going to take the full 32 bit space, rather than * just 0..65535 TU. */ nexttbtt_full = ath_hal_getnexttbtt(ah); nexttbtt = nexttbtt_full % (TU_TO_TSF(HAL_BEACON_PERIOD + 1)); tsfdelta = (int32_t)((nextslot % TU_TO_TSF(HAL_BEACON_PERIOD + 1)) - nexttbtt); DPRINTF(sc, ATH_DEBUG_TDMA_TIMER, "rs->rstamp %llu rstamp %llu tsf %llu txtime %d, nextslot %llu, " "nextslottu %d, nextslottume %d\n", (unsigned long long) rs->rs_tstamp, (unsigned long long) rstamp, (unsigned long long) tsf, txtime, (unsigned long long) nextslot, nextslottu, TSF_TO_TU(nextslot >> 32, nextslot)); DPRINTF(sc, ATH_DEBUG_TDMA, " beacon tstamp: %llu (0x%016llx)\n", (unsigned long long) le64toh(ni->ni_tstamp.tsf), (unsigned long long) le64toh(ni->ni_tstamp.tsf)); DPRINTF(sc, ATH_DEBUG_TDMA_TIMER, "nexttbtt %llu (0x%08llx) tsfdelta %d avg +%d/-%d\n", (unsigned long long) nexttbtt, (long long) nexttbtt, tsfdelta, TDMA_AVG(sc->sc_avgtsfdeltap), TDMA_AVG(sc->sc_avgtsfdeltam)); if (tsfdelta < 0) { TDMA_SAMPLE(sc->sc_avgtsfdeltap, 0); TDMA_SAMPLE(sc->sc_avgtsfdeltam, -tsfdelta); tsfdelta = -tsfdelta % 1024; nextslottu++; } else if (tsfdelta > 0) { TDMA_SAMPLE(sc->sc_avgtsfdeltap, tsfdelta); TDMA_SAMPLE(sc->sc_avgtsfdeltam, 0); tsfdelta = 1024 - (tsfdelta % 1024); nextslottu++; } else { TDMA_SAMPLE(sc->sc_avgtsfdeltap, 0); TDMA_SAMPLE(sc->sc_avgtsfdeltam, 0); } tudelta = nextslottu - TSF_TO_TU(nexttbtt_full >> 32, nexttbtt_full); #ifdef ATH_DEBUG_ALQ if (if_ath_alq_checkdebug(&sc->sc_alq, ATH_ALQ_TDMA_BEACON_STATE)) { struct if_ath_alq_tdma_beacon_state t; t.rx_tsf = htobe64(rstamp); t.beacon_tsf = htobe64(le64toh(ni->ni_tstamp.tsf)); t.tsf64 = htobe64(tsf); t.nextslot_tsf = htobe64(nextslot); t.nextslot_tu = htobe32(nextslottu); t.txtime = htobe32(txtime); if_ath_alq_post(&sc->sc_alq, ATH_ALQ_TDMA_BEACON_STATE, sizeof(t), (char *) &t); } if (if_ath_alq_checkdebug(&sc->sc_alq, ATH_ALQ_TDMA_SLOT_CALC)) { struct if_ath_alq_tdma_slot_calc t; t.nexttbtt = htobe64(nexttbtt_full); t.next_slot = htobe64(nextslot); t.tsfdelta = htobe32(tsfdelta); t.avg_plus = htobe32(TDMA_AVG(sc->sc_avgtsfdeltap)); t.avg_minus = htobe32(TDMA_AVG(sc->sc_avgtsfdeltam)); if_ath_alq_post(&sc->sc_alq, ATH_ALQ_TDMA_SLOT_CALC, sizeof(t), (char *) &t); } #endif /* * Copy sender's timetstamp into tdma ie so they can * calculate roundtrip time. We submit a beacon frame * below after any timer adjustment. The frame goes out * at the next TBTT so the sender can calculate the * roundtrip by inspecting the tdma ie in our beacon frame. * * NB: This tstamp is subtlely preserved when * IEEE80211_BEACON_TDMA is marked (e.g. when the * slot position changes) because ieee80211_add_tdma * skips over the data. */ memcpy(vap->iv_bcn_off.bo_tdma + __offsetof(struct ieee80211_tdma_param, tdma_tstamp), &ni->ni_tstamp.data, 8); #if 0 DPRINTF(sc, ATH_DEBUG_TDMA_TIMER, "tsf %llu nextslot %llu (%d, %d) nextslottu %u nexttbtt %llu (%d)\n", (unsigned long long) tsf, (unsigned long long) nextslot, (int)(nextslot - tsf), tsfdelta, nextslottu, nexttbtt, tudelta); #endif /* * Adjust the beacon timers only when pulling them forward * or when going back by less than the beacon interval. * Negative jumps larger than the beacon interval seem to * cause the timers to stop and generally cause instability. * This basically filters out jumps due to missed beacons. */ if (tudelta != 0 && (tudelta > 0 || -tudelta < sc->sc_tdmabintval)) { DPRINTF(sc, ATH_DEBUG_TDMA_TIMER, "%s: calling ath_tdma_settimers; nextslottu=%d, bintval=%d\n", __func__, nextslottu, sc->sc_tdmabintval); ath_tdma_settimers(sc, nextslottu, sc->sc_tdmabintval); sc->sc_stats.ast_tdma_timers++; } if (tsfdelta > 0) { uint64_t tsf; /* XXX should just teach ath_hal_adjusttsf() to do this */ tsf = ath_hal_gettsf64(ah); ath_hal_settsf64(ah, tsf + tsfdelta); DPRINTF(sc, ATH_DEBUG_TDMA_TIMER, "%s: calling ath_hal_adjusttsf: TSF=%llu, tsfdelta=%d\n", __func__, (unsigned long long) tsf, tsfdelta); #ifdef ATH_DEBUG_ALQ if (if_ath_alq_checkdebug(&sc->sc_alq, ATH_ALQ_TDMA_TSF_ADJUST)) { struct if_ath_alq_tdma_tsf_adjust t; t.tsfdelta = htobe32(tsfdelta); t.tsf64_old = htobe64(tsf); t.tsf64_new = htobe64(tsf + tsfdelta); if_ath_alq_post(&sc->sc_alq, ATH_ALQ_TDMA_TSF_ADJUST, sizeof(t), (char *) &t); } #endif /* ATH_DEBUG_ALQ */ sc->sc_stats.ast_tdma_tsf++; } ath_tdma_beacon_send(sc, vap); /* prepare response */ #undef TU_TO_TSF #undef TSF_TO_TU } /* * Transmit a beacon frame at SWBA. Dynamic updates * to the frame contents are done as needed. */ void ath_tdma_beacon_send(struct ath_softc *sc, struct ieee80211vap *vap) { struct ath_hal *ah = sc->sc_ah; struct ath_buf *bf; int otherant; /* * Check if the previous beacon has gone out. If * not don't try to post another, skip this period * and wait for the next. Missed beacons indicate * a problem and should not occur. If we miss too * many consecutive beacons reset the device. */ if (ath_hal_numtxpending(ah, sc->sc_bhalq) != 0) { sc->sc_bmisscount++; DPRINTF(sc, ATH_DEBUG_BEACON, "%s: missed %u consecutive beacons\n", __func__, sc->sc_bmisscount); if (sc->sc_bmisscount >= ath_bstuck_threshold) taskqueue_enqueue(sc->sc_tq, &sc->sc_bstucktask); return; } if (sc->sc_bmisscount != 0) { DPRINTF(sc, ATH_DEBUG_BEACON, "%s: resume beacon xmit after %u misses\n", __func__, sc->sc_bmisscount); sc->sc_bmisscount = 0; } /* * Check recent per-antenna transmit statistics and flip * the default antenna if noticeably more frames went out * on the non-default antenna. * XXX assumes 2 anntenae */ if (!sc->sc_diversity) { otherant = sc->sc_defant & 1 ? 2 : 1; if (sc->sc_ant_tx[otherant] > sc->sc_ant_tx[sc->sc_defant] + 2) ath_setdefantenna(sc, otherant); sc->sc_ant_tx[1] = sc->sc_ant_tx[2] = 0; } bf = ath_beacon_generate(sc, vap); /* XXX We don't do cabq traffic, but just for completeness .. */ ATH_TXQ_LOCK(sc->sc_cabq); ath_beacon_cabq_start(sc); ATH_TXQ_UNLOCK(sc->sc_cabq); if (bf != NULL) { /* * Stop any current dma and put the new frame on the queue. * This should never fail since we check above that no frames * are still pending on the queue. */ if ((! sc->sc_isedma) && (! ath_hal_stoptxdma(ah, sc->sc_bhalq))) { DPRINTF(sc, ATH_DEBUG_ANY, "%s: beacon queue %u did not stop?\n", __func__, sc->sc_bhalq); /* NB: the HAL still stops DMA, so proceed */ } ath_hal_puttxbuf(ah, sc->sc_bhalq, bf->bf_daddr); ath_hal_txstart(ah, sc->sc_bhalq); sc->sc_stats.ast_be_xmit++; /* XXX per-vap? */ /* * Record local TSF for our last send for use * in arbitrating slot collisions. */ /* XXX should take a locked ref to iv_bss */ vap->iv_bss->ni_tstamp.tsf = ath_hal_gettsf64(ah); } } #endif /* IEEE80211_SUPPORT_TDMA */ Index: head/sys/dev/isci/scil/sati_design.h =================================================================== --- head/sys/dev/isci/scil/sati_design.h (revision 308456) +++ head/sys/dev/isci/scil/sati_design.h (revision 308457) @@ -1,169 +1,169 @@ /*- * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * * Copyright(c) 2008 - 2011 Intel Corporation. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. * The full GNU General Public License is included in this distribution * in the file called LICENSE.GPL. * * BSD LICENSE * * Copyright(c) 2008 - 2011 Intel Corporation. All rights reserved. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SATI_DESIGN_H_ #define _SATI_DESIGN_H_ /** @page sati_design_page SATI High Level Design Authors: - Nathan Marushak @section scif_sas_scope_and_audience Scope and Audience This document provides design information relating to the SCSI to ATA Translation Implementation (SATI). Driver developers are the primary audience for this document. The reader is expected to have an understanding of SCSI (Simple Computer Storage Interface), ATA (Advanced Technology Attachment), and SAT (SCSI-to-ATA Translation). Please refer to www.t10.org for specifications relating to SCSI and SAT. Please refer to www.t13.org for specifications relating to ATA. @section overview Overview SATI provides environment agnostic functionality for translating SCSI commands, data, and responses into ATA commands, data, and responses. As a result, in some instances the user must fill out callbacks to set data. This ensures that user isn't forced to have to copy the data an additional time due to memory access restrictions. SATI complies with the t10 SAT specification where possible. In cases where there are variances the design and implementation will make note. Additionally, for parameters, pages, functionality, or commands for which SATI is unable to translate, SATI will return sense data indicating INVALID FIELD IN CDB. SATI has two primary entry points from which the user can enter: - sati_translate_command() - sati_translate_response() (this method performs data translation). Additionally, SATI provides a means through which the user can query to determine the t10 specification revision with which SATI is compliant. For more information please refer to: - sati_get_sat_compliance_version() - sati_get_sat_compliance_version_revision() @section sati_definitions Definitions - scsi_io: The SCSI IO is considered to be the user's SCSI IO request object (e.g. the windows driver IO request object and SRB). It is passed back to the user via callback methods to retrieve required SCSI information (e.g. CDB, response IU address, etc.). The SCSI IO is just a cookie and can represent any value the caller desires, but the user must be able to utilize this value when it is passed back through callback methods during translation. - ata_io: The ATA IO is considered to be the user's ATA IO request object. If you are utilizing the SCI Framework, then the SCI Framework is the ATA IO. The ATA IO is just a cookie and can represent any value the caller desires, but the user must be able to utilize this value when it is passed back through callback methods during translation. @section sati_use_cases Use Cases The SCSI Primary Command (SPC) set is comprised of commands that are valid for all device types defined in SCSI. Some of these commands have sub-commands or parameter data defined in another specification (e.g. SBC, SAT). These separate sub-commands or parameter data are captured in the SPC use case diagram for simplicity. @note - For simplicify the association between the actor and the use cases has not been drawn, but is assumed. - The use cases in green indicate the use case has been implemented in source. @image html Use_Case_Diagram__SATI__SATI_-_SPC.jpg "SCSI Primary Command Translation Use Cases" The SCSI Block Command (SBC) set is comprised of commands that are valid for block devices (e.g. disks). @image html Use_Case_Diagram__SATI__SATI_-_SBC.jpg "SCSI Block Command Translation Use Cases" -The SCSI-to-ATA Translation (SAT) specification defines a few of it's own +The SCSI-to-ATA Translation (SAT) specification defines a few of its own commands, parameter data, and log pages. This use case diagram, however, only captures the SAT specific commands being translated. @image html Use_Case_Diagram__SATI__SATI_-_SAT_Specific.jpg "SCSI-to-ATA Translation Specific Use Cases" @section sati_class_hierarchy Class Hierarchy @image html Class_Diagram__SATI__Class_Diagram.jpg "SATI Class Diagram" @section sati_sequences Sequence Diagrams @note These sequence diagrams are currently a little out of date. An update is required. This sequence diagram simply depicts the high-level translation sequence to be followed for command translations. @image html Sequence_Diagram__General_Cmd_Translation_Sequence__General_Cmd_Translation_Sequence.jpg "General Command Translation Sequence" This sequence diagram simply depicts the high-level translation sequence to be followed for response translations. @image html Sequence_Diagram__General_Rsp_Translation_Sequence__General_Rsp_Translation_Sequence.jpg "General Response Translation Sequence" This sequence diagram simply depicts the high-level translation sequence to be followed for data translations. Some SCSI commands such as READ CAPACITY, INQUIRY, etc. have payload data associated with them. As a result, it is necessary for the ATA payload data to be translated to meet the expected SCSI output. @image html Sequence_Diagram__General_Data_Translation_Sequence__General_Data_Translation_Sequence.jpg "General Data Translation Sequence" */ #endif // _SATI_DESIGN_H_ Index: head/sys/dev/isci/scil/sci_base_controller.h =================================================================== --- head/sys/dev/isci/scil/sci_base_controller.h (revision 308456) +++ head/sys/dev/isci/scil/sci_base_controller.h (revision 308457) @@ -1,347 +1,347 @@ /*- * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * * Copyright(c) 2008 - 2011 Intel Corporation. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. * The full GNU General Public License is included in this distribution * in the file called LICENSE.GPL. * * BSD LICENSE * * Copyright(c) 2008 - 2011 Intel Corporation. All rights reserved. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SCI_BASE_CONTROLLER_H_ #define _SCI_BASE_CONTROLLER_H_ /** * @file * * @brief This file contains all of the structures, constants, and methods * common to all controller object definitions. */ #ifdef __cplusplus extern "C" { #endif // __cplusplus #include #include #include #include #include #include #include #include /** * @enum SCI_BASE_CONTROLLER_STATES * * @brief This enumeration depicts all the states for the common controller * state machine. */ typedef enum _SCI_BASE_CONTROLLER_STATES { /** * Simply the initial state for the base controller state machine. */ SCI_BASE_CONTROLLER_STATE_INITIAL = 0, /** * This state indicates that the controller is reset. The memory for - * the controller is in it's initial state, but the controller requires + * the controller is in its initial state, but the controller requires * initialization. * This state is entered from the INITIAL state. * This state is entered from the RESETTING state. */ SCI_BASE_CONTROLLER_STATE_RESET, /** * This state is typically an action state that indicates the controller * is in the process of initialization. In this state no new IO operations * are permitted. * This state is entered from the RESET state. */ SCI_BASE_CONTROLLER_STATE_INITIALIZING, /** * This state indicates that the controller has been successfully * initialized. In this state no new IO operations are permitted. * This state is entered from the INITIALIZING state. */ SCI_BASE_CONTROLLER_STATE_INITIALIZED, /** * This state indicates the controller is in the process of becoming * ready (i.e. starting). In this state no new IO operations are permitted. * This state is entered from the INITIALIZED state. */ SCI_BASE_CONTROLLER_STATE_STARTING, /** * This state indicates the controller is now ready. Thus, the user * is able to perform IO operations on the controller. * This state is entered from the STARTING state. */ SCI_BASE_CONTROLLER_STATE_READY, /** * This state is typically an action state that indicates the controller * is in the process of resetting. Thus, the user is unable to perform * IO operations on the controller. A reset is considered destructive in * most cases. * This state is entered from the READY state. * This state is entered from the FAILED state. * This state is entered from the STOPPED state. */ SCI_BASE_CONTROLLER_STATE_RESETTING, /** * This state indicates that the controller is in the process of stopping. * In this state no new IO operations are permitted, but existing IO * operations are allowed to complete. * This state is entered from the READY state. */ SCI_BASE_CONTROLLER_STATE_STOPPING, /** * This state indicates that the controller has successfully been stopped. * In this state no new IO operations are permitted. * This state is entered from the STOPPING state. */ SCI_BASE_CONTROLLER_STATE_STOPPED, /** * This state indicates that the controller could not successfully be * initialized. In this state no new IO operations are permitted. * This state is entered from the INITIALIZING state. * This state is entered from the STARTING state. * This state is entered from the STOPPING state. * This state is entered from the RESETTING state. */ SCI_BASE_CONTROLLER_STATE_FAILED, SCI_BASE_CONTROLLER_MAX_STATES } SCI_BASE_CONTROLLER_STATES; /** * @struct SCI_BASE_CONTROLLER * * @brief The base controller object abstracts the fields common to all * SCI controller objects. */ typedef struct SCI_BASE_CONTROLLER { /** * The field specifies that the parent object for the base controller * is the base object itself. */ SCI_BASE_OBJECT_T parent; /** * This field points to the memory descriptor list associated with this * controller. The MDL indicates the memory requirements necessary for * this controller object. */ SCI_BASE_MEMORY_DESCRIPTOR_LIST_T mdl; /** * This field records the fact that the controller has encountered a fatal memory * error and controller must stay in failed state. */ U8 error; /** * This field contains the information for the base controller state * machine. */ SCI_BASE_STATE_MACHINE_T state_machine; #ifdef SCI_LOGGING SCI_BASE_STATE_MACHINE_LOGGER_T state_machine_logger; #endif // SCI_LOGGING } SCI_BASE_CONTROLLER_T; // Forward declarations struct SCI_BASE_REMOTE_DEVICE; struct SCI_BASE_REQUEST; typedef SCI_STATUS (*SCI_BASE_CONTROLLER_HANDLER_T)( SCI_BASE_CONTROLLER_T * ); typedef SCI_STATUS (*SCI_BASE_CONTROLLER_TIMED_HANDLER_T)( SCI_BASE_CONTROLLER_T *, U32 ); typedef SCI_STATUS (*SCI_BASE_CONTROLLER_REQUEST_HANDLER_T)( SCI_BASE_CONTROLLER_T *, struct SCI_BASE_REMOTE_DEVICE *, struct SCI_BASE_REQUEST * ); typedef SCI_STATUS (*SCI_BASE_CONTROLLER_START_REQUEST_HANDLER_T)( SCI_BASE_CONTROLLER_T *, struct SCI_BASE_REMOTE_DEVICE *, struct SCI_BASE_REQUEST *, U16 ); /** * @struct SCI_BASE_CONTROLLER_STATE_HANDLER * * @brief This structure contains all of the state handler methods common to * base controller state machines. Handler methods provide the ability * to change the behavior for user requests or transitions depending * on the state the machine is in. */ typedef struct SCI_BASE_CONTROLLER_STATE_HANDLER { /** * The start_handler specifies the method invoked when a user attempts to * start a controller. */ SCI_BASE_CONTROLLER_TIMED_HANDLER_T start_handler; /** * The stop_handler specifies the method invoked when a user attempts to * stop a controller. */ SCI_BASE_CONTROLLER_TIMED_HANDLER_T stop_handler; /** * The reset_handler specifies the method invoked when a user attempts to * reset a controller. */ SCI_BASE_CONTROLLER_HANDLER_T reset_handler; /** * The initialize_handler specifies the method invoked when a user * attempts to initialize a controller. */ SCI_BASE_CONTROLLER_HANDLER_T initialize_handler; /** * The start_io_handler specifies the method invoked when a user * attempts to start an IO request for a controller. */ SCI_BASE_CONTROLLER_START_REQUEST_HANDLER_T start_io_handler; /** * The start_internal_request_handler specifies the method invoked when a user * attempts to start an internal request for a controller. */ SCI_BASE_CONTROLLER_START_REQUEST_HANDLER_T start_high_priority_io_handler; /** * The complete_io_handler specifies the method invoked when a user * attempts to complete an IO request for a controller. */ SCI_BASE_CONTROLLER_REQUEST_HANDLER_T complete_io_handler; /** * The complete_high_priority_io_handler specifies the method invoked when a user * attempts to complete a high priority IO request for a controller. */ SCI_BASE_CONTROLLER_REQUEST_HANDLER_T complete_high_priority_io_handler; /** * The continue_io_handler specifies the method invoked when a user * attempts to continue an IO request for a controller. */ SCI_BASE_CONTROLLER_REQUEST_HANDLER_T continue_io_handler; /** * The start_task_handler specifies the method invoked when a user * attempts to start a task management request for a controller. */ SCI_BASE_CONTROLLER_START_REQUEST_HANDLER_T start_task_handler; /** * The complete_task_handler specifies the method invoked when a user * attempts to complete a task management request for a controller. */ SCI_BASE_CONTROLLER_REQUEST_HANDLER_T complete_task_handler; } SCI_BASE_CONTROLLER_STATE_HANDLER_T; /** * @brief Construct the base controller * * @param[in] this_controller This parameter specifies the base controller * to be constructed. * @param[in] logger This parameter specifies the logger associated with * this base controller object. * @param[in] state_table This parameter specifies the table of state * definitions to be utilized for the controller state machine. * @param[in] mde_array This parameter specifies the array of memory * descriptor entries to be managed by this list. * @param[in] mde_array_length This parameter specifies the size of the * array of entries. * @param[in] next_mdl This parameter specifies a subsequent MDL object * to be managed by this MDL object. * @param[in] oem_parameters This parameter specifies the original * equipment manufacturer parameters to be utilized by this * controller object. * * @return none */ void sci_base_controller_construct( SCI_BASE_CONTROLLER_T * this_controller, SCI_BASE_LOGGER_T * logger, SCI_BASE_STATE_T * state_table, SCI_PHYSICAL_MEMORY_DESCRIPTOR_T * mdes, U32 mde_count, SCI_MEMORY_DESCRIPTOR_LIST_HANDLE_T next_mdl ); #ifdef __cplusplus } #endif // __cplusplus #endif // _SCI_BASE_CONTROLLER_H_ Index: head/sys/dev/sfxge/common/efx_mcdi.c =================================================================== --- head/sys/dev/sfxge/common/efx_mcdi.c (revision 308456) +++ head/sys/dev/sfxge/common/efx_mcdi.c (revision 308457) @@ -1,2288 +1,2288 @@ /*- * Copyright (c) 2008-2016 Solarflare Communications Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * The views and conclusions contained in the software and documentation are * those of the authors and should not be interpreted as representing official * policies, either expressed or implied, of the FreeBSD Project. */ #include __FBSDID("$FreeBSD$"); #include "efx.h" #include "efx_impl.h" #if EFSYS_OPT_MCDI /* * There are three versions of the MCDI interface: * - MCDIv0: Siena BootROM. Transport uses MCDIv1 headers. * - MCDIv1: Siena firmware and Huntington BootROM. * - MCDIv2: EF10 firmware (Huntington/Medford) and Medford BootROM. * Transport uses MCDIv2 headers. * * MCDIv2 Header NOT_EPOCH flag * ---------------------------- * A new epoch begins at initial startup or after an MC reboot, and defines when * the MC should reject stale MCDI requests. * * The first MCDI request sent by the host should contain NOT_EPOCH=0, and all * subsequent requests (until the next MC reboot) should contain NOT_EPOCH=1. * * After rebooting the MC will fail all requests with NOT_EPOCH=1 by writing a * response with ERROR=1 and DATALEN=0 until a request is seen with NOT_EPOCH=0. */ #if EFSYS_OPT_SIENA static const efx_mcdi_ops_t __efx_mcdi_siena_ops = { siena_mcdi_init, /* emco_init */ siena_mcdi_send_request, /* emco_send_request */ siena_mcdi_poll_reboot, /* emco_poll_reboot */ siena_mcdi_poll_response, /* emco_poll_response */ siena_mcdi_read_response, /* emco_read_response */ siena_mcdi_fini, /* emco_fini */ siena_mcdi_feature_supported, /* emco_feature_supported */ }; #endif /* EFSYS_OPT_SIENA */ #if EFSYS_OPT_HUNTINGTON || EFSYS_OPT_MEDFORD static const efx_mcdi_ops_t __efx_mcdi_ef10_ops = { ef10_mcdi_init, /* emco_init */ ef10_mcdi_send_request, /* emco_send_request */ ef10_mcdi_poll_reboot, /* emco_poll_reboot */ ef10_mcdi_poll_response, /* emco_poll_response */ ef10_mcdi_read_response, /* emco_read_response */ ef10_mcdi_fini, /* emco_fini */ ef10_mcdi_feature_supported, /* emco_feature_supported */ }; #endif /* EFSYS_OPT_HUNTINGTON || EFSYS_OPT_MEDFORD */ __checkReturn efx_rc_t efx_mcdi_init( __in efx_nic_t *enp, __in const efx_mcdi_transport_t *emtp) { const efx_mcdi_ops_t *emcop; efx_rc_t rc; EFSYS_ASSERT3U(enp->en_magic, ==, EFX_NIC_MAGIC); EFSYS_ASSERT3U(enp->en_mod_flags, ==, 0); switch (enp->en_family) { #if EFSYS_OPT_SIENA case EFX_FAMILY_SIENA: emcop = &__efx_mcdi_siena_ops; break; #endif /* EFSYS_OPT_SIENA */ #if EFSYS_OPT_HUNTINGTON case EFX_FAMILY_HUNTINGTON: emcop = &__efx_mcdi_ef10_ops; break; #endif /* EFSYS_OPT_HUNTINGTON */ #if EFSYS_OPT_MEDFORD case EFX_FAMILY_MEDFORD: emcop = &__efx_mcdi_ef10_ops; break; #endif /* EFSYS_OPT_MEDFORD */ default: EFSYS_ASSERT(0); rc = ENOTSUP; goto fail1; } if (enp->en_features & EFX_FEATURE_MCDI_DMA) { /* MCDI requires a DMA buffer in host memory */ if ((emtp == NULL) || (emtp->emt_dma_mem) == NULL) { rc = EINVAL; goto fail2; } } enp->en_mcdi.em_emtp = emtp; if (emcop != NULL && emcop->emco_init != NULL) { if ((rc = emcop->emco_init(enp, emtp)) != 0) goto fail3; } enp->en_mcdi.em_emcop = emcop; enp->en_mod_flags |= EFX_MOD_MCDI; return (0); fail3: EFSYS_PROBE(fail3); fail2: EFSYS_PROBE(fail2); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); enp->en_mcdi.em_emcop = NULL; enp->en_mcdi.em_emtp = NULL; enp->en_mod_flags &= ~EFX_MOD_MCDI; return (rc); } void efx_mcdi_fini( __in efx_nic_t *enp) { efx_mcdi_iface_t *emip = &(enp->en_mcdi.em_emip); const efx_mcdi_ops_t *emcop = enp->en_mcdi.em_emcop; EFSYS_ASSERT3U(enp->en_magic, ==, EFX_NIC_MAGIC); EFSYS_ASSERT3U(enp->en_mod_flags, ==, EFX_MOD_MCDI); if (emcop != NULL && emcop->emco_fini != NULL) emcop->emco_fini(enp); emip->emi_port = 0; emip->emi_aborted = 0; enp->en_mcdi.em_emcop = NULL; enp->en_mod_flags &= ~EFX_MOD_MCDI; } void efx_mcdi_new_epoch( __in efx_nic_t *enp) { efx_mcdi_iface_t *emip = &(enp->en_mcdi.em_emip); int state; /* Start a new epoch (allow fresh MCDI requests to succeed) */ EFSYS_LOCK(enp->en_eslp, state); emip->emi_new_epoch = B_TRUE; EFSYS_UNLOCK(enp->en_eslp, state); } static void efx_mcdi_send_request( __in efx_nic_t *enp, __in void *hdrp, __in size_t hdr_len, __in void *sdup, __in size_t sdu_len) { const efx_mcdi_ops_t *emcop = enp->en_mcdi.em_emcop; emcop->emco_send_request(enp, hdrp, hdr_len, sdup, sdu_len); } static efx_rc_t efx_mcdi_poll_reboot( __in efx_nic_t *enp) { const efx_mcdi_ops_t *emcop = enp->en_mcdi.em_emcop; efx_rc_t rc; rc = emcop->emco_poll_reboot(enp); return (rc); } static boolean_t efx_mcdi_poll_response( __in efx_nic_t *enp) { const efx_mcdi_ops_t *emcop = enp->en_mcdi.em_emcop; boolean_t available; available = emcop->emco_poll_response(enp); return (available); } static void efx_mcdi_read_response( __in efx_nic_t *enp, __out void *bufferp, __in size_t offset, __in size_t length) { const efx_mcdi_ops_t *emcop = enp->en_mcdi.em_emcop; emcop->emco_read_response(enp, bufferp, offset, length); } void efx_mcdi_request_start( __in efx_nic_t *enp, __in efx_mcdi_req_t *emrp, __in boolean_t ev_cpl) { #if EFSYS_OPT_MCDI_LOGGING const efx_mcdi_transport_t *emtp = enp->en_mcdi.em_emtp; #endif efx_mcdi_iface_t *emip = &(enp->en_mcdi.em_emip); efx_dword_t hdr[2]; size_t hdr_len; unsigned int max_version; unsigned int seq; unsigned int xflags; boolean_t new_epoch; int state; EFSYS_ASSERT3U(enp->en_magic, ==, EFX_NIC_MAGIC); EFSYS_ASSERT3U(enp->en_mod_flags, &, EFX_MOD_MCDI); EFSYS_ASSERT3U(enp->en_features, &, EFX_FEATURE_MCDI); /* * efx_mcdi_request_start() is naturally serialised against both * efx_mcdi_request_poll() and efx_mcdi_ev_cpl()/efx_mcdi_ev_death(), * by virtue of there only being one outstanding MCDI request. * Unfortunately, upper layers may also call efx_mcdi_request_abort() * at any time, to timeout a pending mcdi request, That request may * then subsequently complete, meaning efx_mcdi_ev_cpl() or * efx_mcdi_ev_death() may end up running in parallel with * efx_mcdi_request_start(). This race is handled by ensuring that * %emi_pending_req, %emi_ev_cpl and %emi_seq are protected by the * en_eslp lock. */ EFSYS_LOCK(enp->en_eslp, state); EFSYS_ASSERT(emip->emi_pending_req == NULL); emip->emi_pending_req = emrp; emip->emi_ev_cpl = ev_cpl; emip->emi_poll_cnt = 0; seq = emip->emi_seq++ & EFX_MASK32(MCDI_HEADER_SEQ); new_epoch = emip->emi_new_epoch; max_version = emip->emi_max_version; EFSYS_UNLOCK(enp->en_eslp, state); xflags = 0; if (ev_cpl) xflags |= MCDI_HEADER_XFLAGS_EVREQ; /* * Huntington firmware supports MCDIv2, but the Huntington BootROM only * supports MCDIv1. Use MCDIv1 headers for MCDIv1 commands where * possible to support this. */ if ((max_version >= 2) && ((emrp->emr_cmd > MC_CMD_CMD_SPACE_ESCAPE_7) || (emrp->emr_in_length > MCDI_CTL_SDU_LEN_MAX_V1))) { /* Construct MCDI v2 header */ hdr_len = sizeof (hdr); EFX_POPULATE_DWORD_8(hdr[0], MCDI_HEADER_CODE, MC_CMD_V2_EXTN, MCDI_HEADER_RESYNC, 1, MCDI_HEADER_DATALEN, 0, MCDI_HEADER_SEQ, seq, MCDI_HEADER_NOT_EPOCH, new_epoch ? 0 : 1, MCDI_HEADER_ERROR, 0, MCDI_HEADER_RESPONSE, 0, MCDI_HEADER_XFLAGS, xflags); EFX_POPULATE_DWORD_2(hdr[1], MC_CMD_V2_EXTN_IN_EXTENDED_CMD, emrp->emr_cmd, MC_CMD_V2_EXTN_IN_ACTUAL_LEN, emrp->emr_in_length); } else { /* Construct MCDI v1 header */ hdr_len = sizeof (hdr[0]); EFX_POPULATE_DWORD_8(hdr[0], MCDI_HEADER_CODE, emrp->emr_cmd, MCDI_HEADER_RESYNC, 1, MCDI_HEADER_DATALEN, emrp->emr_in_length, MCDI_HEADER_SEQ, seq, MCDI_HEADER_NOT_EPOCH, new_epoch ? 0 : 1, MCDI_HEADER_ERROR, 0, MCDI_HEADER_RESPONSE, 0, MCDI_HEADER_XFLAGS, xflags); } #if EFSYS_OPT_MCDI_LOGGING if (emtp->emt_logger != NULL) { emtp->emt_logger(emtp->emt_context, EFX_LOG_MCDI_REQUEST, &hdr, hdr_len, emrp->emr_in_buf, emrp->emr_in_length); } #endif /* EFSYS_OPT_MCDI_LOGGING */ efx_mcdi_send_request(enp, &hdr[0], hdr_len, emrp->emr_in_buf, emrp->emr_in_length); } static void efx_mcdi_read_response_header( __in efx_nic_t *enp, __inout efx_mcdi_req_t *emrp) { #if EFSYS_OPT_MCDI_LOGGING const efx_mcdi_transport_t *emtp = enp->en_mcdi.em_emtp; #endif /* EFSYS_OPT_MCDI_LOGGING */ efx_mcdi_iface_t *emip = &(enp->en_mcdi.em_emip); efx_dword_t hdr[2]; unsigned int hdr_len; unsigned int data_len; unsigned int seq; unsigned int cmd; unsigned int error; efx_rc_t rc; EFSYS_ASSERT(emrp != NULL); efx_mcdi_read_response(enp, &hdr[0], 0, sizeof (hdr[0])); hdr_len = sizeof (hdr[0]); cmd = EFX_DWORD_FIELD(hdr[0], MCDI_HEADER_CODE); seq = EFX_DWORD_FIELD(hdr[0], MCDI_HEADER_SEQ); error = EFX_DWORD_FIELD(hdr[0], MCDI_HEADER_ERROR); if (cmd != MC_CMD_V2_EXTN) { data_len = EFX_DWORD_FIELD(hdr[0], MCDI_HEADER_DATALEN); } else { efx_mcdi_read_response(enp, &hdr[1], hdr_len, sizeof (hdr[1])); hdr_len += sizeof (hdr[1]); cmd = EFX_DWORD_FIELD(hdr[1], MC_CMD_V2_EXTN_IN_EXTENDED_CMD); data_len = EFX_DWORD_FIELD(hdr[1], MC_CMD_V2_EXTN_IN_ACTUAL_LEN); } if (error && (data_len == 0)) { /* The MC has rebooted since the request was sent. */ EFSYS_SPIN(EFX_MCDI_STATUS_SLEEP_US); efx_mcdi_poll_reboot(enp); rc = EIO; goto fail1; } if ((cmd != emrp->emr_cmd) || (seq != ((emip->emi_seq - 1) & EFX_MASK32(MCDI_HEADER_SEQ)))) { /* Response is for a different request */ rc = EIO; goto fail2; } if (error) { efx_dword_t err[2]; unsigned int err_len = MIN(data_len, sizeof (err)); int err_code = MC_CMD_ERR_EPROTO; int err_arg = 0; /* Read error code (and arg num for MCDI v2 commands) */ efx_mcdi_read_response(enp, &err, hdr_len, err_len); if (err_len >= (MC_CMD_ERR_CODE_OFST + sizeof (efx_dword_t))) err_code = EFX_DWORD_FIELD(err[0], EFX_DWORD_0); #ifdef WITH_MCDI_V2 if (err_len >= (MC_CMD_ERR_ARG_OFST + sizeof (efx_dword_t))) err_arg = EFX_DWORD_FIELD(err[1], EFX_DWORD_0); #endif emrp->emr_err_code = err_code; emrp->emr_err_arg = err_arg; #if EFSYS_OPT_MCDI_PROXY_AUTH if ((err_code == MC_CMD_ERR_PROXY_PENDING) && (err_len == sizeof (err))) { /* * The MCDI request would normally fail with EPERM, but * firmware has forwarded it to an authorization agent * attached to a privileged PF. * * Save the authorization request handle. The client * must wait for a PROXY_RESPONSE event, or timeout. */ emrp->emr_proxy_handle = err_arg; } #endif /* EFSYS_OPT_MCDI_PROXY_AUTH */ #if EFSYS_OPT_MCDI_LOGGING if (emtp->emt_logger != NULL) { emtp->emt_logger(emtp->emt_context, EFX_LOG_MCDI_RESPONSE, &hdr, hdr_len, &err, err_len); } #endif /* EFSYS_OPT_MCDI_LOGGING */ if (!emrp->emr_quiet) { EFSYS_PROBE3(mcdi_err_arg, int, emrp->emr_cmd, int, err_code, int, err_arg); } rc = efx_mcdi_request_errcode(err_code); goto fail3; } emrp->emr_rc = 0; emrp->emr_out_length_used = data_len; #if EFSYS_OPT_MCDI_PROXY_AUTH emrp->emr_proxy_handle = 0; #endif /* EFSYS_OPT_MCDI_PROXY_AUTH */ return; fail3: fail2: fail1: emrp->emr_rc = rc; emrp->emr_out_length_used = 0; } static void efx_mcdi_finish_response( __in efx_nic_t *enp, __in efx_mcdi_req_t *emrp) { #if EFSYS_OPT_MCDI_LOGGING const efx_mcdi_transport_t *emtp = enp->en_mcdi.em_emtp; #endif /* EFSYS_OPT_MCDI_LOGGING */ efx_dword_t hdr[2]; unsigned int hdr_len; size_t bytes; if (emrp->emr_out_buf == NULL) return; /* Read the command header to detect MCDI response format */ hdr_len = sizeof (hdr[0]); efx_mcdi_read_response(enp, &hdr[0], 0, hdr_len); if (EFX_DWORD_FIELD(hdr[0], MCDI_HEADER_CODE) == MC_CMD_V2_EXTN) { /* * Read the actual payload length. The length given in the event * is only correct for responses with the V1 format. */ efx_mcdi_read_response(enp, &hdr[1], hdr_len, sizeof (hdr[1])); hdr_len += sizeof (hdr[1]); emrp->emr_out_length_used = EFX_DWORD_FIELD(hdr[1], MC_CMD_V2_EXTN_IN_ACTUAL_LEN); } /* Copy payload out into caller supplied buffer */ bytes = MIN(emrp->emr_out_length_used, emrp->emr_out_length); efx_mcdi_read_response(enp, emrp->emr_out_buf, hdr_len, bytes); #if EFSYS_OPT_MCDI_LOGGING if (emtp->emt_logger != NULL) { emtp->emt_logger(emtp->emt_context, EFX_LOG_MCDI_RESPONSE, &hdr, hdr_len, emrp->emr_out_buf, bytes); } #endif /* EFSYS_OPT_MCDI_LOGGING */ } __checkReturn boolean_t efx_mcdi_request_poll( __in efx_nic_t *enp) { efx_mcdi_iface_t *emip = &(enp->en_mcdi.em_emip); efx_mcdi_req_t *emrp; int state; efx_rc_t rc; EFSYS_ASSERT3U(enp->en_magic, ==, EFX_NIC_MAGIC); EFSYS_ASSERT3U(enp->en_mod_flags, &, EFX_MOD_MCDI); EFSYS_ASSERT3U(enp->en_features, &, EFX_FEATURE_MCDI); /* Serialise against post-watchdog efx_mcdi_ev* */ EFSYS_LOCK(enp->en_eslp, state); EFSYS_ASSERT(emip->emi_pending_req != NULL); EFSYS_ASSERT(!emip->emi_ev_cpl); emrp = emip->emi_pending_req; /* Check for reboot atomically w.r.t efx_mcdi_request_start */ if (emip->emi_poll_cnt++ == 0) { if ((rc = efx_mcdi_poll_reboot(enp)) != 0) { emip->emi_pending_req = NULL; EFSYS_UNLOCK(enp->en_eslp, state); /* Reboot/Assertion */ if (rc == EIO || rc == EINTR) efx_mcdi_raise_exception(enp, emrp, rc); goto fail1; } } /* Check if a response is available */ if (efx_mcdi_poll_response(enp) == B_FALSE) { EFSYS_UNLOCK(enp->en_eslp, state); return (B_FALSE); } /* Read the response header */ efx_mcdi_read_response_header(enp, emrp); /* Request complete */ emip->emi_pending_req = NULL; /* Ensure stale MCDI requests fail after an MC reboot. */ emip->emi_new_epoch = B_FALSE; EFSYS_UNLOCK(enp->en_eslp, state); if ((rc = emrp->emr_rc) != 0) goto fail2; efx_mcdi_finish_response(enp, emrp); return (B_TRUE); fail2: if (!emrp->emr_quiet) EFSYS_PROBE(fail2); fail1: if (!emrp->emr_quiet) EFSYS_PROBE1(fail1, efx_rc_t, rc); return (B_TRUE); } __checkReturn boolean_t efx_mcdi_request_abort( __in efx_nic_t *enp) { efx_mcdi_iface_t *emip = &(enp->en_mcdi.em_emip); efx_mcdi_req_t *emrp; boolean_t aborted; int state; EFSYS_ASSERT3U(enp->en_magic, ==, EFX_NIC_MAGIC); EFSYS_ASSERT3U(enp->en_mod_flags, &, EFX_MOD_MCDI); EFSYS_ASSERT3U(enp->en_features, &, EFX_FEATURE_MCDI); /* * efx_mcdi_ev_* may have already completed this event, and be * spinning/blocked on the upper layer lock. So it *is* legitimate * to for emi_pending_req to be NULL. If there is a pending event * completed request, then provide a "credit" to allow * efx_mcdi_ev_cpl() to accept a single spurious completion. */ EFSYS_LOCK(enp->en_eslp, state); emrp = emip->emi_pending_req; aborted = (emrp != NULL); if (aborted) { emip->emi_pending_req = NULL; /* Error the request */ emrp->emr_out_length_used = 0; emrp->emr_rc = ETIMEDOUT; /* Provide a credit for seqno/emr_pending_req mismatches */ if (emip->emi_ev_cpl) ++emip->emi_aborted; /* * The upper layer has called us, so we don't * need to complete the request. */ } EFSYS_UNLOCK(enp->en_eslp, state); return (aborted); } __checkReturn efx_rc_t efx_mcdi_request_errcode( __in unsigned int err) { switch (err) { /* MCDI v1 */ case MC_CMD_ERR_EPERM: return (EACCES); case MC_CMD_ERR_ENOENT: return (ENOENT); case MC_CMD_ERR_EINTR: return (EINTR); case MC_CMD_ERR_EACCES: return (EACCES); case MC_CMD_ERR_EBUSY: return (EBUSY); case MC_CMD_ERR_EINVAL: return (EINVAL); case MC_CMD_ERR_EDEADLK: return (EDEADLK); case MC_CMD_ERR_ENOSYS: return (ENOTSUP); case MC_CMD_ERR_ETIME: return (ETIMEDOUT); case MC_CMD_ERR_ENOTSUP: return (ENOTSUP); case MC_CMD_ERR_EALREADY: return (EALREADY); /* MCDI v2 */ case MC_CMD_ERR_EEXIST: return (EEXIST); #ifdef MC_CMD_ERR_EAGAIN case MC_CMD_ERR_EAGAIN: return (EAGAIN); #endif #ifdef MC_CMD_ERR_ENOSPC case MC_CMD_ERR_ENOSPC: return (ENOSPC); #endif case MC_CMD_ERR_ALLOC_FAIL: return (ENOMEM); case MC_CMD_ERR_NO_VADAPTOR: return (ENOENT); case MC_CMD_ERR_NO_EVB_PORT: return (ENOENT); case MC_CMD_ERR_NO_VSWITCH: return (ENODEV); case MC_CMD_ERR_VLAN_LIMIT: return (EINVAL); case MC_CMD_ERR_BAD_PCI_FUNC: return (ENODEV); case MC_CMD_ERR_BAD_VLAN_MODE: return (EINVAL); case MC_CMD_ERR_BAD_VSWITCH_TYPE: return (EINVAL); case MC_CMD_ERR_BAD_VPORT_TYPE: return (EINVAL); case MC_CMD_ERR_MAC_EXIST: return (EEXIST); case MC_CMD_ERR_PROXY_PENDING: return (EAGAIN); default: EFSYS_PROBE1(mc_pcol_error, int, err); return (EIO); } } void efx_mcdi_raise_exception( __in efx_nic_t *enp, __in_opt efx_mcdi_req_t *emrp, __in int rc) { const efx_mcdi_transport_t *emtp = enp->en_mcdi.em_emtp; efx_mcdi_exception_t exception; /* Reboot or Assertion failure only */ EFSYS_ASSERT(rc == EIO || rc == EINTR); /* * If MC_CMD_REBOOT causes a reboot (dependent on parameters), * then the EIO is not worthy of an exception. */ if (emrp != NULL && emrp->emr_cmd == MC_CMD_REBOOT && rc == EIO) return; exception = (rc == EIO) ? EFX_MCDI_EXCEPTION_MC_REBOOT : EFX_MCDI_EXCEPTION_MC_BADASSERT; emtp->emt_exception(emtp->emt_context, exception); } void efx_mcdi_execute( __in efx_nic_t *enp, __inout efx_mcdi_req_t *emrp) { const efx_mcdi_transport_t *emtp = enp->en_mcdi.em_emtp; EFSYS_ASSERT3U(enp->en_mod_flags, &, EFX_MOD_MCDI); EFSYS_ASSERT3U(enp->en_features, &, EFX_FEATURE_MCDI); emrp->emr_quiet = B_FALSE; emtp->emt_execute(emtp->emt_context, emrp); } void efx_mcdi_execute_quiet( __in efx_nic_t *enp, __inout efx_mcdi_req_t *emrp) { const efx_mcdi_transport_t *emtp = enp->en_mcdi.em_emtp; EFSYS_ASSERT3U(enp->en_mod_flags, &, EFX_MOD_MCDI); EFSYS_ASSERT3U(enp->en_features, &, EFX_FEATURE_MCDI); emrp->emr_quiet = B_TRUE; emtp->emt_execute(emtp->emt_context, emrp); } void efx_mcdi_ev_cpl( __in efx_nic_t *enp, __in unsigned int seq, __in unsigned int outlen, __in int errcode) { efx_mcdi_iface_t *emip = &(enp->en_mcdi.em_emip); const efx_mcdi_transport_t *emtp = enp->en_mcdi.em_emtp; efx_mcdi_req_t *emrp; int state; EFSYS_ASSERT3U(enp->en_mod_flags, &, EFX_MOD_MCDI); EFSYS_ASSERT3U(enp->en_features, &, EFX_FEATURE_MCDI); /* * Serialise against efx_mcdi_request_poll()/efx_mcdi_request_start() * when we're completing an aborted request. */ EFSYS_LOCK(enp->en_eslp, state); if (emip->emi_pending_req == NULL || !emip->emi_ev_cpl || (seq != ((emip->emi_seq - 1) & EFX_MASK32(MCDI_HEADER_SEQ)))) { EFSYS_ASSERT(emip->emi_aborted > 0); if (emip->emi_aborted > 0) --emip->emi_aborted; EFSYS_UNLOCK(enp->en_eslp, state); return; } emrp = emip->emi_pending_req; emip->emi_pending_req = NULL; EFSYS_UNLOCK(enp->en_eslp, state); if (emip->emi_max_version >= 2) { /* MCDIv2 response details do not fit into an event. */ efx_mcdi_read_response_header(enp, emrp); } else { if (errcode != 0) { if (!emrp->emr_quiet) { EFSYS_PROBE2(mcdi_err, int, emrp->emr_cmd, int, errcode); } emrp->emr_out_length_used = 0; emrp->emr_rc = efx_mcdi_request_errcode(errcode); } else { emrp->emr_out_length_used = outlen; emrp->emr_rc = 0; } } if (errcode == 0) { efx_mcdi_finish_response(enp, emrp); } emtp->emt_ev_cpl(emtp->emt_context); } #if EFSYS_OPT_MCDI_PROXY_AUTH __checkReturn efx_rc_t efx_mcdi_get_proxy_handle( __in efx_nic_t *enp, __in efx_mcdi_req_t *emrp, __out uint32_t *handlep) { efx_rc_t rc; /* * Return proxy handle from MCDI request that returned with error * MC_MCD_ERR_PROXY_PENDING. This handle is used to wait for a matching * PROXY_RESPONSE event. */ if ((emrp == NULL) || (handlep == NULL)) { rc = EINVAL; goto fail1; } if ((emrp->emr_rc != 0) && (emrp->emr_err_code == MC_CMD_ERR_PROXY_PENDING)) { *handlep = emrp->emr_proxy_handle; rc = 0; } else { *handlep = 0; rc = ENOENT; } return (rc); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } void efx_mcdi_ev_proxy_response( __in efx_nic_t *enp, __in unsigned int handle, __in unsigned int status) { const efx_mcdi_transport_t *emtp = enp->en_mcdi.em_emtp; efx_rc_t rc; /* * Handle results of an authorization request for a privileged MCDI * command. If authorization was granted then we must re-issue the * original MCDI request. If authorization failed or timed out, * then the original MCDI request should be completed with the * result code from this event. */ rc = (status == 0) ? 0 : efx_mcdi_request_errcode(status); emtp->emt_ev_proxy_response(emtp->emt_context, handle, rc); } #endif /* EFSYS_OPT_MCDI_PROXY_AUTH */ void efx_mcdi_ev_death( __in efx_nic_t *enp, __in int rc) { efx_mcdi_iface_t *emip = &(enp->en_mcdi.em_emip); const efx_mcdi_transport_t *emtp = enp->en_mcdi.em_emtp; efx_mcdi_req_t *emrp = NULL; boolean_t ev_cpl; int state; /* * The MCDI request (if there is one) has been terminated, either * by a BADASSERT or REBOOT event. * * If there is an outstanding event-completed MCDI operation, then we * will never receive the completion event (because both MCDI * completions and BADASSERT events are sent to the same evq). So * complete this MCDI op. * * This function might run in parallel with efx_mcdi_request_poll() * for poll completed mcdi requests, and also with * efx_mcdi_request_start() for post-watchdog completions. */ EFSYS_LOCK(enp->en_eslp, state); emrp = emip->emi_pending_req; ev_cpl = emip->emi_ev_cpl; if (emrp != NULL && emip->emi_ev_cpl) { emip->emi_pending_req = NULL; emrp->emr_out_length_used = 0; emrp->emr_rc = rc; ++emip->emi_aborted; } /* * Since we're running in parallel with a request, consume the * status word before dropping the lock. */ if (rc == EIO || rc == EINTR) { EFSYS_SPIN(EFX_MCDI_STATUS_SLEEP_US); (void) efx_mcdi_poll_reboot(enp); emip->emi_new_epoch = B_TRUE; } EFSYS_UNLOCK(enp->en_eslp, state); efx_mcdi_raise_exception(enp, emrp, rc); if (emrp != NULL && ev_cpl) emtp->emt_ev_cpl(emtp->emt_context); } __checkReturn efx_rc_t efx_mcdi_version( __in efx_nic_t *enp, __out_ecount_opt(4) uint16_t versionp[4], __out_opt uint32_t *buildp, __out_opt efx_mcdi_boot_t *statusp) { efx_mcdi_req_t req; uint8_t payload[MAX(MAX(MC_CMD_GET_VERSION_IN_LEN, MC_CMD_GET_VERSION_OUT_LEN), MAX(MC_CMD_GET_BOOT_STATUS_IN_LEN, MC_CMD_GET_BOOT_STATUS_OUT_LEN))]; efx_word_t *ver_words; uint16_t version[4]; uint32_t build; efx_mcdi_boot_t status; efx_rc_t rc; EFSYS_ASSERT3U(enp->en_features, &, EFX_FEATURE_MCDI); (void) memset(payload, 0, sizeof (payload)); req.emr_cmd = MC_CMD_GET_VERSION; req.emr_in_buf = payload; req.emr_in_length = MC_CMD_GET_VERSION_IN_LEN; req.emr_out_buf = payload; req.emr_out_length = MC_CMD_GET_VERSION_OUT_LEN; efx_mcdi_execute(enp, &req); if (req.emr_rc != 0) { rc = req.emr_rc; goto fail1; } /* bootrom support */ if (req.emr_out_length_used == MC_CMD_GET_VERSION_V0_OUT_LEN) { version[0] = version[1] = version[2] = version[3] = 0; build = MCDI_OUT_DWORD(req, GET_VERSION_OUT_FIRMWARE); goto version; } if (req.emr_out_length_used < MC_CMD_GET_VERSION_OUT_LEN) { rc = EMSGSIZE; goto fail2; } ver_words = MCDI_OUT2(req, efx_word_t, GET_VERSION_OUT_VERSION); version[0] = EFX_WORD_FIELD(ver_words[0], EFX_WORD_0); version[1] = EFX_WORD_FIELD(ver_words[1], EFX_WORD_0); version[2] = EFX_WORD_FIELD(ver_words[2], EFX_WORD_0); version[3] = EFX_WORD_FIELD(ver_words[3], EFX_WORD_0); build = MCDI_OUT_DWORD(req, GET_VERSION_OUT_FIRMWARE); version: /* The bootrom doesn't understand BOOT_STATUS */ if (MC_FW_VERSION_IS_BOOTLOADER(build)) { status = EFX_MCDI_BOOT_ROM; goto out; } (void) memset(payload, 0, sizeof (payload)); req.emr_cmd = MC_CMD_GET_BOOT_STATUS; req.emr_in_buf = payload; req.emr_in_length = MC_CMD_GET_BOOT_STATUS_IN_LEN; req.emr_out_buf = payload; req.emr_out_length = MC_CMD_GET_BOOT_STATUS_OUT_LEN; efx_mcdi_execute_quiet(enp, &req); if (req.emr_rc == EACCES) { /* Unprivileged functions cannot access BOOT_STATUS */ status = EFX_MCDI_BOOT_PRIMARY; version[0] = version[1] = version[2] = version[3] = 0; build = 0; goto out; } if (req.emr_rc != 0) { rc = req.emr_rc; goto fail3; } if (req.emr_out_length_used < MC_CMD_GET_BOOT_STATUS_OUT_LEN) { rc = EMSGSIZE; goto fail4; } if (MCDI_OUT_DWORD_FIELD(req, GET_BOOT_STATUS_OUT_FLAGS, GET_BOOT_STATUS_OUT_FLAGS_PRIMARY)) status = EFX_MCDI_BOOT_PRIMARY; else status = EFX_MCDI_BOOT_SECONDARY; out: if (versionp != NULL) memcpy(versionp, version, sizeof (version)); if (buildp != NULL) *buildp = build; if (statusp != NULL) *statusp = status; return (0); fail4: EFSYS_PROBE(fail4); fail3: EFSYS_PROBE(fail3); fail2: EFSYS_PROBE(fail2); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } static __checkReturn efx_rc_t efx_mcdi_do_reboot( __in efx_nic_t *enp, __in boolean_t after_assertion) { uint8_t payload[MAX(MC_CMD_REBOOT_IN_LEN, MC_CMD_REBOOT_OUT_LEN)]; efx_mcdi_req_t req; efx_rc_t rc; /* * We could require the caller to have caused en_mod_flags=0 to * call this function. This doesn't help the other port though, * who's about to get the MC ripped out from underneath them. * Since they have to cope with the subsequent fallout of MCDI * failures, we should as well. */ EFSYS_ASSERT3U(enp->en_magic, ==, EFX_NIC_MAGIC); (void) memset(payload, 0, sizeof (payload)); req.emr_cmd = MC_CMD_REBOOT; req.emr_in_buf = payload; req.emr_in_length = MC_CMD_REBOOT_IN_LEN; req.emr_out_buf = payload; req.emr_out_length = MC_CMD_REBOOT_OUT_LEN; MCDI_IN_SET_DWORD(req, REBOOT_IN_FLAGS, (after_assertion ? MC_CMD_REBOOT_FLAGS_AFTER_ASSERTION : 0)); efx_mcdi_execute_quiet(enp, &req); if (req.emr_rc == EACCES) { /* Unprivileged functions cannot reboot the MC. */ goto out; } /* A successful reboot request returns EIO. */ if (req.emr_rc != 0 && req.emr_rc != EIO) { rc = req.emr_rc; goto fail1; } out: return (0); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } __checkReturn efx_rc_t efx_mcdi_reboot( __in efx_nic_t *enp) { return (efx_mcdi_do_reboot(enp, B_FALSE)); } __checkReturn efx_rc_t efx_mcdi_exit_assertion_handler( __in efx_nic_t *enp) { return (efx_mcdi_do_reboot(enp, B_TRUE)); } __checkReturn efx_rc_t efx_mcdi_read_assertion( __in efx_nic_t *enp) { efx_mcdi_req_t req; uint8_t payload[MAX(MC_CMD_GET_ASSERTS_IN_LEN, MC_CMD_GET_ASSERTS_OUT_LEN)]; const char *reason; unsigned int flags; unsigned int index; unsigned int ofst; int retry; efx_rc_t rc; /* * Before we attempt to chat to the MC, we should verify that the MC - * isn't in it's assertion handler, either due to a previous reboot, + * isn't in its assertion handler, either due to a previous reboot, * or because we're reinitializing due to an eec_exception(). * * Use GET_ASSERTS to read any assertion state that may be present. * Retry this command twice. Once because a boot-time assertion failure * might cause the 1st MCDI request to fail. And once again because * we might race with efx_mcdi_exit_assertion_handler() running on * partner port(s) on the same NIC. */ retry = 2; do { (void) memset(payload, 0, sizeof (payload)); req.emr_cmd = MC_CMD_GET_ASSERTS; req.emr_in_buf = payload; req.emr_in_length = MC_CMD_GET_ASSERTS_IN_LEN; req.emr_out_buf = payload; req.emr_out_length = MC_CMD_GET_ASSERTS_OUT_LEN; MCDI_IN_SET_DWORD(req, GET_ASSERTS_IN_CLEAR, 1); efx_mcdi_execute_quiet(enp, &req); } while ((req.emr_rc == EINTR || req.emr_rc == EIO) && retry-- > 0); if (req.emr_rc != 0) { if (req.emr_rc == EACCES) { /* Unprivileged functions cannot clear assertions. */ goto out; } rc = req.emr_rc; goto fail1; } if (req.emr_out_length_used < MC_CMD_GET_ASSERTS_OUT_LEN) { rc = EMSGSIZE; goto fail2; } /* Print out any assertion state recorded */ flags = MCDI_OUT_DWORD(req, GET_ASSERTS_OUT_GLOBAL_FLAGS); if (flags == MC_CMD_GET_ASSERTS_FLAGS_NO_FAILS) return (0); reason = (flags == MC_CMD_GET_ASSERTS_FLAGS_SYS_FAIL) ? "system-level assertion" : (flags == MC_CMD_GET_ASSERTS_FLAGS_THR_FAIL) ? "thread-level assertion" : (flags == MC_CMD_GET_ASSERTS_FLAGS_WDOG_FIRED) ? "watchdog reset" : (flags == MC_CMD_GET_ASSERTS_FLAGS_ADDR_TRAP) ? "illegal address trap" : "unknown assertion"; EFSYS_PROBE3(mcpu_assertion, const char *, reason, unsigned int, MCDI_OUT_DWORD(req, GET_ASSERTS_OUT_SAVED_PC_OFFS), unsigned int, MCDI_OUT_DWORD(req, GET_ASSERTS_OUT_THREAD_OFFS)); /* Print out the registers (r1 ... r31) */ ofst = MC_CMD_GET_ASSERTS_OUT_GP_REGS_OFFS_OFST; for (index = 1; index < 1 + MC_CMD_GET_ASSERTS_OUT_GP_REGS_OFFS_NUM; index++) { EFSYS_PROBE2(mcpu_register, unsigned int, index, unsigned int, EFX_DWORD_FIELD(*MCDI_OUT(req, efx_dword_t, ofst), EFX_DWORD_0)); ofst += sizeof (efx_dword_t); } EFSYS_ASSERT(ofst <= MC_CMD_GET_ASSERTS_OUT_LEN); out: return (0); fail2: EFSYS_PROBE(fail2); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } /* * Internal routines for for specific MCDI requests. */ __checkReturn efx_rc_t efx_mcdi_drv_attach( __in efx_nic_t *enp, __in boolean_t attach) { efx_nic_cfg_t *encp = &(enp->en_nic_cfg); efx_mcdi_req_t req; uint8_t payload[MAX(MC_CMD_DRV_ATTACH_IN_LEN, MC_CMD_DRV_ATTACH_EXT_OUT_LEN)]; uint32_t flags; efx_rc_t rc; (void) memset(payload, 0, sizeof (payload)); req.emr_cmd = MC_CMD_DRV_ATTACH; req.emr_in_buf = payload; req.emr_in_length = MC_CMD_DRV_ATTACH_IN_LEN; req.emr_out_buf = payload; req.emr_out_length = MC_CMD_DRV_ATTACH_EXT_OUT_LEN; /* * Use DONT_CARE for the datapath firmware type to ensure that the * driver can attach to an unprivileged function. The datapath firmware * type to use is controlled by the 'sfboot' utility. */ MCDI_IN_SET_DWORD(req, DRV_ATTACH_IN_NEW_STATE, attach ? 1 : 0); MCDI_IN_SET_DWORD(req, DRV_ATTACH_IN_UPDATE, 1); MCDI_IN_SET_DWORD(req, DRV_ATTACH_IN_FIRMWARE_ID, MC_CMD_FW_DONT_CARE); efx_mcdi_execute(enp, &req); if (req.emr_rc != 0) { rc = req.emr_rc; goto fail1; } if (req.emr_out_length_used < MC_CMD_DRV_ATTACH_OUT_LEN) { rc = EMSGSIZE; goto fail2; } if (attach == B_FALSE) { flags = 0; } else if (enp->en_family == EFX_FAMILY_SIENA) { efx_mcdi_iface_t *emip = &(enp->en_mcdi.em_emip); /* Create synthetic privileges for Siena functions */ flags = EFX_NIC_FUNC_LINKCTRL | EFX_NIC_FUNC_TRUSTED; if (emip->emi_port == 1) flags |= EFX_NIC_FUNC_PRIMARY; } else { EFX_STATIC_ASSERT(EFX_NIC_FUNC_PRIMARY == (1u << MC_CMD_DRV_ATTACH_EXT_OUT_FLAG_PRIMARY)); EFX_STATIC_ASSERT(EFX_NIC_FUNC_LINKCTRL == (1u << MC_CMD_DRV_ATTACH_EXT_OUT_FLAG_LINKCTRL)); EFX_STATIC_ASSERT(EFX_NIC_FUNC_TRUSTED == (1u << MC_CMD_DRV_ATTACH_EXT_OUT_FLAG_TRUSTED)); /* Save function privilege flags (EF10 and later) */ if (req.emr_out_length_used < MC_CMD_DRV_ATTACH_EXT_OUT_LEN) { rc = EMSGSIZE; goto fail3; } flags = MCDI_OUT_DWORD(req, DRV_ATTACH_EXT_OUT_FUNC_FLAGS); } encp->enc_func_flags = flags; return (0); fail3: EFSYS_PROBE(fail3); fail2: EFSYS_PROBE(fail2); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } __checkReturn efx_rc_t efx_mcdi_get_board_cfg( __in efx_nic_t *enp, __out_opt uint32_t *board_typep, __out_opt efx_dword_t *capabilitiesp, __out_ecount_opt(6) uint8_t mac_addrp[6]) { efx_mcdi_iface_t *emip = &(enp->en_mcdi.em_emip); efx_mcdi_req_t req; uint8_t payload[MAX(MC_CMD_GET_BOARD_CFG_IN_LEN, MC_CMD_GET_BOARD_CFG_OUT_LENMIN)]; efx_rc_t rc; (void) memset(payload, 0, sizeof (payload)); req.emr_cmd = MC_CMD_GET_BOARD_CFG; req.emr_in_buf = payload; req.emr_in_length = MC_CMD_GET_BOARD_CFG_IN_LEN; req.emr_out_buf = payload; req.emr_out_length = MC_CMD_GET_BOARD_CFG_OUT_LENMIN; efx_mcdi_execute(enp, &req); if (req.emr_rc != 0) { rc = req.emr_rc; goto fail1; } if (req.emr_out_length_used < MC_CMD_GET_BOARD_CFG_OUT_LENMIN) { rc = EMSGSIZE; goto fail2; } if (mac_addrp != NULL) { uint8_t *addrp; if (emip->emi_port == 1) { addrp = MCDI_OUT2(req, uint8_t, GET_BOARD_CFG_OUT_MAC_ADDR_BASE_PORT0); } else if (emip->emi_port == 2) { addrp = MCDI_OUT2(req, uint8_t, GET_BOARD_CFG_OUT_MAC_ADDR_BASE_PORT1); } else { rc = EINVAL; goto fail3; } EFX_MAC_ADDR_COPY(mac_addrp, addrp); } if (capabilitiesp != NULL) { if (emip->emi_port == 1) { *capabilitiesp = *MCDI_OUT2(req, efx_dword_t, GET_BOARD_CFG_OUT_CAPABILITIES_PORT0); } else if (emip->emi_port == 2) { *capabilitiesp = *MCDI_OUT2(req, efx_dword_t, GET_BOARD_CFG_OUT_CAPABILITIES_PORT1); } else { rc = EINVAL; goto fail4; } } if (board_typep != NULL) { *board_typep = MCDI_OUT_DWORD(req, GET_BOARD_CFG_OUT_BOARD_TYPE); } return (0); fail4: EFSYS_PROBE(fail4); fail3: EFSYS_PROBE(fail3); fail2: EFSYS_PROBE(fail2); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } __checkReturn efx_rc_t efx_mcdi_get_resource_limits( __in efx_nic_t *enp, __out_opt uint32_t *nevqp, __out_opt uint32_t *nrxqp, __out_opt uint32_t *ntxqp) { efx_mcdi_req_t req; uint8_t payload[MAX(MC_CMD_GET_RESOURCE_LIMITS_IN_LEN, MC_CMD_GET_RESOURCE_LIMITS_OUT_LEN)]; efx_rc_t rc; (void) memset(payload, 0, sizeof (payload)); req.emr_cmd = MC_CMD_GET_RESOURCE_LIMITS; req.emr_in_buf = payload; req.emr_in_length = MC_CMD_GET_RESOURCE_LIMITS_IN_LEN; req.emr_out_buf = payload; req.emr_out_length = MC_CMD_GET_RESOURCE_LIMITS_OUT_LEN; efx_mcdi_execute(enp, &req); if (req.emr_rc != 0) { rc = req.emr_rc; goto fail1; } if (req.emr_out_length_used < MC_CMD_GET_RESOURCE_LIMITS_OUT_LEN) { rc = EMSGSIZE; goto fail2; } if (nevqp != NULL) *nevqp = MCDI_OUT_DWORD(req, GET_RESOURCE_LIMITS_OUT_EVQ); if (nrxqp != NULL) *nrxqp = MCDI_OUT_DWORD(req, GET_RESOURCE_LIMITS_OUT_RXQ); if (ntxqp != NULL) *ntxqp = MCDI_OUT_DWORD(req, GET_RESOURCE_LIMITS_OUT_TXQ); return (0); fail2: EFSYS_PROBE(fail2); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } __checkReturn efx_rc_t efx_mcdi_get_phy_cfg( __in efx_nic_t *enp) { efx_port_t *epp = &(enp->en_port); efx_nic_cfg_t *encp = &(enp->en_nic_cfg); efx_mcdi_req_t req; uint8_t payload[MAX(MC_CMD_GET_PHY_CFG_IN_LEN, MC_CMD_GET_PHY_CFG_OUT_LEN)]; efx_rc_t rc; (void) memset(payload, 0, sizeof (payload)); req.emr_cmd = MC_CMD_GET_PHY_CFG; req.emr_in_buf = payload; req.emr_in_length = MC_CMD_GET_PHY_CFG_IN_LEN; req.emr_out_buf = payload; req.emr_out_length = MC_CMD_GET_PHY_CFG_OUT_LEN; efx_mcdi_execute(enp, &req); if (req.emr_rc != 0) { rc = req.emr_rc; goto fail1; } if (req.emr_out_length_used < MC_CMD_GET_PHY_CFG_OUT_LEN) { rc = EMSGSIZE; goto fail2; } encp->enc_phy_type = MCDI_OUT_DWORD(req, GET_PHY_CFG_OUT_TYPE); #if EFSYS_OPT_NAMES (void) strncpy(encp->enc_phy_name, MCDI_OUT2(req, char, GET_PHY_CFG_OUT_NAME), MIN(sizeof (encp->enc_phy_name) - 1, MC_CMD_GET_PHY_CFG_OUT_NAME_LEN)); #endif /* EFSYS_OPT_NAMES */ (void) memset(encp->enc_phy_revision, 0, sizeof (encp->enc_phy_revision)); memcpy(encp->enc_phy_revision, MCDI_OUT2(req, char, GET_PHY_CFG_OUT_REVISION), MIN(sizeof (encp->enc_phy_revision) - 1, MC_CMD_GET_PHY_CFG_OUT_REVISION_LEN)); #if EFSYS_OPT_PHY_LED_CONTROL encp->enc_led_mask = ((1 << EFX_PHY_LED_DEFAULT) | (1 << EFX_PHY_LED_OFF) | (1 << EFX_PHY_LED_ON)); #endif /* EFSYS_OPT_PHY_LED_CONTROL */ /* Get the media type of the fixed port, if recognised. */ EFX_STATIC_ASSERT(MC_CMD_MEDIA_XAUI == EFX_PHY_MEDIA_XAUI); EFX_STATIC_ASSERT(MC_CMD_MEDIA_CX4 == EFX_PHY_MEDIA_CX4); EFX_STATIC_ASSERT(MC_CMD_MEDIA_KX4 == EFX_PHY_MEDIA_KX4); EFX_STATIC_ASSERT(MC_CMD_MEDIA_XFP == EFX_PHY_MEDIA_XFP); EFX_STATIC_ASSERT(MC_CMD_MEDIA_SFP_PLUS == EFX_PHY_MEDIA_SFP_PLUS); EFX_STATIC_ASSERT(MC_CMD_MEDIA_BASE_T == EFX_PHY_MEDIA_BASE_T); EFX_STATIC_ASSERT(MC_CMD_MEDIA_QSFP_PLUS == EFX_PHY_MEDIA_QSFP_PLUS); epp->ep_fixed_port_type = MCDI_OUT_DWORD(req, GET_PHY_CFG_OUT_MEDIA_TYPE); if (epp->ep_fixed_port_type >= EFX_PHY_MEDIA_NTYPES) epp->ep_fixed_port_type = EFX_PHY_MEDIA_INVALID; epp->ep_phy_cap_mask = MCDI_OUT_DWORD(req, GET_PHY_CFG_OUT_SUPPORTED_CAP); #if EFSYS_OPT_PHY_FLAGS encp->enc_phy_flags_mask = MCDI_OUT_DWORD(req, GET_PHY_CFG_OUT_FLAGS); #endif /* EFSYS_OPT_PHY_FLAGS */ encp->enc_port = (uint8_t)MCDI_OUT_DWORD(req, GET_PHY_CFG_OUT_PRT); /* Populate internal state */ encp->enc_mcdi_mdio_channel = (uint8_t)MCDI_OUT_DWORD(req, GET_PHY_CFG_OUT_CHANNEL); #if EFSYS_OPT_PHY_STATS encp->enc_mcdi_phy_stat_mask = MCDI_OUT_DWORD(req, GET_PHY_CFG_OUT_STATS_MASK); #endif /* EFSYS_OPT_PHY_STATS */ #if EFSYS_OPT_BIST encp->enc_bist_mask = 0; if (MCDI_OUT_DWORD_FIELD(req, GET_PHY_CFG_OUT_FLAGS, GET_PHY_CFG_OUT_BIST_CABLE_SHORT)) encp->enc_bist_mask |= (1 << EFX_BIST_TYPE_PHY_CABLE_SHORT); if (MCDI_OUT_DWORD_FIELD(req, GET_PHY_CFG_OUT_FLAGS, GET_PHY_CFG_OUT_BIST_CABLE_LONG)) encp->enc_bist_mask |= (1 << EFX_BIST_TYPE_PHY_CABLE_LONG); if (MCDI_OUT_DWORD_FIELD(req, GET_PHY_CFG_OUT_FLAGS, GET_PHY_CFG_OUT_BIST)) encp->enc_bist_mask |= (1 << EFX_BIST_TYPE_PHY_NORMAL); #endif /* EFSYS_OPT_BIST */ return (0); fail2: EFSYS_PROBE(fail2); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } __checkReturn efx_rc_t efx_mcdi_firmware_update_supported( __in efx_nic_t *enp, __out boolean_t *supportedp) { const efx_mcdi_ops_t *emcop = enp->en_mcdi.em_emcop; efx_rc_t rc; if (emcop != NULL) { if ((rc = emcop->emco_feature_supported(enp, EFX_MCDI_FEATURE_FW_UPDATE, supportedp)) != 0) goto fail1; } else { /* Earlier devices always supported updates */ *supportedp = B_TRUE; } return (0); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } __checkReturn efx_rc_t efx_mcdi_macaddr_change_supported( __in efx_nic_t *enp, __out boolean_t *supportedp) { const efx_mcdi_ops_t *emcop = enp->en_mcdi.em_emcop; efx_rc_t rc; if (emcop != NULL) { if ((rc = emcop->emco_feature_supported(enp, EFX_MCDI_FEATURE_MACADDR_CHANGE, supportedp)) != 0) goto fail1; } else { /* Earlier devices always supported MAC changes */ *supportedp = B_TRUE; } return (0); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } __checkReturn efx_rc_t efx_mcdi_link_control_supported( __in efx_nic_t *enp, __out boolean_t *supportedp) { const efx_mcdi_ops_t *emcop = enp->en_mcdi.em_emcop; efx_rc_t rc; if (emcop != NULL) { if ((rc = emcop->emco_feature_supported(enp, EFX_MCDI_FEATURE_LINK_CONTROL, supportedp)) != 0) goto fail1; } else { /* Earlier devices always supported link control */ *supportedp = B_TRUE; } return (0); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } __checkReturn efx_rc_t efx_mcdi_mac_spoofing_supported( __in efx_nic_t *enp, __out boolean_t *supportedp) { const efx_mcdi_ops_t *emcop = enp->en_mcdi.em_emcop; efx_rc_t rc; if (emcop != NULL) { if ((rc = emcop->emco_feature_supported(enp, EFX_MCDI_FEATURE_MAC_SPOOFING, supportedp)) != 0) goto fail1; } else { /* Earlier devices always supported MAC spoofing */ *supportedp = B_TRUE; } return (0); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } #if EFSYS_OPT_BIST #if EFSYS_OPT_HUNTINGTON || EFSYS_OPT_MEDFORD /* * Enter bist offline mode. This is a fw mode which puts the NIC into a state * where memory BIST tests can be run and not much else can interfere or happen. * A reboot is required to exit this mode. */ __checkReturn efx_rc_t efx_mcdi_bist_enable_offline( __in efx_nic_t *enp) { efx_mcdi_req_t req; efx_rc_t rc; EFX_STATIC_ASSERT(MC_CMD_ENABLE_OFFLINE_BIST_IN_LEN == 0); EFX_STATIC_ASSERT(MC_CMD_ENABLE_OFFLINE_BIST_OUT_LEN == 0); req.emr_cmd = MC_CMD_ENABLE_OFFLINE_BIST; req.emr_in_buf = NULL; req.emr_in_length = 0; req.emr_out_buf = NULL; req.emr_out_length = 0; efx_mcdi_execute(enp, &req); if (req.emr_rc != 0) { rc = req.emr_rc; goto fail1; } return (0); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } #endif /* EFSYS_OPT_HUNTINGTON || EFSYS_OPT_MEDFORD */ __checkReturn efx_rc_t efx_mcdi_bist_start( __in efx_nic_t *enp, __in efx_bist_type_t type) { efx_mcdi_req_t req; uint8_t payload[MAX(MC_CMD_START_BIST_IN_LEN, MC_CMD_START_BIST_OUT_LEN)]; efx_rc_t rc; (void) memset(payload, 0, sizeof (payload)); req.emr_cmd = MC_CMD_START_BIST; req.emr_in_buf = payload; req.emr_in_length = MC_CMD_START_BIST_IN_LEN; req.emr_out_buf = payload; req.emr_out_length = MC_CMD_START_BIST_OUT_LEN; switch (type) { case EFX_BIST_TYPE_PHY_NORMAL: MCDI_IN_SET_DWORD(req, START_BIST_IN_TYPE, MC_CMD_PHY_BIST); break; case EFX_BIST_TYPE_PHY_CABLE_SHORT: MCDI_IN_SET_DWORD(req, START_BIST_IN_TYPE, MC_CMD_PHY_BIST_CABLE_SHORT); break; case EFX_BIST_TYPE_PHY_CABLE_LONG: MCDI_IN_SET_DWORD(req, START_BIST_IN_TYPE, MC_CMD_PHY_BIST_CABLE_LONG); break; case EFX_BIST_TYPE_MC_MEM: MCDI_IN_SET_DWORD(req, START_BIST_IN_TYPE, MC_CMD_MC_MEM_BIST); break; case EFX_BIST_TYPE_SAT_MEM: MCDI_IN_SET_DWORD(req, START_BIST_IN_TYPE, MC_CMD_PORT_MEM_BIST); break; case EFX_BIST_TYPE_REG: MCDI_IN_SET_DWORD(req, START_BIST_IN_TYPE, MC_CMD_REG_BIST); break; default: EFSYS_ASSERT(0); } efx_mcdi_execute(enp, &req); if (req.emr_rc != 0) { rc = req.emr_rc; goto fail1; } return (0); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } #endif /* EFSYS_OPT_BIST */ /* Enable logging of some events (e.g. link state changes) */ __checkReturn efx_rc_t efx_mcdi_log_ctrl( __in efx_nic_t *enp) { efx_mcdi_req_t req; uint8_t payload[MAX(MC_CMD_LOG_CTRL_IN_LEN, MC_CMD_LOG_CTRL_OUT_LEN)]; efx_rc_t rc; (void) memset(payload, 0, sizeof (payload)); req.emr_cmd = MC_CMD_LOG_CTRL; req.emr_in_buf = payload; req.emr_in_length = MC_CMD_LOG_CTRL_IN_LEN; req.emr_out_buf = payload; req.emr_out_length = MC_CMD_LOG_CTRL_OUT_LEN; MCDI_IN_SET_DWORD(req, LOG_CTRL_IN_LOG_DEST, MC_CMD_LOG_CTRL_IN_LOG_DEST_EVQ); MCDI_IN_SET_DWORD(req, LOG_CTRL_IN_LOG_DEST_EVQ, 0); efx_mcdi_execute(enp, &req); if (req.emr_rc != 0) { rc = req.emr_rc; goto fail1; } return (0); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } #if EFSYS_OPT_MAC_STATS typedef enum efx_stats_action_e { EFX_STATS_CLEAR, EFX_STATS_UPLOAD, EFX_STATS_ENABLE_NOEVENTS, EFX_STATS_ENABLE_EVENTS, EFX_STATS_DISABLE, } efx_stats_action_t; static __checkReturn efx_rc_t efx_mcdi_mac_stats( __in efx_nic_t *enp, __in_opt efsys_mem_t *esmp, __in efx_stats_action_t action) { efx_mcdi_req_t req; uint8_t payload[MAX(MC_CMD_MAC_STATS_IN_LEN, MC_CMD_MAC_STATS_OUT_DMA_LEN)]; int clear = (action == EFX_STATS_CLEAR); int upload = (action == EFX_STATS_UPLOAD); int enable = (action == EFX_STATS_ENABLE_NOEVENTS); int events = (action == EFX_STATS_ENABLE_EVENTS); int disable = (action == EFX_STATS_DISABLE); efx_rc_t rc; (void) memset(payload, 0, sizeof (payload)); req.emr_cmd = MC_CMD_MAC_STATS; req.emr_in_buf = payload; req.emr_in_length = MC_CMD_MAC_STATS_IN_LEN; req.emr_out_buf = payload; req.emr_out_length = MC_CMD_MAC_STATS_OUT_DMA_LEN; MCDI_IN_POPULATE_DWORD_6(req, MAC_STATS_IN_CMD, MAC_STATS_IN_DMA, upload, MAC_STATS_IN_CLEAR, clear, MAC_STATS_IN_PERIODIC_CHANGE, enable | events | disable, MAC_STATS_IN_PERIODIC_ENABLE, enable | events, MAC_STATS_IN_PERIODIC_NOEVENT, !events, MAC_STATS_IN_PERIOD_MS, (enable | events) ? 1000: 0); if (esmp != NULL) { int bytes = MC_CMD_MAC_NSTATS * sizeof (uint64_t); EFX_STATIC_ASSERT(MC_CMD_MAC_NSTATS * sizeof (uint64_t) <= EFX_MAC_STATS_SIZE); MCDI_IN_SET_DWORD(req, MAC_STATS_IN_DMA_ADDR_LO, EFSYS_MEM_ADDR(esmp) & 0xffffffff); MCDI_IN_SET_DWORD(req, MAC_STATS_IN_DMA_ADDR_HI, EFSYS_MEM_ADDR(esmp) >> 32); MCDI_IN_SET_DWORD(req, MAC_STATS_IN_DMA_LEN, bytes); } else { EFSYS_ASSERT(!upload && !enable && !events); } /* * NOTE: Do not use EVB_PORT_ID_ASSIGNED when disabling periodic stats, * as this may fail (and leave periodic DMA enabled) if the * vadapter has already been deleted. */ MCDI_IN_SET_DWORD(req, MAC_STATS_IN_PORT_ID, (disable ? EVB_PORT_ID_NULL : enp->en_vport_id)); efx_mcdi_execute(enp, &req); if (req.emr_rc != 0) { /* EF10: Expect ENOENT if no DMA queues are initialised */ if ((req.emr_rc != ENOENT) || (enp->en_rx_qcount + enp->en_tx_qcount != 0)) { rc = req.emr_rc; goto fail1; } } return (0); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } __checkReturn efx_rc_t efx_mcdi_mac_stats_clear( __in efx_nic_t *enp) { efx_rc_t rc; if ((rc = efx_mcdi_mac_stats(enp, NULL, EFX_STATS_CLEAR)) != 0) goto fail1; return (0); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } __checkReturn efx_rc_t efx_mcdi_mac_stats_upload( __in efx_nic_t *enp, __in efsys_mem_t *esmp) { efx_rc_t rc; /* * The MC DMAs aggregate statistics for our convenience, so we can * avoid having to pull the statistics buffer into the cache to * maintain cumulative statistics. */ if ((rc = efx_mcdi_mac_stats(enp, esmp, EFX_STATS_UPLOAD)) != 0) goto fail1; return (0); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } __checkReturn efx_rc_t efx_mcdi_mac_stats_periodic( __in efx_nic_t *enp, __in efsys_mem_t *esmp, __in uint16_t period, __in boolean_t events) { efx_rc_t rc; /* * The MC DMAs aggregate statistics for our convenience, so we can * avoid having to pull the statistics buffer into the cache to * maintain cumulative statistics. * Huntington uses a fixed 1sec period, so use that on Siena too. */ if (period == 0) rc = efx_mcdi_mac_stats(enp, NULL, EFX_STATS_DISABLE); else if (events) rc = efx_mcdi_mac_stats(enp, esmp, EFX_STATS_ENABLE_EVENTS); else rc = efx_mcdi_mac_stats(enp, esmp, EFX_STATS_ENABLE_NOEVENTS); if (rc != 0) goto fail1; return (0); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } #endif /* EFSYS_OPT_MAC_STATS */ #if EFSYS_OPT_HUNTINGTON || EFSYS_OPT_MEDFORD /* * This function returns the pf and vf number of a function. If it is a pf the * vf number is 0xffff. The vf number is the index of the vf on that * function. So if you have 3 vfs on pf 0 the 3 vfs will return (pf=0,vf=0), * (pf=0,vf=1), (pf=0,vf=2) aand the pf will return (pf=0, vf=0xffff). */ __checkReturn efx_rc_t efx_mcdi_get_function_info( __in efx_nic_t *enp, __out uint32_t *pfp, __out_opt uint32_t *vfp) { efx_mcdi_req_t req; uint8_t payload[MAX(MC_CMD_GET_FUNCTION_INFO_IN_LEN, MC_CMD_GET_FUNCTION_INFO_OUT_LEN)]; efx_rc_t rc; (void) memset(payload, 0, sizeof (payload)); req.emr_cmd = MC_CMD_GET_FUNCTION_INFO; req.emr_in_buf = payload; req.emr_in_length = MC_CMD_GET_FUNCTION_INFO_IN_LEN; req.emr_out_buf = payload; req.emr_out_length = MC_CMD_GET_FUNCTION_INFO_OUT_LEN; efx_mcdi_execute(enp, &req); if (req.emr_rc != 0) { rc = req.emr_rc; goto fail1; } if (req.emr_out_length_used < MC_CMD_GET_FUNCTION_INFO_OUT_LEN) { rc = EMSGSIZE; goto fail2; } *pfp = MCDI_OUT_DWORD(req, GET_FUNCTION_INFO_OUT_PF); if (vfp != NULL) *vfp = MCDI_OUT_DWORD(req, GET_FUNCTION_INFO_OUT_VF); return (0); fail2: EFSYS_PROBE(fail2); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } __checkReturn efx_rc_t efx_mcdi_privilege_mask( __in efx_nic_t *enp, __in uint32_t pf, __in uint32_t vf, __out uint32_t *maskp) { efx_mcdi_req_t req; uint8_t payload[MAX(MC_CMD_PRIVILEGE_MASK_IN_LEN, MC_CMD_PRIVILEGE_MASK_OUT_LEN)]; efx_rc_t rc; (void) memset(payload, 0, sizeof (payload)); req.emr_cmd = MC_CMD_PRIVILEGE_MASK; req.emr_in_buf = payload; req.emr_in_length = MC_CMD_PRIVILEGE_MASK_IN_LEN; req.emr_out_buf = payload; req.emr_out_length = MC_CMD_PRIVILEGE_MASK_OUT_LEN; MCDI_IN_POPULATE_DWORD_2(req, PRIVILEGE_MASK_IN_FUNCTION, PRIVILEGE_MASK_IN_FUNCTION_PF, pf, PRIVILEGE_MASK_IN_FUNCTION_VF, vf); efx_mcdi_execute(enp, &req); if (req.emr_rc != 0) { rc = req.emr_rc; goto fail1; } if (req.emr_out_length_used < MC_CMD_PRIVILEGE_MASK_OUT_LEN) { rc = EMSGSIZE; goto fail2; } *maskp = MCDI_OUT_DWORD(req, PRIVILEGE_MASK_OUT_OLD_MASK); return (0); fail2: EFSYS_PROBE(fail2); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } #endif /* EFSYS_OPT_HUNTINGTON || EFSYS_OPT_MEDFORD */ __checkReturn efx_rc_t efx_mcdi_set_workaround( __in efx_nic_t *enp, __in uint32_t type, __in boolean_t enabled, __out_opt uint32_t *flagsp) { efx_mcdi_req_t req; uint8_t payload[MAX(MC_CMD_WORKAROUND_IN_LEN, MC_CMD_WORKAROUND_EXT_OUT_LEN)]; efx_rc_t rc; (void) memset(payload, 0, sizeof (payload)); req.emr_cmd = MC_CMD_WORKAROUND; req.emr_in_buf = payload; req.emr_in_length = MC_CMD_WORKAROUND_IN_LEN; req.emr_out_buf = payload; req.emr_out_length = MC_CMD_WORKAROUND_OUT_LEN; MCDI_IN_SET_DWORD(req, WORKAROUND_IN_TYPE, type); MCDI_IN_SET_DWORD(req, WORKAROUND_IN_ENABLED, enabled ? 1 : 0); efx_mcdi_execute_quiet(enp, &req); if (req.emr_rc != 0) { rc = req.emr_rc; goto fail1; } if (flagsp != NULL) { if (req.emr_out_length_used >= MC_CMD_WORKAROUND_EXT_OUT_LEN) *flagsp = MCDI_OUT_DWORD(req, WORKAROUND_EXT_OUT_FLAGS); else *flagsp = 0; } return (0); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } __checkReturn efx_rc_t efx_mcdi_get_workarounds( __in efx_nic_t *enp, __out_opt uint32_t *implementedp, __out_opt uint32_t *enabledp) { efx_mcdi_req_t req; uint8_t payload[MC_CMD_GET_WORKAROUNDS_OUT_LEN]; efx_rc_t rc; (void) memset(payload, 0, sizeof (payload)); req.emr_cmd = MC_CMD_GET_WORKAROUNDS; req.emr_in_buf = NULL; req.emr_in_length = 0; req.emr_out_buf = payload; req.emr_out_length = MC_CMD_GET_WORKAROUNDS_OUT_LEN; efx_mcdi_execute(enp, &req); if (req.emr_rc != 0) { rc = req.emr_rc; goto fail1; } if (implementedp != NULL) { *implementedp = MCDI_OUT_DWORD(req, GET_WORKAROUNDS_OUT_IMPLEMENTED); } if (enabledp != NULL) { *enabledp = MCDI_OUT_DWORD(req, GET_WORKAROUNDS_OUT_ENABLED); } return (0); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } /* * Size of media information page in accordance with SFF-8472 and SFF-8436. * It is used in MCDI interface as well. */ #define EFX_PHY_MEDIA_INFO_PAGE_SIZE 0x80 static __checkReturn efx_rc_t efx_mcdi_get_phy_media_info( __in efx_nic_t *enp, __in uint32_t mcdi_page, __in uint8_t offset, __in uint8_t len, __out_bcount(len) uint8_t *data) { efx_mcdi_req_t req; uint8_t payload[MAX(MC_CMD_GET_PHY_MEDIA_INFO_IN_LEN, MC_CMD_GET_PHY_MEDIA_INFO_OUT_LEN( EFX_PHY_MEDIA_INFO_PAGE_SIZE))]; efx_rc_t rc; EFSYS_ASSERT((uint32_t)offset + len <= EFX_PHY_MEDIA_INFO_PAGE_SIZE); (void) memset(payload, 0, sizeof (payload)); req.emr_cmd = MC_CMD_GET_PHY_MEDIA_INFO; req.emr_in_buf = payload; req.emr_in_length = MC_CMD_GET_PHY_MEDIA_INFO_IN_LEN; req.emr_out_buf = payload; req.emr_out_length = MC_CMD_GET_PHY_MEDIA_INFO_OUT_LEN(EFX_PHY_MEDIA_INFO_PAGE_SIZE); MCDI_IN_SET_DWORD(req, GET_PHY_MEDIA_INFO_IN_PAGE, mcdi_page); efx_mcdi_execute(enp, &req); if (req.emr_rc != 0) { rc = req.emr_rc; goto fail1; } if (req.emr_out_length_used != MC_CMD_GET_PHY_MEDIA_INFO_OUT_LEN(EFX_PHY_MEDIA_INFO_PAGE_SIZE)) { rc = EMSGSIZE; goto fail2; } if (MCDI_OUT_DWORD(req, GET_PHY_MEDIA_INFO_OUT_DATALEN) != EFX_PHY_MEDIA_INFO_PAGE_SIZE) { rc = EIO; goto fail3; } memcpy(data, MCDI_OUT2(req, uint8_t, GET_PHY_MEDIA_INFO_OUT_DATA) + offset, len); return (0); fail3: EFSYS_PROBE(fail3); fail2: EFSYS_PROBE(fail2); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } /* * 2-wire device address of the base information in accordance with SFF-8472 * Diagnostic Monitoring Interface for Optical Transceivers section * 4 Memory Organization. */ #define EFX_PHY_MEDIA_INFO_DEV_ADDR_SFP_BASE 0xA0 /* * 2-wire device address of the digital diagnostics monitoring interface * in accordance with SFF-8472 Diagnostic Monitoring Interface for Optical * Transceivers section 4 Memory Organization. */ #define EFX_PHY_MEDIA_INFO_DEV_ADDR_SFP_DDM 0xA2 /* * Hard wired 2-wire device address for QSFP+ in accordance with SFF-8436 * QSFP+ 10 Gbs 4X PLUGGABLE TRANSCEIVER section 7.4 Device Addressing and * Operation. */ #define EFX_PHY_MEDIA_INFO_DEV_ADDR_QSFP 0xA0 __checkReturn efx_rc_t efx_mcdi_phy_module_get_info( __in efx_nic_t *enp, __in uint8_t dev_addr, __in uint8_t offset, __in uint8_t len, __out_bcount(len) uint8_t *data) { efx_port_t *epp = &(enp->en_port); efx_rc_t rc; uint32_t mcdi_lower_page; uint32_t mcdi_upper_page; EFSYS_ASSERT3U(enp->en_mod_flags, &, EFX_MOD_PROBE); /* * Map device address to MC_CMD_GET_PHY_MEDIA_INFO pages. * Offset plus length interface allows to access page 0 only. * I.e. non-zero upper pages are not accessible. * See SFF-8472 section 4 Memory Organization and SFF-8436 section 7.6 * QSFP+ Memory Map for details on how information is structured * and accessible. */ switch (epp->ep_fixed_port_type) { case EFX_PHY_MEDIA_SFP_PLUS: /* * In accordance with SFF-8472 Diagnostic Monitoring * Interface for Optical Transceivers section 4 Memory * Organization two 2-wire addresses are defined. */ switch (dev_addr) { /* Base information */ case EFX_PHY_MEDIA_INFO_DEV_ADDR_SFP_BASE: /* * MCDI page 0 should be used to access lower * page 0 (0x00 - 0x7f) at the device address 0xA0. */ mcdi_lower_page = 0; /* * MCDI page 1 should be used to access upper * page 0 (0x80 - 0xff) at the device address 0xA0. */ mcdi_upper_page = 1; break; /* Diagnostics */ case EFX_PHY_MEDIA_INFO_DEV_ADDR_SFP_DDM: /* * MCDI page 2 should be used to access lower * page 0 (0x00 - 0x7f) at the device address 0xA2. */ mcdi_lower_page = 2; /* * MCDI page 3 should be used to access upper * page 0 (0x80 - 0xff) at the device address 0xA2. */ mcdi_upper_page = 3; break; default: rc = ENOTSUP; goto fail1; } break; case EFX_PHY_MEDIA_QSFP_PLUS: switch (dev_addr) { case EFX_PHY_MEDIA_INFO_DEV_ADDR_QSFP: /* * MCDI page -1 should be used to access lower page 0 * (0x00 - 0x7f). */ mcdi_lower_page = (uint32_t)-1; /* * MCDI page 0 should be used to access upper page 0 * (0x80h - 0xff). */ mcdi_upper_page = 0; break; default: rc = ENOTSUP; goto fail1; } break; default: rc = ENOTSUP; goto fail1; } if (offset < EFX_PHY_MEDIA_INFO_PAGE_SIZE) { uint8_t read_len = MIN(len, EFX_PHY_MEDIA_INFO_PAGE_SIZE - offset); rc = efx_mcdi_get_phy_media_info(enp, mcdi_lower_page, offset, read_len, data); if (rc != 0) goto fail2; data += read_len; len -= read_len; offset = 0; } else { offset -= EFX_PHY_MEDIA_INFO_PAGE_SIZE; } if (len > 0) { EFSYS_ASSERT3U(len, <=, EFX_PHY_MEDIA_INFO_PAGE_SIZE); EFSYS_ASSERT3U(offset, <, EFX_PHY_MEDIA_INFO_PAGE_SIZE); rc = efx_mcdi_get_phy_media_info(enp, mcdi_upper_page, offset, len, data); if (rc != 0) goto fail3; } return (0); fail3: EFSYS_PROBE(fail3); fail2: EFSYS_PROBE(fail2); fail1: EFSYS_PROBE1(fail1, efx_rc_t, rc); return (rc); } #endif /* EFSYS_OPT_MCDI */ Index: head/sys/dev/sound/pci/hda/hdaa.c =================================================================== --- head/sys/dev/sound/pci/hda/hdaa.c (revision 308456) +++ head/sys/dev/sound/pci/hda/hdaa.c (revision 308457) @@ -1,7149 +1,7149 @@ /*- * Copyright (c) 2006 Stephane E. Potvin * Copyright (c) 2006 Ariff Abdullah * Copyright (c) 2008-2012 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Intel High Definition Audio (Audio function) driver for FreeBSD. */ #ifdef HAVE_KERNEL_OPTION_HEADERS #include "opt_snd.h" #endif #include #include #include #include #include #include #include "mixer_if.h" SND_DECLARE_FILE("$FreeBSD$"); #define hdaa_lock(devinfo) snd_mtxlock((devinfo)->lock) #define hdaa_unlock(devinfo) snd_mtxunlock((devinfo)->lock) #define hdaa_lockassert(devinfo) snd_mtxassert((devinfo)->lock) #define hdaa_lockowned(devinfo) mtx_owned((devinfo)->lock) static const struct { const char *key; uint32_t value; } hdaa_quirks_tab[] = { { "softpcmvol", HDAA_QUIRK_SOFTPCMVOL }, { "fixedrate", HDAA_QUIRK_FIXEDRATE }, { "forcestereo", HDAA_QUIRK_FORCESTEREO }, { "eapdinv", HDAA_QUIRK_EAPDINV }, { "senseinv", HDAA_QUIRK_SENSEINV }, { "ivref50", HDAA_QUIRK_IVREF50 }, { "ivref80", HDAA_QUIRK_IVREF80 }, { "ivref100", HDAA_QUIRK_IVREF100 }, { "ovref50", HDAA_QUIRK_OVREF50 }, { "ovref80", HDAA_QUIRK_OVREF80 }, { "ovref100", HDAA_QUIRK_OVREF100 }, { "ivref", HDAA_QUIRK_IVREF }, { "ovref", HDAA_QUIRK_OVREF }, { "vref", HDAA_QUIRK_VREF }, }; #define HDA_PARSE_MAXDEPTH 10 MALLOC_DEFINE(M_HDAA, "hdaa", "HDA Audio"); static const char *HDA_COLORS[16] = {"Unknown", "Black", "Grey", "Blue", "Green", "Red", "Orange", "Yellow", "Purple", "Pink", "Res.A", "Res.B", "Res.C", "Res.D", "White", "Other"}; static const char *HDA_DEVS[16] = {"Line-out", "Speaker", "Headphones", "CD", "SPDIF-out", "Digital-out", "Modem-line", "Modem-handset", "Line-in", "AUX", "Mic", "Telephony", "SPDIF-in", "Digital-in", "Res.E", "Other"}; static const char *HDA_CONNS[4] = {"Jack", "None", "Fixed", "Both"}; static const char *HDA_CONNECTORS[16] = { "Unknown", "1/8", "1/4", "ATAPI", "RCA", "Optical", "Digital", "Analog", "DIN", "XLR", "RJ-11", "Combo", "0xc", "0xd", "0xe", "Other" }; static const char *HDA_LOCS[64] = { "0x00", "Rear", "Front", "Left", "Right", "Top", "Bottom", "Rear-panel", "Drive-bay", "0x09", "0x0a", "0x0b", "0x0c", "0x0d", "0x0e", "0x0f", "Internal", "0x11", "0x12", "0x13", "0x14", "0x15", "0x16", "Riser", "0x18", "Onboard", "0x1a", "0x1b", "0x1c", "0x1d", "0x1e", "0x1f", "External", "Ext-Rear", "Ext-Front", "Ext-Left", "Ext-Right", "Ext-Top", "Ext-Bottom", "0x07", "0x28", "0x29", "0x2a", "0x2b", "0x2c", "0x2d", "0x2e", "0x2f", "Other", "0x31", "0x32", "0x33", "0x34", "0x35", "Other-Bott", "Lid-In", "Lid-Out", "0x39", "0x3a", "0x3b", "0x3c", "0x3d", "0x3e", "0x3f" }; static const char *HDA_GPIO_ACTIONS[8] = { "keep", "set", "clear", "disable", "input", "0x05", "0x06", "0x07"}; static const char *HDA_HDMI_CODING_TYPES[18] = { "undefined", "LPCM", "AC-3", "MPEG1", "MP3", "MPEG2", "AAC-LC", "DTS", "ATRAC", "DSD", "E-AC-3", "DTS-HD", "MLP", "DST", "WMAPro", "HE-AAC", "HE-AACv2", "MPEG-Surround" }; /* Default */ static uint32_t hdaa_fmt[] = { SND_FORMAT(AFMT_S16_LE, 2, 0), 0 }; static struct pcmchan_caps hdaa_caps = {48000, 48000, hdaa_fmt, 0}; static const struct { uint32_t rate; int valid; uint16_t base; uint16_t mul; uint16_t div; } hda_rate_tab[] = { { 8000, 1, 0x0000, 0x0000, 0x0500 }, /* (48000 * 1) / 6 */ { 9600, 0, 0x0000, 0x0000, 0x0400 }, /* (48000 * 1) / 5 */ { 12000, 0, 0x0000, 0x0000, 0x0300 }, /* (48000 * 1) / 4 */ { 16000, 1, 0x0000, 0x0000, 0x0200 }, /* (48000 * 1) / 3 */ { 18000, 0, 0x0000, 0x1000, 0x0700 }, /* (48000 * 3) / 8 */ { 19200, 0, 0x0000, 0x0800, 0x0400 }, /* (48000 * 2) / 5 */ { 24000, 0, 0x0000, 0x0000, 0x0100 }, /* (48000 * 1) / 2 */ { 28800, 0, 0x0000, 0x1000, 0x0400 }, /* (48000 * 3) / 5 */ { 32000, 1, 0x0000, 0x0800, 0x0200 }, /* (48000 * 2) / 3 */ { 36000, 0, 0x0000, 0x1000, 0x0300 }, /* (48000 * 3) / 4 */ { 38400, 0, 0x0000, 0x1800, 0x0400 }, /* (48000 * 4) / 5 */ { 48000, 1, 0x0000, 0x0000, 0x0000 }, /* (48000 * 1) / 1 */ { 64000, 0, 0x0000, 0x1800, 0x0200 }, /* (48000 * 4) / 3 */ { 72000, 0, 0x0000, 0x1000, 0x0100 }, /* (48000 * 3) / 2 */ { 96000, 1, 0x0000, 0x0800, 0x0000 }, /* (48000 * 2) / 1 */ { 144000, 0, 0x0000, 0x1000, 0x0000 }, /* (48000 * 3) / 1 */ { 192000, 1, 0x0000, 0x1800, 0x0000 }, /* (48000 * 4) / 1 */ { 8820, 0, 0x4000, 0x0000, 0x0400 }, /* (44100 * 1) / 5 */ { 11025, 1, 0x4000, 0x0000, 0x0300 }, /* (44100 * 1) / 4 */ { 12600, 0, 0x4000, 0x0800, 0x0600 }, /* (44100 * 2) / 7 */ { 14700, 0, 0x4000, 0x0000, 0x0200 }, /* (44100 * 1) / 3 */ { 17640, 0, 0x4000, 0x0800, 0x0400 }, /* (44100 * 2) / 5 */ { 18900, 0, 0x4000, 0x1000, 0x0600 }, /* (44100 * 3) / 7 */ { 22050, 1, 0x4000, 0x0000, 0x0100 }, /* (44100 * 1) / 2 */ { 25200, 0, 0x4000, 0x1800, 0x0600 }, /* (44100 * 4) / 7 */ { 26460, 0, 0x4000, 0x1000, 0x0400 }, /* (44100 * 3) / 5 */ { 29400, 0, 0x4000, 0x0800, 0x0200 }, /* (44100 * 2) / 3 */ { 33075, 0, 0x4000, 0x1000, 0x0300 }, /* (44100 * 3) / 4 */ { 35280, 0, 0x4000, 0x1800, 0x0400 }, /* (44100 * 4) / 5 */ { 44100, 1, 0x4000, 0x0000, 0x0000 }, /* (44100 * 1) / 1 */ { 58800, 0, 0x4000, 0x1800, 0x0200 }, /* (44100 * 4) / 3 */ { 66150, 0, 0x4000, 0x1000, 0x0100 }, /* (44100 * 3) / 2 */ { 88200, 1, 0x4000, 0x0800, 0x0000 }, /* (44100 * 2) / 1 */ { 132300, 0, 0x4000, 0x1000, 0x0000 }, /* (44100 * 3) / 1 */ { 176400, 1, 0x4000, 0x1800, 0x0000 }, /* (44100 * 4) / 1 */ }; #define HDA_RATE_TAB_LEN (sizeof(hda_rate_tab) / sizeof(hda_rate_tab[0])) const static char *ossnames[] = SOUND_DEVICE_NAMES; /**************************************************************************** * Function prototypes ****************************************************************************/ static int hdaa_pcmchannel_setup(struct hdaa_chan *); static void hdaa_widget_connection_select(struct hdaa_widget *, uint8_t); static void hdaa_audio_ctl_amp_set(struct hdaa_audio_ctl *, uint32_t, int, int); static struct hdaa_audio_ctl *hdaa_audio_ctl_amp_get(struct hdaa_devinfo *, nid_t, int, int, int); static void hdaa_audio_ctl_amp_set_internal(struct hdaa_devinfo *, nid_t, int, int, int, int, int, int); static void hdaa_dump_pin_config(struct hdaa_widget *w, uint32_t conf); static char * hdaa_audio_ctl_ossmixer_mask2allname(uint32_t mask, char *buf, size_t len) { int i, first = 1; bzero(buf, len); for (i = 0; i < SOUND_MIXER_NRDEVICES; i++) { if (mask & (1 << i)) { if (first == 0) strlcat(buf, ", ", len); strlcat(buf, ossnames[i], len); first = 0; } } return (buf); } static struct hdaa_audio_ctl * hdaa_audio_ctl_each(struct hdaa_devinfo *devinfo, int *index) { if (devinfo == NULL || index == NULL || devinfo->ctl == NULL || devinfo->ctlcnt < 1 || *index < 0 || *index >= devinfo->ctlcnt) return (NULL); return (&devinfo->ctl[(*index)++]); } static struct hdaa_audio_ctl * hdaa_audio_ctl_amp_get(struct hdaa_devinfo *devinfo, nid_t nid, int dir, int index, int cnt) { struct hdaa_audio_ctl *ctl; int i, found = 0; if (devinfo == NULL || devinfo->ctl == NULL) return (NULL); i = 0; while ((ctl = hdaa_audio_ctl_each(devinfo, &i)) != NULL) { if (ctl->enable == 0) continue; if (ctl->widget->nid != nid) continue; if (dir && ctl->ndir != dir) continue; if (index >= 0 && ctl->ndir == HDAA_CTL_IN && ctl->dir == ctl->ndir && ctl->index != index) continue; found++; if (found == cnt || cnt <= 0) return (ctl); } return (NULL); } static const struct matrix { struct pcmchan_matrix m; int analog; } matrixes[] = { { SND_CHN_MATRIX_MAP_1_0, 1 }, { SND_CHN_MATRIX_MAP_2_0, 1 }, { SND_CHN_MATRIX_MAP_2_1, 0 }, { SND_CHN_MATRIX_MAP_3_0, 0 }, { SND_CHN_MATRIX_MAP_3_1, 0 }, { SND_CHN_MATRIX_MAP_4_0, 1 }, { SND_CHN_MATRIX_MAP_4_1, 0 }, { SND_CHN_MATRIX_MAP_5_0, 0 }, { SND_CHN_MATRIX_MAP_5_1, 1 }, { SND_CHN_MATRIX_MAP_6_0, 0 }, { SND_CHN_MATRIX_MAP_6_1, 0 }, { SND_CHN_MATRIX_MAP_7_0, 0 }, { SND_CHN_MATRIX_MAP_7_1, 1 }, }; static const char *channel_names[] = SND_CHN_T_NAMES; /* * Connected channels change handler. */ static void hdaa_channels_handler(struct hdaa_audio_as *as) { struct hdaa_pcm_devinfo *pdevinfo = as->pdevinfo; struct hdaa_devinfo *devinfo = pdevinfo->devinfo; struct hdaa_chan *ch = &devinfo->chans[as->chans[0]]; struct hdaa_widget *w; uint8_t *eld; int i, total, sub, assume, channels; uint16_t cpins, upins, tpins; cpins = upins = 0; eld = NULL; for (i = 0; i < 16; i++) { if (as->pins[i] <= 0) continue; w = hdaa_widget_get(devinfo, as->pins[i]); if (w == NULL) continue; if (w->wclass.pin.connected == 1) cpins |= (1 << i); else if (w->wclass.pin.connected != 0) upins |= (1 << i); if (w->eld != NULL && w->eld_len >= 8) eld = w->eld; } tpins = cpins | upins; if (as->hpredir >= 0) tpins &= 0x7fff; if (tpins == 0) tpins = as->pinset; total = sub = assume = channels = 0; if (eld) { /* Map CEA speakers to sound(4) channels. */ if (eld[7] & 0x01) /* Front Left/Right */ channels |= SND_CHN_T_MASK_FL | SND_CHN_T_MASK_FR; if (eld[7] & 0x02) /* Low Frequency Effect */ channels |= SND_CHN_T_MASK_LF; if (eld[7] & 0x04) /* Front Center */ channels |= SND_CHN_T_MASK_FC; if (eld[7] & 0x08) { /* Rear Left/Right */ /* If we have both RLR and RLRC, report RLR as side. */ if (eld[7] & 0x40) /* Rear Left/Right Center */ channels |= SND_CHN_T_MASK_SL | SND_CHN_T_MASK_SR; else channels |= SND_CHN_T_MASK_BL | SND_CHN_T_MASK_BR; } if (eld[7] & 0x10) /* Rear center */ channels |= SND_CHN_T_MASK_BC; if (eld[7] & 0x20) /* Front Left/Right Center */ channels |= SND_CHN_T_MASK_FLC | SND_CHN_T_MASK_FRC; if (eld[7] & 0x40) /* Rear Left/Right Center */ channels |= SND_CHN_T_MASK_BL | SND_CHN_T_MASK_BR; } else if (as->pinset != 0 && (tpins & 0xffe0) == 0) { /* Map UAA speakers to sound(4) channels. */ if (tpins & 0x0001) channels |= SND_CHN_T_MASK_FL | SND_CHN_T_MASK_FR; if (tpins & 0x0002) channels |= SND_CHN_T_MASK_FC | SND_CHN_T_MASK_LF; if (tpins & 0x0004) channels |= SND_CHN_T_MASK_BL | SND_CHN_T_MASK_BR; if (tpins & 0x0008) channels |= SND_CHN_T_MASK_FLC | SND_CHN_T_MASK_FRC; if (tpins & 0x0010) { /* If there is no back pin, report side as back. */ if ((as->pinset & 0x0004) == 0) channels |= SND_CHN_T_MASK_BL | SND_CHN_T_MASK_BR; else channels |= SND_CHN_T_MASK_SL | SND_CHN_T_MASK_SR; } } else if (as->mixed) { /* Mixed assoc can be only stereo or theoretically mono. */ if (ch->channels == 1) channels |= SND_CHN_T_MASK_FC; else channels |= SND_CHN_T_MASK_FL | SND_CHN_T_MASK_FR; } if (channels) { /* We have some usable channels info. */ HDA_BOOTVERBOSE( device_printf(pdevinfo->dev, "%s channel set is: ", as->dir == HDAA_CTL_OUT ? "Playback" : "Recording"); for (i = 0; i < SND_CHN_T_MAX; i++) if (channels & (1 << i)) printf("%s, ", channel_names[i]); printf("\n"); ); /* Look for maximal fitting matrix. */ for (i = 0; i < sizeof(matrixes) / sizeof(struct matrix); i++) { if (as->pinset != 0 && matrixes[i].analog == 0) continue; if ((matrixes[i].m.mask & ~channels) == 0) { total = matrixes[i].m.channels; sub = matrixes[i].m.ext; } } } if (total == 0) { assume = 1; total = ch->channels; sub = (total == 6 || total == 8) ? 1 : 0; } HDA_BOOTVERBOSE( device_printf(pdevinfo->dev, "%s channel matrix is: %s%d.%d (%s)\n", as->dir == HDAA_CTL_OUT ? "Playback" : "Recording", assume ? "unknown, assuming " : "", total - sub, sub, cpins != 0 ? "connected" : (upins != 0 ? "unknown" : "disconnected")); ); } /* * Headphones redirection change handler. */ static void hdaa_hpredir_handler(struct hdaa_widget *w) { struct hdaa_devinfo *devinfo = w->devinfo; struct hdaa_audio_as *as = &devinfo->as[w->bindas]; struct hdaa_widget *w1; struct hdaa_audio_ctl *ctl; uint32_t val; int j, connected = w->wclass.pin.connected; HDA_BOOTVERBOSE( device_printf((as->pdevinfo && as->pdevinfo->dev) ? as->pdevinfo->dev : devinfo->dev, "Redirect output to: %s\n", connected ? "headphones": "main"); ); /* (Un)Mute headphone pin. */ ctl = hdaa_audio_ctl_amp_get(devinfo, w->nid, HDAA_CTL_IN, -1, 1); if (ctl != NULL && ctl->mute) { /* If pin has muter - use it. */ val = connected ? 0 : 1; if (val != ctl->forcemute) { ctl->forcemute = val; hdaa_audio_ctl_amp_set(ctl, HDAA_AMP_MUTE_DEFAULT, HDAA_AMP_VOL_DEFAULT, HDAA_AMP_VOL_DEFAULT); } } else { /* If there is no muter - disable pin output. */ if (connected) val = w->wclass.pin.ctrl | HDA_CMD_SET_PIN_WIDGET_CTRL_OUT_ENABLE; else val = w->wclass.pin.ctrl & ~HDA_CMD_SET_PIN_WIDGET_CTRL_OUT_ENABLE; if (val != w->wclass.pin.ctrl) { w->wclass.pin.ctrl = val; hda_command(devinfo->dev, HDA_CMD_SET_PIN_WIDGET_CTRL(0, w->nid, w->wclass.pin.ctrl)); } } /* (Un)Mute other pins. */ for (j = 0; j < 15; j++) { if (as->pins[j] <= 0) continue; ctl = hdaa_audio_ctl_amp_get(devinfo, as->pins[j], HDAA_CTL_IN, -1, 1); if (ctl != NULL && ctl->mute) { /* If pin has muter - use it. */ val = connected ? 1 : 0; if (val == ctl->forcemute) continue; ctl->forcemute = val; hdaa_audio_ctl_amp_set(ctl, HDAA_AMP_MUTE_DEFAULT, HDAA_AMP_VOL_DEFAULT, HDAA_AMP_VOL_DEFAULT); continue; } /* If there is no muter - disable pin output. */ w1 = hdaa_widget_get(devinfo, as->pins[j]); if (w1 != NULL) { if (connected) val = w1->wclass.pin.ctrl & ~HDA_CMD_SET_PIN_WIDGET_CTRL_OUT_ENABLE; else val = w1->wclass.pin.ctrl | HDA_CMD_SET_PIN_WIDGET_CTRL_OUT_ENABLE; if (val != w1->wclass.pin.ctrl) { w1->wclass.pin.ctrl = val; hda_command(devinfo->dev, HDA_CMD_SET_PIN_WIDGET_CTRL(0, w1->nid, w1->wclass.pin.ctrl)); } } } } /* * Recording source change handler. */ static void hdaa_autorecsrc_handler(struct hdaa_audio_as *as, struct hdaa_widget *w) { struct hdaa_pcm_devinfo *pdevinfo = as->pdevinfo; struct hdaa_devinfo *devinfo; struct hdaa_widget *w1; int i, mask, fullmask, prio, bestprio; char buf[128]; if (!as->mixed || pdevinfo == NULL || pdevinfo->mixer == NULL) return; /* Don't touch anything if we asked not to. */ if (pdevinfo->autorecsrc == 0 || (pdevinfo->autorecsrc == 1 && w != NULL)) return; /* Don't touch anything if "mix" or "speaker" selected. */ if (pdevinfo->recsrc & (SOUND_MASK_IMIX | SOUND_MASK_SPEAKER)) return; /* Don't touch anything if several selected. */ if (ffs(pdevinfo->recsrc) != fls(pdevinfo->recsrc)) return; devinfo = pdevinfo->devinfo; mask = fullmask = 0; bestprio = 0; for (i = 0; i < 16; i++) { if (as->pins[i] <= 0) continue; w1 = hdaa_widget_get(devinfo, as->pins[i]); if (w1 == NULL || w1->enable == 0) continue; if (w1->wclass.pin.connected == 0) continue; prio = (w1->wclass.pin.connected == 1) ? 2 : 1; if (prio < bestprio) continue; if (prio > bestprio) { mask = 0; bestprio = prio; } mask |= (1 << w1->ossdev); fullmask |= (1 << w1->ossdev); } if (mask == 0) return; /* Prefer newly connected input. */ if (w != NULL && (mask & (1 << w->ossdev))) mask = (1 << w->ossdev); /* Prefer previously selected input */ if (mask & pdevinfo->recsrc) mask &= pdevinfo->recsrc; /* Prefer mic. */ if (mask & SOUND_MASK_MIC) mask = SOUND_MASK_MIC; /* Prefer monitor (2nd mic). */ if (mask & SOUND_MASK_MONITOR) mask = SOUND_MASK_MONITOR; /* Just take first one. */ mask = (1 << (ffs(mask) - 1)); HDA_BOOTVERBOSE( hdaa_audio_ctl_ossmixer_mask2allname(mask, buf, sizeof(buf)); device_printf(pdevinfo->dev, "Automatically set rec source to: %s\n", buf); ); hdaa_unlock(devinfo); mix_setrecsrc(pdevinfo->mixer, mask); hdaa_lock(devinfo); } /* * Jack presence detection event handler. */ static void hdaa_presence_handler(struct hdaa_widget *w) { struct hdaa_devinfo *devinfo = w->devinfo; struct hdaa_audio_as *as; uint32_t res; int connected, old; if (w->enable == 0 || w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) return; if (HDA_PARAM_PIN_CAP_PRESENCE_DETECT_CAP(w->wclass.pin.cap) == 0 || (HDA_CONFIG_DEFAULTCONF_MISC(w->wclass.pin.config) & 1) != 0) return; res = hda_command(devinfo->dev, HDA_CMD_GET_PIN_SENSE(0, w->nid)); connected = (res & HDA_CMD_GET_PIN_SENSE_PRESENCE_DETECT) != 0; if (devinfo->quirks & HDAA_QUIRK_SENSEINV) connected = !connected; old = w->wclass.pin.connected; if (connected == old) return; w->wclass.pin.connected = connected; HDA_BOOTVERBOSE( if (connected || old != 2) { device_printf(devinfo->dev, "Pin sense: nid=%d sense=0x%08x (%sconnected)\n", w->nid, res, !connected ? "dis" : ""); } ); as = &devinfo->as[w->bindas]; if (as->hpredir >= 0 && as->pins[15] == w->nid) hdaa_hpredir_handler(w); if (as->dir == HDAA_CTL_IN && old != 2) hdaa_autorecsrc_handler(as, w); if (old != 2) hdaa_channels_handler(as); } /* * Callback for poll based presence detection. */ static void hdaa_jack_poll_callback(void *arg) { struct hdaa_devinfo *devinfo = arg; struct hdaa_widget *w; int i; hdaa_lock(devinfo); if (devinfo->poll_ival == 0) { hdaa_unlock(devinfo); return; } for (i = 0; i < devinfo->ascnt; i++) { if (devinfo->as[i].hpredir < 0) continue; w = hdaa_widget_get(devinfo, devinfo->as[i].pins[15]); if (w == NULL || w->enable == 0 || w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) continue; hdaa_presence_handler(w); } callout_reset(&devinfo->poll_jack, devinfo->poll_ival, hdaa_jack_poll_callback, devinfo); hdaa_unlock(devinfo); } static void hdaa_eld_dump(struct hdaa_widget *w) { struct hdaa_devinfo *devinfo = w->devinfo; device_t dev = devinfo->dev; uint8_t *sad; int len, mnl, i, sadc, fmt; if (w->eld == NULL || w->eld_len < 4) return; device_printf(dev, "ELD nid=%d: ELD_Ver=%u Baseline_ELD_Len=%u\n", w->nid, w->eld[0] >> 3, w->eld[2]); if ((w->eld[0] >> 3) != 0x02) return; len = min(w->eld_len, (u_int)w->eld[2] * 4); mnl = w->eld[4] & 0x1f; device_printf(dev, "ELD nid=%d: CEA_EDID_Ver=%u MNL=%u\n", w->nid, w->eld[4] >> 5, mnl); sadc = w->eld[5] >> 4; device_printf(dev, "ELD nid=%d: SAD_Count=%u Conn_Type=%u S_AI=%u HDCP=%u\n", w->nid, sadc, (w->eld[5] >> 2) & 0x3, (w->eld[5] >> 1) & 0x1, w->eld[5] & 0x1); device_printf(dev, "ELD nid=%d: Aud_Synch_Delay=%ums\n", w->nid, w->eld[6] * 2); device_printf(dev, "ELD nid=%d: Channels=0x%b\n", w->nid, w->eld[7], "\020\07RLRC\06FLRC\05RC\04RLR\03FC\02LFE\01FLR"); device_printf(dev, "ELD nid=%d: Port_ID=0x%02x%02x%02x%02x%02x%02x%02x%02x\n", w->nid, w->eld[8], w->eld[9], w->eld[10], w->eld[11], w->eld[12], w->eld[13], w->eld[14], w->eld[15]); device_printf(dev, "ELD nid=%d: Manufacturer_Name=0x%02x%02x\n", w->nid, w->eld[16], w->eld[17]); device_printf(dev, "ELD nid=%d: Product_Code=0x%02x%02x\n", w->nid, w->eld[18], w->eld[19]); device_printf(dev, "ELD nid=%d: Monitor_Name_String='%.*s'\n", w->nid, mnl, &w->eld[20]); for (i = 0; i < sadc; i++) { sad = &w->eld[20 + mnl + i * 3]; fmt = (sad[0] >> 3) & 0x0f; if (fmt == HDA_HDMI_CODING_TYPE_REF_CTX) { fmt = (sad[2] >> 3) & 0x1f; if (fmt < 1 || fmt > 3) fmt = 0; else fmt += 14; } device_printf(dev, "ELD nid=%d: %s %dch freqs=0x%b", w->nid, HDA_HDMI_CODING_TYPES[fmt], (sad[0] & 0x07) + 1, sad[1], "\020\007192\006176\00596\00488\00348\00244\00132"); switch (fmt) { case HDA_HDMI_CODING_TYPE_LPCM: printf(" sizes=0x%b", sad[2] & 0x07, "\020\00324\00220\00116"); break; case HDA_HDMI_CODING_TYPE_AC3: case HDA_HDMI_CODING_TYPE_MPEG1: case HDA_HDMI_CODING_TYPE_MP3: case HDA_HDMI_CODING_TYPE_MPEG2: case HDA_HDMI_CODING_TYPE_AACLC: case HDA_HDMI_CODING_TYPE_DTS: case HDA_HDMI_CODING_TYPE_ATRAC: printf(" max_bitrate=%d", sad[2] * 8000); break; case HDA_HDMI_CODING_TYPE_WMAPRO: printf(" profile=%d", sad[2] & 0x07); break; } printf("\n"); } } static void hdaa_eld_handler(struct hdaa_widget *w) { struct hdaa_devinfo *devinfo = w->devinfo; uint32_t res; int i; if (w->enable == 0 || w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) return; if (HDA_PARAM_PIN_CAP_PRESENCE_DETECT_CAP(w->wclass.pin.cap) == 0 || (HDA_CONFIG_DEFAULTCONF_MISC(w->wclass.pin.config) & 1) != 0) return; res = hda_command(devinfo->dev, HDA_CMD_GET_PIN_SENSE(0, w->nid)); if ((w->eld != 0) == ((res & HDA_CMD_GET_PIN_SENSE_ELD_VALID) != 0)) return; if (w->eld != NULL) { w->eld_len = 0; free(w->eld, M_HDAA); w->eld = NULL; } HDA_BOOTVERBOSE( device_printf(devinfo->dev, "Pin sense: nid=%d sense=0x%08x " "(%sconnected, ELD %svalid)\n", w->nid, res, (res & HDA_CMD_GET_PIN_SENSE_PRESENCE_DETECT) ? "" : "dis", (res & HDA_CMD_GET_PIN_SENSE_ELD_VALID) ? "" : "in"); ); if ((res & HDA_CMD_GET_PIN_SENSE_ELD_VALID) == 0) return; res = hda_command(devinfo->dev, HDA_CMD_GET_HDMI_DIP_SIZE(0, w->nid, 0x08)); if (res == HDA_INVALID) return; w->eld_len = res & 0xff; if (w->eld_len != 0) w->eld = malloc(w->eld_len, M_HDAA, M_ZERO | M_NOWAIT); if (w->eld == NULL) { w->eld_len = 0; return; } for (i = 0; i < w->eld_len; i++) { res = hda_command(devinfo->dev, HDA_CMD_GET_HDMI_ELDD(0, w->nid, i)); if (res & 0x80000000) w->eld[i] = res & 0xff; } HDA_BOOTVERBOSE( hdaa_eld_dump(w); ); hdaa_channels_handler(&devinfo->as[w->bindas]); } /* * Pin sense initializer. */ static void hdaa_sense_init(struct hdaa_devinfo *devinfo) { struct hdaa_audio_as *as; struct hdaa_widget *w; int i, poll = 0; for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0 || w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) continue; if (HDA_PARAM_AUDIO_WIDGET_CAP_UNSOL_CAP(w->param.widget_cap)) { if (w->unsol < 0) w->unsol = HDAC_UNSOL_ALLOC( device_get_parent(devinfo->dev), devinfo->dev, w->nid); hda_command(devinfo->dev, HDA_CMD_SET_UNSOLICITED_RESPONSE(0, w->nid, HDA_CMD_SET_UNSOLICITED_RESPONSE_ENABLE | w->unsol)); } as = &devinfo->as[w->bindas]; if (as->hpredir >= 0 && as->pins[15] == w->nid) { if (HDA_PARAM_PIN_CAP_PRESENCE_DETECT_CAP(w->wclass.pin.cap) == 0 || (HDA_CONFIG_DEFAULTCONF_MISC(w->wclass.pin.config) & 1) != 0) { device_printf(devinfo->dev, "No presence detection support at nid %d\n", w->nid); } else { if (w->unsol < 0) poll = 1; HDA_BOOTVERBOSE( device_printf(devinfo->dev, "Headphones redirection for " "association %d nid=%d using %s.\n", w->bindas, w->nid, (w->unsol < 0) ? "polling" : "unsolicited responses"); ); } } hdaa_presence_handler(w); if (!HDA_PARAM_PIN_CAP_DP(w->wclass.pin.cap) && !HDA_PARAM_PIN_CAP_HDMI(w->wclass.pin.cap)) continue; hdaa_eld_handler(w); } if (poll) { callout_reset(&devinfo->poll_jack, 1, hdaa_jack_poll_callback, devinfo); } } static void hdaa_sense_deinit(struct hdaa_devinfo *devinfo) { struct hdaa_widget *w; int i; callout_stop(&devinfo->poll_jack); for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0 || w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) continue; if (w->unsol < 0) continue; hda_command(devinfo->dev, HDA_CMD_SET_UNSOLICITED_RESPONSE(0, w->nid, 0)); HDAC_UNSOL_FREE( device_get_parent(devinfo->dev), devinfo->dev, w->unsol); w->unsol = -1; } } uint32_t hdaa_widget_pin_patch(uint32_t config, const char *str) { char buf[256]; char *key, *value, *rest, *bad; int ival, i; strlcpy(buf, str, sizeof(buf)); rest = buf; while ((key = strsep(&rest, "=")) != NULL) { value = strsep(&rest, " \t"); if (value == NULL) break; ival = strtol(value, &bad, 10); if (strcmp(key, "seq") == 0) { config &= ~HDA_CONFIG_DEFAULTCONF_SEQUENCE_MASK; config |= ((ival << HDA_CONFIG_DEFAULTCONF_SEQUENCE_SHIFT) & HDA_CONFIG_DEFAULTCONF_SEQUENCE_MASK); } else if (strcmp(key, "as") == 0) { config &= ~HDA_CONFIG_DEFAULTCONF_ASSOCIATION_MASK; config |= ((ival << HDA_CONFIG_DEFAULTCONF_ASSOCIATION_SHIFT) & HDA_CONFIG_DEFAULTCONF_ASSOCIATION_MASK); } else if (strcmp(key, "misc") == 0) { config &= ~HDA_CONFIG_DEFAULTCONF_MISC_MASK; config |= ((ival << HDA_CONFIG_DEFAULTCONF_MISC_SHIFT) & HDA_CONFIG_DEFAULTCONF_MISC_MASK); } else if (strcmp(key, "color") == 0) { config &= ~HDA_CONFIG_DEFAULTCONF_COLOR_MASK; if (bad[0] == 0) { config |= ((ival << HDA_CONFIG_DEFAULTCONF_COLOR_SHIFT) & HDA_CONFIG_DEFAULTCONF_COLOR_MASK); } for (i = 0; i < 16; i++) { if (strcasecmp(HDA_COLORS[i], value) == 0) { config |= (i << HDA_CONFIG_DEFAULTCONF_COLOR_SHIFT); break; } } } else if (strcmp(key, "ctype") == 0) { config &= ~HDA_CONFIG_DEFAULTCONF_CONNECTION_TYPE_MASK; if (bad[0] == 0) { config |= ((ival << HDA_CONFIG_DEFAULTCONF_CONNECTION_TYPE_SHIFT) & HDA_CONFIG_DEFAULTCONF_CONNECTION_TYPE_MASK); } for (i = 0; i < 16; i++) { if (strcasecmp(HDA_CONNECTORS[i], value) == 0) { config |= (i << HDA_CONFIG_DEFAULTCONF_CONNECTION_TYPE_SHIFT); break; } } } else if (strcmp(key, "device") == 0) { config &= ~HDA_CONFIG_DEFAULTCONF_DEVICE_MASK; if (bad[0] == 0) { config |= ((ival << HDA_CONFIG_DEFAULTCONF_DEVICE_SHIFT) & HDA_CONFIG_DEFAULTCONF_DEVICE_MASK); continue; } for (i = 0; i < 16; i++) { if (strcasecmp(HDA_DEVS[i], value) == 0) { config |= (i << HDA_CONFIG_DEFAULTCONF_DEVICE_SHIFT); break; } } } else if (strcmp(key, "loc") == 0) { config &= ~HDA_CONFIG_DEFAULTCONF_LOCATION_MASK; if (bad[0] == 0) { config |= ((ival << HDA_CONFIG_DEFAULTCONF_LOCATION_SHIFT) & HDA_CONFIG_DEFAULTCONF_LOCATION_MASK); continue; } for (i = 0; i < 64; i++) { if (strcasecmp(HDA_LOCS[i], value) == 0) { config |= (i << HDA_CONFIG_DEFAULTCONF_LOCATION_SHIFT); break; } } } else if (strcmp(key, "conn") == 0) { config &= ~HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_MASK; if (bad[0] == 0) { config |= ((ival << HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_SHIFT) & HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_MASK); continue; } for (i = 0; i < 4; i++) { if (strcasecmp(HDA_CONNS[i], value) == 0) { config |= (i << HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_SHIFT); break; } } } } return (config); } uint32_t hdaa_gpio_patch(uint32_t gpio, const char *str) { char buf[256]; char *key, *value, *rest; int ikey, i; strlcpy(buf, str, sizeof(buf)); rest = buf; while ((key = strsep(&rest, "=")) != NULL) { value = strsep(&rest, " \t"); if (value == NULL) break; ikey = strtol(key, NULL, 10); if (ikey < 0 || ikey > 7) continue; for (i = 0; i < 7; i++) { if (strcasecmp(HDA_GPIO_ACTIONS[i], value) == 0) { gpio &= ~HDAA_GPIO_MASK(ikey); gpio |= i << HDAA_GPIO_SHIFT(ikey); break; } } } return (gpio); } static void hdaa_local_patch_pin(struct hdaa_widget *w) { device_t dev = w->devinfo->dev; const char *res = NULL; uint32_t config, orig; char buf[32]; config = orig = w->wclass.pin.config; snprintf(buf, sizeof(buf), "cad%u.nid%u.config", hda_get_codec_id(dev), w->nid); if (resource_string_value(device_get_name( device_get_parent(device_get_parent(dev))), device_get_unit(device_get_parent(device_get_parent(dev))), buf, &res) == 0) { if (strncmp(res, "0x", 2) == 0) { config = strtol(res + 2, NULL, 16); } else { config = hdaa_widget_pin_patch(config, res); } } snprintf(buf, sizeof(buf), "nid%u.config", w->nid); if (resource_string_value(device_get_name(dev), device_get_unit(dev), buf, &res) == 0) { if (strncmp(res, "0x", 2) == 0) { config = strtol(res + 2, NULL, 16); } else { config = hdaa_widget_pin_patch(config, res); } } HDA_BOOTVERBOSE( if (config != orig) device_printf(w->devinfo->dev, "Patching pin config nid=%u 0x%08x -> 0x%08x\n", w->nid, orig, config); ); w->wclass.pin.newconf = w->wclass.pin.config = config; } static void hdaa_dump_audio_formats_sb(struct sbuf *sb, uint32_t fcap, uint32_t pcmcap) { uint32_t cap; cap = fcap; if (cap != 0) { sbuf_printf(sb, " Stream cap: 0x%08x", cap); if (HDA_PARAM_SUPP_STREAM_FORMATS_AC3(cap)) sbuf_printf(sb, " AC3"); if (HDA_PARAM_SUPP_STREAM_FORMATS_FLOAT32(cap)) sbuf_printf(sb, " FLOAT32"); if (HDA_PARAM_SUPP_STREAM_FORMATS_PCM(cap)) sbuf_printf(sb, " PCM"); sbuf_printf(sb, "\n"); } cap = pcmcap; if (cap != 0) { sbuf_printf(sb, " PCM cap: 0x%08x", cap); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_8BIT(cap)) sbuf_printf(sb, " 8"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_16BIT(cap)) sbuf_printf(sb, " 16"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_20BIT(cap)) sbuf_printf(sb, " 20"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_24BIT(cap)) sbuf_printf(sb, " 24"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_32BIT(cap)) sbuf_printf(sb, " 32"); sbuf_printf(sb, " bits,"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_8KHZ(cap)) sbuf_printf(sb, " 8"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_11KHZ(cap)) sbuf_printf(sb, " 11"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_16KHZ(cap)) sbuf_printf(sb, " 16"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_22KHZ(cap)) sbuf_printf(sb, " 22"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_32KHZ(cap)) sbuf_printf(sb, " 32"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_44KHZ(cap)) sbuf_printf(sb, " 44"); sbuf_printf(sb, " 48"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_88KHZ(cap)) sbuf_printf(sb, " 88"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_96KHZ(cap)) sbuf_printf(sb, " 96"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_176KHZ(cap)) sbuf_printf(sb, " 176"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_192KHZ(cap)) sbuf_printf(sb, " 192"); sbuf_printf(sb, " KHz\n"); } } static void hdaa_dump_pin_sb(struct sbuf *sb, struct hdaa_widget *w) { uint32_t pincap, conf; pincap = w->wclass.pin.cap; sbuf_printf(sb, " Pin cap: 0x%08x", pincap); if (HDA_PARAM_PIN_CAP_IMP_SENSE_CAP(pincap)) sbuf_printf(sb, " ISC"); if (HDA_PARAM_PIN_CAP_TRIGGER_REQD(pincap)) sbuf_printf(sb, " TRQD"); if (HDA_PARAM_PIN_CAP_PRESENCE_DETECT_CAP(pincap)) sbuf_printf(sb, " PDC"); if (HDA_PARAM_PIN_CAP_HEADPHONE_CAP(pincap)) sbuf_printf(sb, " HP"); if (HDA_PARAM_PIN_CAP_OUTPUT_CAP(pincap)) sbuf_printf(sb, " OUT"); if (HDA_PARAM_PIN_CAP_INPUT_CAP(pincap)) sbuf_printf(sb, " IN"); if (HDA_PARAM_PIN_CAP_BALANCED_IO_PINS(pincap)) sbuf_printf(sb, " BAL"); if (HDA_PARAM_PIN_CAP_HDMI(pincap)) sbuf_printf(sb, " HDMI"); if (HDA_PARAM_PIN_CAP_VREF_CTRL(pincap)) { sbuf_printf(sb, " VREF["); if (HDA_PARAM_PIN_CAP_VREF_CTRL_50(pincap)) sbuf_printf(sb, " 50"); if (HDA_PARAM_PIN_CAP_VREF_CTRL_80(pincap)) sbuf_printf(sb, " 80"); if (HDA_PARAM_PIN_CAP_VREF_CTRL_100(pincap)) sbuf_printf(sb, " 100"); if (HDA_PARAM_PIN_CAP_VREF_CTRL_GROUND(pincap)) sbuf_printf(sb, " GROUND"); if (HDA_PARAM_PIN_CAP_VREF_CTRL_HIZ(pincap)) sbuf_printf(sb, " HIZ"); sbuf_printf(sb, " ]"); } if (HDA_PARAM_PIN_CAP_EAPD_CAP(pincap)) sbuf_printf(sb, " EAPD"); if (HDA_PARAM_PIN_CAP_DP(pincap)) sbuf_printf(sb, " DP"); if (HDA_PARAM_PIN_CAP_HBR(pincap)) sbuf_printf(sb, " HBR"); sbuf_printf(sb, "\n"); conf = w->wclass.pin.config; sbuf_printf(sb, " Pin config: 0x%08x", conf); sbuf_printf(sb, " as=%d seq=%d " "device=%s conn=%s ctype=%s loc=%s color=%s misc=%d\n", HDA_CONFIG_DEFAULTCONF_ASSOCIATION(conf), HDA_CONFIG_DEFAULTCONF_SEQUENCE(conf), HDA_DEVS[HDA_CONFIG_DEFAULTCONF_DEVICE(conf)], HDA_CONNS[HDA_CONFIG_DEFAULTCONF_CONNECTIVITY(conf)], HDA_CONNECTORS[HDA_CONFIG_DEFAULTCONF_CONNECTION_TYPE(conf)], HDA_LOCS[HDA_CONFIG_DEFAULTCONF_LOCATION(conf)], HDA_COLORS[HDA_CONFIG_DEFAULTCONF_COLOR(conf)], HDA_CONFIG_DEFAULTCONF_MISC(conf)); sbuf_printf(sb, " Pin control: 0x%08x", w->wclass.pin.ctrl); if (w->wclass.pin.ctrl & HDA_CMD_SET_PIN_WIDGET_CTRL_HPHN_ENABLE) sbuf_printf(sb, " HP"); if (w->wclass.pin.ctrl & HDA_CMD_SET_PIN_WIDGET_CTRL_IN_ENABLE) sbuf_printf(sb, " IN"); if (w->wclass.pin.ctrl & HDA_CMD_SET_PIN_WIDGET_CTRL_OUT_ENABLE) sbuf_printf(sb, " OUT"); if (HDA_PARAM_AUDIO_WIDGET_CAP_DIGITAL(w->param.widget_cap)) { if ((w->wclass.pin.ctrl & HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE_MASK) == 0x03) sbuf_printf(sb, " HBR"); else if ((w->wclass.pin.ctrl & HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE_MASK) != 0) sbuf_printf(sb, " EPTs"); } else { if ((w->wclass.pin.ctrl & HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE_MASK) != 0) sbuf_printf(sb, " VREFs"); } sbuf_printf(sb, "\n"); } static void hdaa_dump_amp_sb(struct sbuf *sb, uint32_t cap, const char *banner) { int offset, size, step; offset = HDA_PARAM_OUTPUT_AMP_CAP_OFFSET(cap); size = HDA_PARAM_OUTPUT_AMP_CAP_STEPSIZE(cap); step = HDA_PARAM_OUTPUT_AMP_CAP_NUMSTEPS(cap); sbuf_printf(sb, " %s amp: 0x%08x " "mute=%d step=%d size=%d offset=%d (%+d/%+ddB)\n", banner, cap, HDA_PARAM_OUTPUT_AMP_CAP_MUTE_CAP(cap), step, size, offset, ((0 - offset) * (size + 1)) / 4, ((step - offset) * (size + 1)) / 4); } static int hdaa_sysctl_caps(SYSCTL_HANDLER_ARGS) { struct hdaa_devinfo *devinfo; struct hdaa_widget *w, *cw; struct sbuf sb; char buf[64]; int error, j; w = (struct hdaa_widget *)oidp->oid_arg1; devinfo = w->devinfo; sbuf_new_for_sysctl(&sb, NULL, 256, req); sbuf_printf(&sb, "%s%s\n", w->name, (w->enable == 0) ? " [DISABLED]" : ""); sbuf_printf(&sb, " Widget cap: 0x%08x", w->param.widget_cap); if (w->param.widget_cap & 0x0ee1) { if (HDA_PARAM_AUDIO_WIDGET_CAP_LR_SWAP(w->param.widget_cap)) sbuf_printf(&sb, " LRSWAP"); if (HDA_PARAM_AUDIO_WIDGET_CAP_POWER_CTRL(w->param.widget_cap)) sbuf_printf(&sb, " PWR"); if (HDA_PARAM_AUDIO_WIDGET_CAP_DIGITAL(w->param.widget_cap)) sbuf_printf(&sb, " DIGITAL"); if (HDA_PARAM_AUDIO_WIDGET_CAP_UNSOL_CAP(w->param.widget_cap)) sbuf_printf(&sb, " UNSOL"); if (HDA_PARAM_AUDIO_WIDGET_CAP_PROC_WIDGET(w->param.widget_cap)) sbuf_printf(&sb, " PROC"); if (HDA_PARAM_AUDIO_WIDGET_CAP_STRIPE(w->param.widget_cap)) sbuf_printf(&sb, " STRIPE(x%d)", 1 << (fls(w->wclass.conv.stripecap) - 1)); j = HDA_PARAM_AUDIO_WIDGET_CAP_CC(w->param.widget_cap); if (j == 1) sbuf_printf(&sb, " STEREO"); else if (j > 1) sbuf_printf(&sb, " %dCH", j + 1); } sbuf_printf(&sb, "\n"); if (w->bindas != -1) { sbuf_printf(&sb, " Association: %d (0x%04x)\n", w->bindas, w->bindseqmask); } if (w->ossmask != 0 || w->ossdev >= 0) { sbuf_printf(&sb, " OSS: %s", hdaa_audio_ctl_ossmixer_mask2allname(w->ossmask, buf, sizeof(buf))); if (w->ossdev >= 0) sbuf_printf(&sb, " (%s)", ossnames[w->ossdev]); sbuf_printf(&sb, "\n"); } if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_OUTPUT || w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_INPUT) { hdaa_dump_audio_formats_sb(&sb, w->param.supp_stream_formats, w->param.supp_pcm_size_rate); } else if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX || w->waspin) hdaa_dump_pin_sb(&sb, w); if (w->param.eapdbtl != HDA_INVALID) { sbuf_printf(&sb, " EAPD: 0x%08x%s%s%s\n", w->param.eapdbtl, (w->param.eapdbtl & HDA_CMD_SET_EAPD_BTL_ENABLE_LR_SWAP) ? " LRSWAP" : "", (w->param.eapdbtl & HDA_CMD_SET_EAPD_BTL_ENABLE_EAPD) ? " EAPD" : "", (w->param.eapdbtl & HDA_CMD_SET_EAPD_BTL_ENABLE_BTL) ? " BTL" : ""); } if (HDA_PARAM_AUDIO_WIDGET_CAP_OUT_AMP(w->param.widget_cap) && w->param.outamp_cap != 0) hdaa_dump_amp_sb(&sb, w->param.outamp_cap, "Output"); if (HDA_PARAM_AUDIO_WIDGET_CAP_IN_AMP(w->param.widget_cap) && w->param.inamp_cap != 0) hdaa_dump_amp_sb(&sb, w->param.inamp_cap, " Input"); if (w->nconns > 0) sbuf_printf(&sb, " Connections: %d\n", w->nconns); for (j = 0; j < w->nconns; j++) { cw = hdaa_widget_get(devinfo, w->conns[j]); sbuf_printf(&sb, " + %s<- nid=%d [%s]", (w->connsenable[j] == 0)?"[DISABLED] ":"", w->conns[j], (cw == NULL) ? "GHOST!" : cw->name); if (cw == NULL) sbuf_printf(&sb, " [UNKNOWN]"); else if (cw->enable == 0) sbuf_printf(&sb, " [DISABLED]"); if (w->nconns > 1 && w->selconn == j && w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_MIXER) sbuf_printf(&sb, " (selected)"); sbuf_printf(&sb, "\n"); } error = sbuf_finish(&sb); sbuf_delete(&sb); return (error); } static int hdaa_sysctl_config(SYSCTL_HANDLER_ARGS) { char buf[256]; int error; uint32_t conf; conf = *(uint32_t *)oidp->oid_arg1; snprintf(buf, sizeof(buf), "0x%08x as=%d seq=%d " "device=%s conn=%s ctype=%s loc=%s color=%s misc=%d", conf, HDA_CONFIG_DEFAULTCONF_ASSOCIATION(conf), HDA_CONFIG_DEFAULTCONF_SEQUENCE(conf), HDA_DEVS[HDA_CONFIG_DEFAULTCONF_DEVICE(conf)], HDA_CONNS[HDA_CONFIG_DEFAULTCONF_CONNECTIVITY(conf)], HDA_CONNECTORS[HDA_CONFIG_DEFAULTCONF_CONNECTION_TYPE(conf)], HDA_LOCS[HDA_CONFIG_DEFAULTCONF_LOCATION(conf)], HDA_COLORS[HDA_CONFIG_DEFAULTCONF_COLOR(conf)], HDA_CONFIG_DEFAULTCONF_MISC(conf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); if (strncmp(buf, "0x", 2) == 0) conf = strtol(buf + 2, NULL, 16); else conf = hdaa_widget_pin_patch(conf, buf); *(uint32_t *)oidp->oid_arg1 = conf; return (0); } static void hdaa_config_fetch(const char *str, uint32_t *on, uint32_t *off) { int i = 0, j, k, len, inv; for (;;) { while (str[i] != '\0' && (str[i] == ',' || isspace(str[i]) != 0)) i++; if (str[i] == '\0') return; j = i; while (str[j] != '\0' && !(str[j] == ',' || isspace(str[j]) != 0)) j++; len = j - i; if (len > 2 && strncmp(str + i, "no", 2) == 0) inv = 2; else inv = 0; for (k = 0; len > inv && k < nitems(hdaa_quirks_tab); k++) { if (strncmp(str + i + inv, hdaa_quirks_tab[k].key, len - inv) != 0) continue; if (len - inv != strlen(hdaa_quirks_tab[k].key)) continue; if (inv == 0) { *on |= hdaa_quirks_tab[k].value; *off &= ~hdaa_quirks_tab[k].value; } else { *off |= hdaa_quirks_tab[k].value; *on &= ~hdaa_quirks_tab[k].value; } break; } i = j; } } static int hdaa_sysctl_quirks(SYSCTL_HANDLER_ARGS) { char buf[256]; int error, n = 0, i; uint32_t quirks, quirks_off; quirks = *(uint32_t *)oidp->oid_arg1; buf[0] = 0; for (i = 0; i < nitems(hdaa_quirks_tab); i++) { if ((quirks & hdaa_quirks_tab[i].value) != 0) n += snprintf(buf + n, sizeof(buf) - n, "%s%s", n != 0 ? "," : "", hdaa_quirks_tab[i].key); } error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); if (strncmp(buf, "0x", 2) == 0) quirks = strtol(buf + 2, NULL, 16); else { quirks = 0; hdaa_config_fetch(buf, &quirks, &quirks_off); } *(uint32_t *)oidp->oid_arg1 = quirks; return (0); } static void hdaa_local_patch(struct hdaa_devinfo *devinfo) { struct hdaa_widget *w; const char *res = NULL; uint32_t quirks_on = 0, quirks_off = 0, x; int i; for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL) continue; if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) hdaa_local_patch_pin(w); } if (resource_string_value(device_get_name(devinfo->dev), device_get_unit(devinfo->dev), "config", &res) == 0) { if (res != NULL && strlen(res) > 0) hdaa_config_fetch(res, &quirks_on, &quirks_off); devinfo->quirks |= quirks_on; devinfo->quirks &= ~quirks_off; } if (devinfo->newquirks == -1) devinfo->newquirks = devinfo->quirks; else devinfo->quirks = devinfo->newquirks; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, "Config options: 0x%08x\n", devinfo->quirks); ); if (resource_string_value(device_get_name(devinfo->dev), device_get_unit(devinfo->dev), "gpio_config", &res) == 0) { if (strncmp(res, "0x", 2) == 0) { devinfo->gpio = strtol(res + 2, NULL, 16); } else { devinfo->gpio = hdaa_gpio_patch(devinfo->gpio, res); } } if (devinfo->newgpio == -1) devinfo->newgpio = devinfo->gpio; else devinfo->gpio = devinfo->newgpio; if (devinfo->newgpo == -1) devinfo->newgpo = devinfo->gpo; else devinfo->gpo = devinfo->newgpo; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, "GPIO config options:"); for (i = 0; i < 7; i++) { x = (devinfo->gpio & HDAA_GPIO_MASK(i)) >> HDAA_GPIO_SHIFT(i); if (x != 0) printf(" %d=%s", i, HDA_GPIO_ACTIONS[x]); } printf("\n"); ); } static void hdaa_widget_connection_parse(struct hdaa_widget *w) { uint32_t res; int i, j, max, ents, entnum; nid_t nid = w->nid; nid_t cnid, addcnid, prevcnid; w->nconns = 0; res = hda_command(w->devinfo->dev, HDA_CMD_GET_PARAMETER(0, nid, HDA_PARAM_CONN_LIST_LENGTH)); ents = HDA_PARAM_CONN_LIST_LENGTH_LIST_LENGTH(res); if (ents < 1) return; entnum = HDA_PARAM_CONN_LIST_LENGTH_LONG_FORM(res) ? 2 : 4; max = (sizeof(w->conns) / sizeof(w->conns[0])) - 1; prevcnid = 0; #define CONN_RMASK(e) (1 << ((32 / (e)) - 1)) #define CONN_NMASK(e) (CONN_RMASK(e) - 1) #define CONN_RESVAL(r, e, n) ((r) >> ((32 / (e)) * (n))) #define CONN_RANGE(r, e, n) (CONN_RESVAL(r, e, n) & CONN_RMASK(e)) #define CONN_CNID(r, e, n) (CONN_RESVAL(r, e, n) & CONN_NMASK(e)) for (i = 0; i < ents; i += entnum) { res = hda_command(w->devinfo->dev, HDA_CMD_GET_CONN_LIST_ENTRY(0, nid, i)); for (j = 0; j < entnum; j++) { cnid = CONN_CNID(res, entnum, j); if (cnid == 0) { if (w->nconns < ents) device_printf(w->devinfo->dev, "WARNING: nid=%d has zero cnid " "entnum=%d j=%d index=%d " "entries=%d found=%d res=0x%08x\n", nid, entnum, j, i, ents, w->nconns, res); else goto getconns_out; } if (cnid < w->devinfo->startnode || cnid >= w->devinfo->endnode) { HDA_BOOTVERBOSE( device_printf(w->devinfo->dev, "WARNING: nid=%d has cnid outside " "of the AFG range j=%d " "entnum=%d index=%d res=0x%08x\n", nid, j, entnum, i, res); ); } if (CONN_RANGE(res, entnum, j) == 0) addcnid = cnid; else if (prevcnid == 0 || prevcnid >= cnid) { device_printf(w->devinfo->dev, "WARNING: Invalid child range " "nid=%d index=%d j=%d entnum=%d " "prevcnid=%d cnid=%d res=0x%08x\n", nid, i, j, entnum, prevcnid, cnid, res); addcnid = cnid; } else addcnid = prevcnid + 1; while (addcnid <= cnid) { if (w->nconns > max) { device_printf(w->devinfo->dev, "Adding %d (nid=%d): " "Max connection reached! max=%d\n", addcnid, nid, max + 1); goto getconns_out; } w->connsenable[w->nconns] = 1; w->conns[w->nconns++] = addcnid++; } prevcnid = cnid; } } getconns_out: return; } static void hdaa_widget_parse(struct hdaa_widget *w) { device_t dev = w->devinfo->dev; uint32_t wcap, cap; nid_t nid = w->nid; char buf[64]; w->param.widget_cap = wcap = hda_command(dev, HDA_CMD_GET_PARAMETER(0, nid, HDA_PARAM_AUDIO_WIDGET_CAP)); w->type = HDA_PARAM_AUDIO_WIDGET_CAP_TYPE(wcap); hdaa_widget_connection_parse(w); if (HDA_PARAM_AUDIO_WIDGET_CAP_OUT_AMP(wcap)) { if (HDA_PARAM_AUDIO_WIDGET_CAP_AMP_OVR(wcap)) w->param.outamp_cap = hda_command(dev, HDA_CMD_GET_PARAMETER(0, nid, HDA_PARAM_OUTPUT_AMP_CAP)); else w->param.outamp_cap = w->devinfo->outamp_cap; } else w->param.outamp_cap = 0; if (HDA_PARAM_AUDIO_WIDGET_CAP_IN_AMP(wcap)) { if (HDA_PARAM_AUDIO_WIDGET_CAP_AMP_OVR(wcap)) w->param.inamp_cap = hda_command(dev, HDA_CMD_GET_PARAMETER(0, nid, HDA_PARAM_INPUT_AMP_CAP)); else w->param.inamp_cap = w->devinfo->inamp_cap; } else w->param.inamp_cap = 0; if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_OUTPUT || w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_INPUT) { if (HDA_PARAM_AUDIO_WIDGET_CAP_FORMAT_OVR(wcap)) { cap = hda_command(dev, HDA_CMD_GET_PARAMETER(0, nid, HDA_PARAM_SUPP_STREAM_FORMATS)); w->param.supp_stream_formats = (cap != 0) ? cap : w->devinfo->supp_stream_formats; cap = hda_command(dev, HDA_CMD_GET_PARAMETER(0, nid, HDA_PARAM_SUPP_PCM_SIZE_RATE)); w->param.supp_pcm_size_rate = (cap != 0) ? cap : w->devinfo->supp_pcm_size_rate; } else { w->param.supp_stream_formats = w->devinfo->supp_stream_formats; w->param.supp_pcm_size_rate = w->devinfo->supp_pcm_size_rate; } if (HDA_PARAM_AUDIO_WIDGET_CAP_STRIPE(w->param.widget_cap)) { w->wclass.conv.stripecap = hda_command(dev, HDA_CMD_GET_STRIPE_CONTROL(0, w->nid)) >> 20; } else w->wclass.conv.stripecap = 1; } else { w->param.supp_stream_formats = 0; w->param.supp_pcm_size_rate = 0; } if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) { w->wclass.pin.original = w->wclass.pin.newconf = w->wclass.pin.config = hda_command(dev, HDA_CMD_GET_CONFIGURATION_DEFAULT(0, w->nid)); w->wclass.pin.cap = hda_command(dev, HDA_CMD_GET_PARAMETER(0, w->nid, HDA_PARAM_PIN_CAP)); w->wclass.pin.ctrl = hda_command(dev, HDA_CMD_GET_PIN_WIDGET_CTRL(0, nid)); w->wclass.pin.connected = 2; if (HDA_PARAM_PIN_CAP_EAPD_CAP(w->wclass.pin.cap)) { w->param.eapdbtl = hda_command(dev, HDA_CMD_GET_EAPD_BTL_ENABLE(0, nid)); w->param.eapdbtl &= 0x7; w->param.eapdbtl |= HDA_CMD_SET_EAPD_BTL_ENABLE_EAPD; } else w->param.eapdbtl = HDA_INVALID; } w->unsol = -1; hdaa_unlock(w->devinfo); snprintf(buf, sizeof(buf), "nid%d", w->nid); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, buf, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, w, 0, hdaa_sysctl_caps, "A", "Node capabilities"); if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) { snprintf(buf, sizeof(buf), "nid%d_config", w->nid); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, buf, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, &w->wclass.pin.newconf, 0, hdaa_sysctl_config, "A", "Current pin configuration"); snprintf(buf, sizeof(buf), "nid%d_original", w->nid); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, buf, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, &w->wclass.pin.original, 0, hdaa_sysctl_config, "A", "Original pin configuration"); } hdaa_lock(w->devinfo); } static void hdaa_widget_postprocess(struct hdaa_widget *w) { const char *typestr; w->type = HDA_PARAM_AUDIO_WIDGET_CAP_TYPE(w->param.widget_cap); switch (w->type) { case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_OUTPUT: typestr = "audio output"; break; case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_INPUT: typestr = "audio input"; break; case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_MIXER: typestr = "audio mixer"; break; case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_SELECTOR: typestr = "audio selector"; break; case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX: typestr = "pin"; break; case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_POWER_WIDGET: typestr = "power widget"; break; case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_VOLUME_WIDGET: typestr = "volume widget"; break; case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_BEEP_WIDGET: typestr = "beep widget"; break; case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_VENDOR_WIDGET: typestr = "vendor widget"; break; default: typestr = "unknown type"; break; } strlcpy(w->name, typestr, sizeof(w->name)); if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) { uint32_t config; const char *devstr; int conn, color; config = w->wclass.pin.config; devstr = HDA_DEVS[(config & HDA_CONFIG_DEFAULTCONF_DEVICE_MASK) >> HDA_CONFIG_DEFAULTCONF_DEVICE_SHIFT]; conn = (config & HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_MASK) >> HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_SHIFT; color = (config & HDA_CONFIG_DEFAULTCONF_COLOR_MASK) >> HDA_CONFIG_DEFAULTCONF_COLOR_SHIFT; strlcat(w->name, ": ", sizeof(w->name)); strlcat(w->name, devstr, sizeof(w->name)); strlcat(w->name, " (", sizeof(w->name)); if (conn == 0 && color != 0 && color != 15) { strlcat(w->name, HDA_COLORS[color], sizeof(w->name)); strlcat(w->name, " ", sizeof(w->name)); } strlcat(w->name, HDA_CONNS[conn], sizeof(w->name)); strlcat(w->name, ")", sizeof(w->name)); } } struct hdaa_widget * hdaa_widget_get(struct hdaa_devinfo *devinfo, nid_t nid) { if (devinfo == NULL || devinfo->widget == NULL || nid < devinfo->startnode || nid >= devinfo->endnode) return (NULL); return (&devinfo->widget[nid - devinfo->startnode]); } static void hdaa_audio_ctl_amp_set_internal(struct hdaa_devinfo *devinfo, nid_t nid, int index, int lmute, int rmute, int left, int right, int dir) { uint16_t v = 0; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, "Setting amplifier nid=%d index=%d %s mute=%d/%d vol=%d/%d\n", nid,index,dir ? "in" : "out",lmute,rmute,left,right); ); if (left != right || lmute != rmute) { v = (1 << (15 - dir)) | (1 << 13) | (index << 8) | (lmute << 7) | left; hda_command(devinfo->dev, HDA_CMD_SET_AMP_GAIN_MUTE(0, nid, v)); v = (1 << (15 - dir)) | (1 << 12) | (index << 8) | (rmute << 7) | right; } else v = (1 << (15 - dir)) | (3 << 12) | (index << 8) | (lmute << 7) | left; hda_command(devinfo->dev, HDA_CMD_SET_AMP_GAIN_MUTE(0, nid, v)); } static void hdaa_audio_ctl_amp_set(struct hdaa_audio_ctl *ctl, uint32_t mute, int left, int right) { nid_t nid; int lmute, rmute; nid = ctl->widget->nid; /* Save new values if valid. */ if (mute != HDAA_AMP_MUTE_DEFAULT) ctl->muted = mute; if (left != HDAA_AMP_VOL_DEFAULT) ctl->left = left; if (right != HDAA_AMP_VOL_DEFAULT) ctl->right = right; /* Prepare effective values */ if (ctl->forcemute) { lmute = 1; rmute = 1; left = 0; right = 0; } else { lmute = HDAA_AMP_LEFT_MUTED(ctl->muted); rmute = HDAA_AMP_RIGHT_MUTED(ctl->muted); left = ctl->left; right = ctl->right; } /* Apply effective values */ if (ctl->dir & HDAA_CTL_OUT) hdaa_audio_ctl_amp_set_internal(ctl->widget->devinfo, nid, ctl->index, lmute, rmute, left, right, 0); if (ctl->dir & HDAA_CTL_IN) hdaa_audio_ctl_amp_set_internal(ctl->widget->devinfo, nid, ctl->index, lmute, rmute, left, right, 1); } static void hdaa_widget_connection_select(struct hdaa_widget *w, uint8_t index) { if (w == NULL || w->nconns < 1 || index > (w->nconns - 1)) return; HDA_BOOTHVERBOSE( device_printf(w->devinfo->dev, "Setting selector nid=%d index=%d\n", w->nid, index); ); hda_command(w->devinfo->dev, HDA_CMD_SET_CONNECTION_SELECT_CONTROL(0, w->nid, index)); w->selconn = index; } /**************************************************************************** * Device Methods ****************************************************************************/ static void * hdaa_channel_init(kobj_t obj, void *data, struct snd_dbuf *b, struct pcm_channel *c, int dir) { struct hdaa_chan *ch = data; struct hdaa_pcm_devinfo *pdevinfo = ch->pdevinfo; struct hdaa_devinfo *devinfo = pdevinfo->devinfo; hdaa_lock(devinfo); if (devinfo->quirks & HDAA_QUIRK_FIXEDRATE) { ch->caps.minspeed = ch->caps.maxspeed = 48000; ch->pcmrates[0] = 48000; ch->pcmrates[1] = 0; } ch->dir = dir; ch->b = b; ch->c = c; ch->blksz = pdevinfo->chan_size / pdevinfo->chan_blkcnt; ch->blkcnt = pdevinfo->chan_blkcnt; hdaa_unlock(devinfo); if (sndbuf_alloc(ch->b, bus_get_dma_tag(devinfo->dev), hda_get_dma_nocache(devinfo->dev) ? BUS_DMA_NOCACHE : 0, pdevinfo->chan_size) != 0) return (NULL); return (ch); } static int hdaa_channel_setformat(kobj_t obj, void *data, uint32_t format) { struct hdaa_chan *ch = data; int i; for (i = 0; ch->caps.fmtlist[i] != 0; i++) { if (format == ch->caps.fmtlist[i]) { ch->fmt = format; return (0); } } return (EINVAL); } static uint32_t hdaa_channel_setspeed(kobj_t obj, void *data, uint32_t speed) { struct hdaa_chan *ch = data; uint32_t spd = 0, threshold; int i; /* First look for equal or multiple frequency. */ for (i = 0; ch->pcmrates[i] != 0; i++) { spd = ch->pcmrates[i]; if (speed != 0 && spd / speed * speed == spd) { ch->spd = spd; return (spd); } } /* If no match, just find nearest. */ for (i = 0; ch->pcmrates[i] != 0; i++) { spd = ch->pcmrates[i]; threshold = spd + ((ch->pcmrates[i + 1] != 0) ? ((ch->pcmrates[i + 1] - spd) >> 1) : 0); if (speed < threshold) break; } ch->spd = spd; return (spd); } static uint16_t hdaa_stream_format(struct hdaa_chan *ch) { int i; uint16_t fmt; fmt = 0; if (ch->fmt & AFMT_S16_LE) fmt |= ch->bit16 << 4; else if (ch->fmt & AFMT_S32_LE) fmt |= ch->bit32 << 4; else fmt |= 1 << 4; for (i = 0; i < HDA_RATE_TAB_LEN; i++) { if (hda_rate_tab[i].valid && ch->spd == hda_rate_tab[i].rate) { fmt |= hda_rate_tab[i].base; fmt |= hda_rate_tab[i].mul; fmt |= hda_rate_tab[i].div; break; } } fmt |= (AFMT_CHANNEL(ch->fmt) - 1); return (fmt); } static int hdaa_allowed_stripes(uint16_t fmt) { static const int bits[8] = { 8, 16, 20, 24, 32, 32, 32, 32 }; int size; size = bits[(fmt >> 4) & 0x03]; size *= (fmt & 0x0f) + 1; size *= ((fmt >> 11) & 0x07) + 1; return (0xffffffffU >> (32 - fls(size / 8))); } static void hdaa_audio_setup(struct hdaa_chan *ch) { struct hdaa_audio_as *as = &ch->devinfo->as[ch->as]; struct hdaa_widget *w, *wp; int i, j, k, chn, cchn, totalchn, totalextchn, c; uint16_t fmt, dfmt; /* Mapping channel pairs to codec pins/converters. */ const static uint16_t convmap[2][5] = /* 1.0 2.0 4.0 5.1 7.1 */ {{ 0x0010, 0x0001, 0x0201, 0x0231, 0x4231 }, /* no dup. */ { 0x0010, 0x0001, 0x2201, 0x2231, 0x4231 }}; /* side dup. */ /* Mapping formats to HDMI channel allocations. */ const static uint8_t hdmica[2][8] = /* 1 2 3 4 5 6 7 8 */ {{ 0x02, 0x00, 0x04, 0x08, 0x0a, 0x0e, 0x12, 0x12 }, /* x.0 */ { 0x01, 0x03, 0x01, 0x03, 0x09, 0x0b, 0x0f, 0x13 }}; /* x.1 */ /* Mapping formats to HDMI channels order. */ const static uint32_t hdmich[2][8] = /* 1 / 5 2 / 6 3 / 7 4 / 8 */ {{ 0xFFFF0F00, 0xFFFFFF10, 0xFFF2FF10, 0xFF32FF10, 0xFF324F10, 0xF5324F10, 0x54326F10, 0x54326F10 }, /* x.0 */ { 0xFFFFF000, 0xFFFF0100, 0xFFFFF210, 0xFFFF2310, 0xFF32F410, 0xFF324510, 0xF6324510, 0x76325410 }}; /* x.1 */ int convmapid = -1; nid_t nid; uint8_t csum; totalchn = AFMT_CHANNEL(ch->fmt); totalextchn = AFMT_EXTCHANNEL(ch->fmt); HDA_BOOTHVERBOSE( device_printf(ch->pdevinfo->dev, "PCMDIR_%s: Stream setup fmt=%08x (%d.%d) speed=%d\n", (ch->dir == PCMDIR_PLAY) ? "PLAY" : "REC", ch->fmt, totalchn - totalextchn, totalextchn, ch->spd); ); fmt = hdaa_stream_format(ch); /* Set channels to I/O converters mapping for known speaker setups. */ if ((as->pinset == 0x0007 || as->pinset == 0x0013) || /* Standard 5.1 */ (as->pinset == 0x0017)) /* Standard 7.1 */ convmapid = (ch->dir == PCMDIR_PLAY); dfmt = HDA_CMD_SET_DIGITAL_CONV_FMT1_DIGEN; if (ch->fmt & AFMT_AC3) dfmt |= HDA_CMD_SET_DIGITAL_CONV_FMT1_NAUDIO; chn = 0; for (i = 0; ch->io[i] != -1; i++) { w = hdaa_widget_get(ch->devinfo, ch->io[i]); if (w == NULL) continue; /* If HP redirection is enabled, but failed to use same DAC, make last DAC to duplicate first one. */ if (as->fakeredir && i == (as->pincnt - 1)) { c = (ch->sid << 4); } else { /* Map channels to I/O converters, if set. */ if (convmapid >= 0) chn = (((convmap[convmapid][totalchn / 2] >> i * 4) & 0xf) - 1) * 2; if (chn < 0 || chn >= totalchn) { c = 0; } else { c = (ch->sid << 4) | chn; } } hda_command(ch->devinfo->dev, HDA_CMD_SET_CONV_FMT(0, ch->io[i], fmt)); if (HDA_PARAM_AUDIO_WIDGET_CAP_DIGITAL(w->param.widget_cap)) { hda_command(ch->devinfo->dev, HDA_CMD_SET_DIGITAL_CONV_FMT1(0, ch->io[i], dfmt)); } hda_command(ch->devinfo->dev, HDA_CMD_SET_CONV_STREAM_CHAN(0, ch->io[i], c)); if (HDA_PARAM_AUDIO_WIDGET_CAP_STRIPE(w->param.widget_cap)) { hda_command(ch->devinfo->dev, HDA_CMD_SET_STRIPE_CONTROL(0, w->nid, ch->stripectl)); } cchn = HDA_PARAM_AUDIO_WIDGET_CAP_CC(w->param.widget_cap); if (cchn > 1 && chn < totalchn) { cchn = min(cchn, totalchn - chn - 1); hda_command(ch->devinfo->dev, HDA_CMD_SET_CONV_CHAN_COUNT(0, ch->io[i], cchn)); } HDA_BOOTHVERBOSE( device_printf(ch->pdevinfo->dev, "PCMDIR_%s: Stream setup nid=%d: " "fmt=0x%04x, dfmt=0x%04x, chan=0x%04x, " "chan_count=0x%02x, stripe=%d\n", (ch->dir == PCMDIR_PLAY) ? "PLAY" : "REC", ch->io[i], fmt, dfmt, c, cchn, ch->stripectl); ); for (j = 0; j < 16; j++) { if (as->dacs[ch->asindex][j] != ch->io[i]) continue; nid = as->pins[j]; wp = hdaa_widget_get(ch->devinfo, nid); if (wp == NULL) continue; if (!HDA_PARAM_PIN_CAP_DP(wp->wclass.pin.cap) && !HDA_PARAM_PIN_CAP_HDMI(wp->wclass.pin.cap)) continue; /* Set channel mapping. */ for (k = 0; k < 8; k++) { hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_CHAN_SLOT(0, nid, (((hdmich[totalextchn == 0 ? 0 : 1][totalchn - 1] >> (k * 4)) & 0xf) << 4) | k)); } /* * Enable High Bit Rate (HBR) Encoded Packet Type * (EPT), if supported and needed (8ch data). */ if (HDA_PARAM_PIN_CAP_HDMI(wp->wclass.pin.cap) && HDA_PARAM_PIN_CAP_HBR(wp->wclass.pin.cap)) { wp->wclass.pin.ctrl &= ~HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE_MASK; if ((ch->fmt & AFMT_AC3) && (cchn == 7)) wp->wclass.pin.ctrl |= 0x03; hda_command(ch->devinfo->dev, HDA_CMD_SET_PIN_WIDGET_CTRL(0, nid, wp->wclass.pin.ctrl)); } /* Stop audio infoframe transmission. */ hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_DIP_INDEX(0, nid, 0x00)); hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_DIP_XMIT(0, nid, 0x00)); /* Clear audio infoframe buffer. */ hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_DIP_INDEX(0, nid, 0x00)); for (k = 0; k < 32; k++) hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_DIP_DATA(0, nid, 0x00)); /* Write HDMI/DisplayPort audio infoframe. */ hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_DIP_INDEX(0, nid, 0x00)); if (w->eld != NULL && w->eld_len >= 6 && ((w->eld[5] >> 2) & 0x3) == 1) { /* DisplayPort */ hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_DIP_DATA(0, nid, 0x84)); hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_DIP_DATA(0, nid, 0x1b)); hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_DIP_DATA(0, nid, 0x44)); } else { /* HDMI */ hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_DIP_DATA(0, nid, 0x84)); hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_DIP_DATA(0, nid, 0x01)); hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_DIP_DATA(0, nid, 0x0a)); csum = 0; csum -= 0x84 + 0x01 + 0x0a + (totalchn - 1) + hdmica[totalextchn == 0 ? 0 : 1][totalchn - 1]; hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_DIP_DATA(0, nid, csum)); } hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_DIP_DATA(0, nid, totalchn - 1)); hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_DIP_DATA(0, nid, 0x00)); hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_DIP_DATA(0, nid, 0x00)); hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_DIP_DATA(0, nid, hdmica[totalextchn == 0 ? 0 : 1][totalchn - 1])); /* Start audio infoframe transmission. */ hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_DIP_INDEX(0, nid, 0x00)); hda_command(ch->devinfo->dev, HDA_CMD_SET_HDMI_DIP_XMIT(0, nid, 0xc0)); } chn += cchn + 1; } } /* * Greatest Common Divisor. */ static unsigned gcd(unsigned a, unsigned b) { u_int c; while (b != 0) { c = a; a = b; b = (c % b); } return (a); } /* * Least Common Multiple. */ static unsigned lcm(unsigned a, unsigned b) { return ((a * b) / gcd(a, b)); } static int hdaa_channel_setfragments(kobj_t obj, void *data, uint32_t blksz, uint32_t blkcnt) { struct hdaa_chan *ch = data; blksz -= blksz % lcm(HDA_DMA_ALIGNMENT, sndbuf_getalign(ch->b)); if (blksz > (sndbuf_getmaxsize(ch->b) / HDA_BDL_MIN)) blksz = sndbuf_getmaxsize(ch->b) / HDA_BDL_MIN; if (blksz < HDA_BLK_MIN) blksz = HDA_BLK_MIN; if (blkcnt > HDA_BDL_MAX) blkcnt = HDA_BDL_MAX; if (blkcnt < HDA_BDL_MIN) blkcnt = HDA_BDL_MIN; while ((blksz * blkcnt) > sndbuf_getmaxsize(ch->b)) { if ((blkcnt >> 1) >= HDA_BDL_MIN) blkcnt >>= 1; else if ((blksz >> 1) >= HDA_BLK_MIN) blksz >>= 1; else break; } if ((sndbuf_getblksz(ch->b) != blksz || sndbuf_getblkcnt(ch->b) != blkcnt) && sndbuf_resize(ch->b, blkcnt, blksz) != 0) device_printf(ch->devinfo->dev, "%s: failed blksz=%u blkcnt=%u\n", __func__, blksz, blkcnt); ch->blksz = sndbuf_getblksz(ch->b); ch->blkcnt = sndbuf_getblkcnt(ch->b); return (0); } static uint32_t hdaa_channel_setblocksize(kobj_t obj, void *data, uint32_t blksz) { struct hdaa_chan *ch = data; hdaa_channel_setfragments(obj, data, blksz, ch->pdevinfo->chan_blkcnt); return (ch->blksz); } static void hdaa_channel_stop(struct hdaa_chan *ch) { struct hdaa_devinfo *devinfo = ch->devinfo; struct hdaa_widget *w; int i; if ((ch->flags & HDAA_CHN_RUNNING) == 0) return; ch->flags &= ~HDAA_CHN_RUNNING; HDAC_STREAM_STOP(device_get_parent(devinfo->dev), devinfo->dev, ch->dir == PCMDIR_PLAY ? 1 : 0, ch->sid); for (i = 0; ch->io[i] != -1; i++) { w = hdaa_widget_get(ch->devinfo, ch->io[i]); if (w == NULL) continue; if (HDA_PARAM_AUDIO_WIDGET_CAP_DIGITAL(w->param.widget_cap)) { hda_command(devinfo->dev, HDA_CMD_SET_DIGITAL_CONV_FMT1(0, ch->io[i], 0)); } hda_command(devinfo->dev, HDA_CMD_SET_CONV_STREAM_CHAN(0, ch->io[i], 0)); } HDAC_STREAM_FREE(device_get_parent(devinfo->dev), devinfo->dev, ch->dir == PCMDIR_PLAY ? 1 : 0, ch->sid); } static int hdaa_channel_start(struct hdaa_chan *ch) { struct hdaa_devinfo *devinfo = ch->devinfo; uint32_t fmt; fmt = hdaa_stream_format(ch); ch->stripectl = fls(ch->stripecap & hdaa_allowed_stripes(fmt)) - 1; ch->sid = HDAC_STREAM_ALLOC(device_get_parent(devinfo->dev), devinfo->dev, ch->dir == PCMDIR_PLAY ? 1 : 0, fmt, ch->stripectl, &ch->dmapos); if (ch->sid <= 0) return (EBUSY); hdaa_audio_setup(ch); HDAC_STREAM_RESET(device_get_parent(devinfo->dev), devinfo->dev, ch->dir == PCMDIR_PLAY ? 1 : 0, ch->sid); HDAC_STREAM_START(device_get_parent(devinfo->dev), devinfo->dev, ch->dir == PCMDIR_PLAY ? 1 : 0, ch->sid, sndbuf_getbufaddr(ch->b), ch->blksz, ch->blkcnt); ch->flags |= HDAA_CHN_RUNNING; return (0); } static int hdaa_channel_trigger(kobj_t obj, void *data, int go) { struct hdaa_chan *ch = data; int error = 0; if (!PCMTRIG_COMMON(go)) return (0); hdaa_lock(ch->devinfo); switch (go) { case PCMTRIG_START: error = hdaa_channel_start(ch); break; case PCMTRIG_STOP: case PCMTRIG_ABORT: hdaa_channel_stop(ch); break; default: break; } hdaa_unlock(ch->devinfo); return (error); } static uint32_t hdaa_channel_getptr(kobj_t obj, void *data) { struct hdaa_chan *ch = data; struct hdaa_devinfo *devinfo = ch->devinfo; uint32_t ptr; hdaa_lock(devinfo); if (ch->dmapos != NULL) { ptr = *(ch->dmapos); } else { ptr = HDAC_STREAM_GETPTR( device_get_parent(devinfo->dev), devinfo->dev, ch->dir == PCMDIR_PLAY ? 1 : 0, ch->sid); } hdaa_unlock(devinfo); /* * Round to available space and force 128 bytes aligment. */ ptr %= ch->blksz * ch->blkcnt; ptr &= HDA_BLK_ALIGN; return (ptr); } static struct pcmchan_caps * hdaa_channel_getcaps(kobj_t obj, void *data) { return (&((struct hdaa_chan *)data)->caps); } static kobj_method_t hdaa_channel_methods[] = { KOBJMETHOD(channel_init, hdaa_channel_init), KOBJMETHOD(channel_setformat, hdaa_channel_setformat), KOBJMETHOD(channel_setspeed, hdaa_channel_setspeed), KOBJMETHOD(channel_setblocksize, hdaa_channel_setblocksize), KOBJMETHOD(channel_setfragments, hdaa_channel_setfragments), KOBJMETHOD(channel_trigger, hdaa_channel_trigger), KOBJMETHOD(channel_getptr, hdaa_channel_getptr), KOBJMETHOD(channel_getcaps, hdaa_channel_getcaps), KOBJMETHOD_END }; CHANNEL_DECLARE(hdaa_channel); static int hdaa_audio_ctl_ossmixer_init(struct snd_mixer *m) { struct hdaa_pcm_devinfo *pdevinfo = mix_getdevinfo(m); struct hdaa_devinfo *devinfo = pdevinfo->devinfo; struct hdaa_widget *w, *cw; uint32_t mask, recmask; int i, j; hdaa_lock(devinfo); pdevinfo->mixer = m; /* Make sure that in case of soft volume it won't stay muted. */ for (i = 0; i < SOUND_MIXER_NRDEVICES; i++) { pdevinfo->left[i] = 100; pdevinfo->right[i] = 100; } /* Declare volume controls assigned to this association. */ mask = pdevinfo->ossmask; if (pdevinfo->playas >= 0) { /* Declate EAPD as ogain control. */ for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; if (w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX || w->param.eapdbtl == HDA_INVALID || w->bindas != pdevinfo->playas) continue; mask |= SOUND_MASK_OGAIN; break; } /* Declare soft PCM volume if needed. */ if ((mask & SOUND_MASK_PCM) == 0 || (devinfo->quirks & HDAA_QUIRK_SOFTPCMVOL) || pdevinfo->minamp[SOUND_MIXER_PCM] == pdevinfo->maxamp[SOUND_MIXER_PCM]) { mask |= SOUND_MASK_PCM; pcm_setflags(pdevinfo->dev, pcm_getflags(pdevinfo->dev) | SD_F_SOFTPCMVOL); HDA_BOOTHVERBOSE( device_printf(pdevinfo->dev, "Forcing Soft PCM volume\n"); ); } /* Declare master volume if needed. */ if ((mask & SOUND_MASK_VOLUME) == 0) { mask |= SOUND_MASK_VOLUME; mix_setparentchild(m, SOUND_MIXER_VOLUME, SOUND_MASK_PCM); mix_setrealdev(m, SOUND_MIXER_VOLUME, SOUND_MIXER_NONE); HDA_BOOTHVERBOSE( device_printf(pdevinfo->dev, "Forcing master volume with PCM\n"); ); } } /* Declare record sources available to this association. */ recmask = 0; if (pdevinfo->recas >= 0) { for (i = 0; i < 16; i++) { if (devinfo->as[pdevinfo->recas].dacs[0][i] < 0) continue; w = hdaa_widget_get(devinfo, devinfo->as[pdevinfo->recas].dacs[0][i]); if (w == NULL || w->enable == 0) continue; for (j = 0; j < w->nconns; j++) { if (w->connsenable[j] == 0) continue; cw = hdaa_widget_get(devinfo, w->conns[j]); if (cw == NULL || cw->enable == 0) continue; if (cw->bindas != pdevinfo->recas && cw->bindas != -2) continue; recmask |= cw->ossmask; } } } recmask &= (1 << SOUND_MIXER_NRDEVICES) - 1; mask &= (1 << SOUND_MIXER_NRDEVICES) - 1; pdevinfo->ossmask = mask; mix_setrecdevs(m, recmask); mix_setdevs(m, mask); hdaa_unlock(devinfo); return (0); } /* * Update amplification per pdevinfo per ossdev, calculate summary coefficient * and write it to codec, update *left and *right to reflect remaining error. */ static void hdaa_audio_ctl_dev_set(struct hdaa_audio_ctl *ctl, int ossdev, int mute, int *left, int *right) { int i, zleft, zright, sleft, sright, smute, lval, rval; ctl->devleft[ossdev] = *left; ctl->devright[ossdev] = *right; ctl->devmute[ossdev] = mute; smute = sleft = sright = zleft = zright = 0; for (i = 0; i < SOUND_MIXER_NRDEVICES; i++) { sleft += ctl->devleft[i]; sright += ctl->devright[i]; smute |= ctl->devmute[i]; if (i == ossdev) continue; zleft += ctl->devleft[i]; zright += ctl->devright[i]; } lval = QDB2VAL(ctl, sleft); rval = QDB2VAL(ctl, sright); hdaa_audio_ctl_amp_set(ctl, smute, lval, rval); *left -= VAL2QDB(ctl, lval) - VAL2QDB(ctl, QDB2VAL(ctl, zleft)); *right -= VAL2QDB(ctl, rval) - VAL2QDB(ctl, QDB2VAL(ctl, zright)); } /* * Trace signal from source, setting volumes on the way. */ static void hdaa_audio_ctl_source_volume(struct hdaa_pcm_devinfo *pdevinfo, int ossdev, nid_t nid, int index, int mute, int left, int right, int depth) { struct hdaa_devinfo *devinfo = pdevinfo->devinfo; struct hdaa_widget *w, *wc; struct hdaa_audio_ctl *ctl; int i, j, conns = 0; if (depth > HDA_PARSE_MAXDEPTH) return; w = hdaa_widget_get(devinfo, nid); if (w == NULL || w->enable == 0) return; /* Count number of active inputs. */ if (depth > 0) { for (j = 0; j < w->nconns; j++) { if (!w->connsenable[j]) continue; conns++; } } /* If this is not a first step - use input mixer. Pins have common input ctl so care must be taken. */ if (depth > 0 && (conns == 1 || w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX)) { ctl = hdaa_audio_ctl_amp_get(devinfo, w->nid, HDAA_CTL_IN, index, 1); if (ctl) hdaa_audio_ctl_dev_set(ctl, ossdev, mute, &left, &right); } /* If widget has own ossdev - not traverse it. - It will be traversed on it's own. */ + It will be traversed on its own. */ if (w->ossdev >= 0 && depth > 0) return; /* We must not traverse pin */ if ((w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_INPUT || w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) && depth > 0) return; /* * If signals mixed, we can't assign controls farther. * Ignore this on depth zero. Caller must knows why. */ if (conns > 1 && (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_MIXER || w->selconn != index)) return; ctl = hdaa_audio_ctl_amp_get(devinfo, w->nid, HDAA_CTL_OUT, -1, 1); if (ctl) hdaa_audio_ctl_dev_set(ctl, ossdev, mute, &left, &right); for (i = devinfo->startnode; i < devinfo->endnode; i++) { wc = hdaa_widget_get(devinfo, i); if (wc == NULL || wc->enable == 0) continue; for (j = 0; j < wc->nconns; j++) { if (wc->connsenable[j] && wc->conns[j] == nid) { hdaa_audio_ctl_source_volume(pdevinfo, ossdev, wc->nid, j, mute, left, right, depth + 1); } } } return; } /* * Trace signal from destination, setting volumes on the way. */ static void hdaa_audio_ctl_dest_volume(struct hdaa_pcm_devinfo *pdevinfo, int ossdev, nid_t nid, int index, int mute, int left, int right, int depth) { struct hdaa_devinfo *devinfo = pdevinfo->devinfo; struct hdaa_audio_as *as = devinfo->as; struct hdaa_widget *w, *wc; struct hdaa_audio_ctl *ctl; int i, j, consumers, cleft, cright; if (depth > HDA_PARSE_MAXDEPTH) return; w = hdaa_widget_get(devinfo, nid); if (w == NULL || w->enable == 0) return; if (depth > 0) { /* If this node produce output for several consumers, we can't touch it. */ consumers = 0; for (i = devinfo->startnode; i < devinfo->endnode; i++) { wc = hdaa_widget_get(devinfo, i); if (wc == NULL || wc->enable == 0) continue; for (j = 0; j < wc->nconns; j++) { if (wc->connsenable[j] && wc->conns[j] == nid) consumers++; } } /* The only exception is if real HP redirection is configured and this is a duplication point. XXX: Actually exception is not completely correct. XXX: Duplication point check is not perfect. */ if ((consumers == 2 && (w->bindas < 0 || as[w->bindas].hpredir < 0 || as[w->bindas].fakeredir || (w->bindseqmask & (1 << 15)) == 0)) || consumers > 2) return; /* Else use it's output mixer. */ ctl = hdaa_audio_ctl_amp_get(devinfo, w->nid, HDAA_CTL_OUT, -1, 1); if (ctl) hdaa_audio_ctl_dev_set(ctl, ossdev, mute, &left, &right); } /* We must not traverse pin */ if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX && depth > 0) return; for (i = 0; i < w->nconns; i++) { if (w->connsenable[i] == 0) continue; if (index >= 0 && i != index) continue; cleft = left; cright = right; ctl = hdaa_audio_ctl_amp_get(devinfo, w->nid, HDAA_CTL_IN, i, 1); if (ctl) hdaa_audio_ctl_dev_set(ctl, ossdev, mute, &cleft, &cright); hdaa_audio_ctl_dest_volume(pdevinfo, ossdev, w->conns[i], -1, mute, cleft, cright, depth + 1); } } /* * Set volumes for the specified pdevinfo and ossdev. */ static void hdaa_audio_ctl_dev_volume(struct hdaa_pcm_devinfo *pdevinfo, unsigned dev) { struct hdaa_devinfo *devinfo = pdevinfo->devinfo; struct hdaa_widget *w, *cw; uint32_t mute; int lvol, rvol; int i, j; mute = 0; if (pdevinfo->left[dev] == 0) { mute |= HDAA_AMP_MUTE_LEFT; lvol = -4000; } else lvol = ((pdevinfo->maxamp[dev] - pdevinfo->minamp[dev]) * pdevinfo->left[dev] + 50) / 100 + pdevinfo->minamp[dev]; if (pdevinfo->right[dev] == 0) { mute |= HDAA_AMP_MUTE_RIGHT; rvol = -4000; } else rvol = ((pdevinfo->maxamp[dev] - pdevinfo->minamp[dev]) * pdevinfo->right[dev] + 50) / 100 + pdevinfo->minamp[dev]; for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; if (w->bindas < 0) { if (pdevinfo->index != 0) continue; } else { if (w->bindas != pdevinfo->playas && w->bindas != pdevinfo->recas) continue; } if (dev == SOUND_MIXER_RECLEV && w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_INPUT) { hdaa_audio_ctl_dest_volume(pdevinfo, dev, w->nid, -1, mute, lvol, rvol, 0); continue; } if (dev == SOUND_MIXER_VOLUME && w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX && devinfo->as[w->bindas].dir == HDAA_CTL_OUT) { hdaa_audio_ctl_dest_volume(pdevinfo, dev, w->nid, -1, mute, lvol, rvol, 0); continue; } if (dev == SOUND_MIXER_IGAIN && w->pflags & HDAA_ADC_MONITOR) { for (j = 0; j < w->nconns; j++) { if (!w->connsenable[j]) continue; cw = hdaa_widget_get(devinfo, w->conns[j]); if (cw == NULL || cw->enable == 0) continue; if (cw->bindas == -1) continue; if (cw->bindas >= 0 && devinfo->as[cw->bindas].dir != HDAA_CTL_IN) continue; hdaa_audio_ctl_dest_volume(pdevinfo, dev, w->nid, j, mute, lvol, rvol, 0); } continue; } if (w->ossdev != dev) continue; hdaa_audio_ctl_source_volume(pdevinfo, dev, w->nid, -1, mute, lvol, rvol, 0); if (dev == SOUND_MIXER_IMIX && (w->pflags & HDAA_IMIX_AS_DST)) hdaa_audio_ctl_dest_volume(pdevinfo, dev, w->nid, -1, mute, lvol, rvol, 0); } } /* * OSS Mixer set method. */ static int hdaa_audio_ctl_ossmixer_set(struct snd_mixer *m, unsigned dev, unsigned left, unsigned right) { struct hdaa_pcm_devinfo *pdevinfo = mix_getdevinfo(m); struct hdaa_devinfo *devinfo = pdevinfo->devinfo; struct hdaa_widget *w; int i; hdaa_lock(devinfo); /* Save new values. */ pdevinfo->left[dev] = left; pdevinfo->right[dev] = right; /* 'ogain' is the special case implemented with EAPD. */ if (dev == SOUND_MIXER_OGAIN) { uint32_t orig; w = NULL; for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; if (w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX || w->param.eapdbtl == HDA_INVALID) continue; break; } if (i >= devinfo->endnode) { hdaa_unlock(devinfo); return (-1); } orig = w->param.eapdbtl; if (left == 0) w->param.eapdbtl &= ~HDA_CMD_SET_EAPD_BTL_ENABLE_EAPD; else w->param.eapdbtl |= HDA_CMD_SET_EAPD_BTL_ENABLE_EAPD; if (orig != w->param.eapdbtl) { uint32_t val; val = w->param.eapdbtl; if (devinfo->quirks & HDAA_QUIRK_EAPDINV) val ^= HDA_CMD_SET_EAPD_BTL_ENABLE_EAPD; hda_command(devinfo->dev, HDA_CMD_SET_EAPD_BTL_ENABLE(0, w->nid, val)); } hdaa_unlock(devinfo); return (left | (left << 8)); } /* Recalculate all controls related to this OSS device. */ hdaa_audio_ctl_dev_volume(pdevinfo, dev); hdaa_unlock(devinfo); return (left | (right << 8)); } /* * Set mixer settings to our own default values: * +20dB for mics, -10dB for analog vol, mute for igain, 0dB for others. */ static void hdaa_audio_ctl_set_defaults(struct hdaa_pcm_devinfo *pdevinfo) { int amp, vol, dev; for (dev = 0; dev < SOUND_MIXER_NRDEVICES; dev++) { if ((pdevinfo->ossmask & (1 << dev)) == 0) continue; /* If the value was overriden, leave it as is. */ if (resource_int_value(device_get_name(pdevinfo->dev), device_get_unit(pdevinfo->dev), ossnames[dev], &vol) == 0) continue; vol = -1; if (dev == SOUND_MIXER_OGAIN) vol = 100; else if (dev == SOUND_MIXER_IGAIN) vol = 0; else if (dev == SOUND_MIXER_MIC || dev == SOUND_MIXER_MONITOR) amp = 20 * 4; /* +20dB */ else if (dev == SOUND_MIXER_VOLUME && !pdevinfo->digital) amp = -10 * 4; /* -10dB */ else amp = 0; if (vol < 0 && (pdevinfo->maxamp[dev] - pdevinfo->minamp[dev]) <= 0) { vol = 100; } else if (vol < 0) { vol = ((amp - pdevinfo->minamp[dev]) * 100 + (pdevinfo->maxamp[dev] - pdevinfo->minamp[dev]) / 2) / (pdevinfo->maxamp[dev] - pdevinfo->minamp[dev]); vol = imin(imax(vol, 1), 100); } mix_set(pdevinfo->mixer, dev, vol, vol); } } /* * Recursively commutate specified record source. */ static uint32_t hdaa_audio_ctl_recsel_comm(struct hdaa_pcm_devinfo *pdevinfo, uint32_t src, nid_t nid, int depth) { struct hdaa_devinfo *devinfo = pdevinfo->devinfo; struct hdaa_widget *w, *cw; struct hdaa_audio_ctl *ctl; char buf[64]; int i, muted; uint32_t res = 0; if (depth > HDA_PARSE_MAXDEPTH) return (0); w = hdaa_widget_get(devinfo, nid); if (w == NULL || w->enable == 0) return (0); for (i = 0; i < w->nconns; i++) { if (w->connsenable[i] == 0) continue; cw = hdaa_widget_get(devinfo, w->conns[i]); if (cw == NULL || cw->enable == 0 || cw->bindas == -1) continue; /* Call recursively to trace signal to it's source if needed. */ if ((src & cw->ossmask) != 0) { if (cw->ossdev < 0) { res |= hdaa_audio_ctl_recsel_comm(pdevinfo, src, w->conns[i], depth + 1); } else { res |= cw->ossmask; } } /* We have two special cases: mixers and others (selectors). */ if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_MIXER) { ctl = hdaa_audio_ctl_amp_get(devinfo, w->nid, HDAA_CTL_IN, i, 1); if (ctl == NULL) continue; /* If we have input control on this node mute them * according to requested sources. */ muted = (src & cw->ossmask) ? 0 : 1; if (muted != ctl->forcemute) { ctl->forcemute = muted; hdaa_audio_ctl_amp_set(ctl, HDAA_AMP_MUTE_DEFAULT, HDAA_AMP_VOL_DEFAULT, HDAA_AMP_VOL_DEFAULT); } HDA_BOOTHVERBOSE( device_printf(pdevinfo->dev, "Recsel (%s): nid %d source %d %s\n", hdaa_audio_ctl_ossmixer_mask2allname( src, buf, sizeof(buf)), nid, i, muted?"mute":"unmute"); ); } else { if (w->nconns == 1) break; if ((src & cw->ossmask) == 0) continue; /* If we found requested source - select it and exit. */ hdaa_widget_connection_select(w, i); HDA_BOOTHVERBOSE( device_printf(pdevinfo->dev, "Recsel (%s): nid %d source %d select\n", hdaa_audio_ctl_ossmixer_mask2allname( src, buf, sizeof(buf)), nid, i); ); break; } } return (res); } static uint32_t hdaa_audio_ctl_ossmixer_setrecsrc(struct snd_mixer *m, uint32_t src) { struct hdaa_pcm_devinfo *pdevinfo = mix_getdevinfo(m); struct hdaa_devinfo *devinfo = pdevinfo->devinfo; struct hdaa_widget *w; struct hdaa_audio_as *as; struct hdaa_audio_ctl *ctl; struct hdaa_chan *ch; int i, j; uint32_t ret = 0xffffffff; hdaa_lock(devinfo); if (pdevinfo->recas < 0) { hdaa_unlock(devinfo); return (0); } as = &devinfo->as[pdevinfo->recas]; /* For non-mixed associations we always recording everything. */ if (!as->mixed) { hdaa_unlock(devinfo); return (mix_getrecdevs(m)); } /* Commutate requested recsrc for each ADC. */ for (j = 0; j < as->num_chans; j++) { ch = &devinfo->chans[as->chans[j]]; for (i = 0; ch->io[i] >= 0; i++) { w = hdaa_widget_get(devinfo, ch->io[i]); if (w == NULL || w->enable == 0) continue; ret &= hdaa_audio_ctl_recsel_comm(pdevinfo, src, ch->io[i], 0); } } if (ret == 0xffffffff) ret = 0; /* * Some controls could be shared. Reset volumes for controls * related to previously chosen devices, as they may no longer * affect the signal. */ i = 0; while ((ctl = hdaa_audio_ctl_each(devinfo, &i)) != NULL) { if (ctl->enable == 0 || !(ctl->ossmask & pdevinfo->recsrc)) continue; if (!((pdevinfo->playas >= 0 && ctl->widget->bindas == pdevinfo->playas) || (pdevinfo->recas >= 0 && ctl->widget->bindas == pdevinfo->recas) || (pdevinfo->index == 0 && ctl->widget->bindas == -2))) continue; for (j = 0; j < SOUND_MIXER_NRDEVICES; j++) { if (pdevinfo->recsrc & (1 << j)) { ctl->devleft[j] = 0; ctl->devright[j] = 0; ctl->devmute[j] = 0; } } } /* * Some controls could be shared. Set volumes for controls * related to devices selected both previously and now. */ for (j = 0; j < SOUND_MIXER_NRDEVICES; j++) { if ((ret | pdevinfo->recsrc) & (1 << j)) hdaa_audio_ctl_dev_volume(pdevinfo, j); } pdevinfo->recsrc = ret; hdaa_unlock(devinfo); return (ret); } static kobj_method_t hdaa_audio_ctl_ossmixer_methods[] = { KOBJMETHOD(mixer_init, hdaa_audio_ctl_ossmixer_init), KOBJMETHOD(mixer_set, hdaa_audio_ctl_ossmixer_set), KOBJMETHOD(mixer_setrecsrc, hdaa_audio_ctl_ossmixer_setrecsrc), KOBJMETHOD_END }; MIXER_DECLARE(hdaa_audio_ctl_ossmixer); static void hdaa_dump_gpi(struct hdaa_devinfo *devinfo) { device_t dev = devinfo->dev; int i; uint32_t data, wake, unsol, sticky; if (HDA_PARAM_GPIO_COUNT_NUM_GPI(devinfo->gpio_cap) > 0) { data = hda_command(dev, HDA_CMD_GET_GPI_DATA(0, devinfo->nid)); wake = hda_command(dev, HDA_CMD_GET_GPI_WAKE_ENABLE_MASK(0, devinfo->nid)); unsol = hda_command(dev, HDA_CMD_GET_GPI_UNSOLICITED_ENABLE_MASK(0, devinfo->nid)); sticky = hda_command(dev, HDA_CMD_GET_GPI_STICKY_MASK(0, devinfo->nid)); for (i = 0; i < HDA_PARAM_GPIO_COUNT_NUM_GPI(devinfo->gpio_cap); i++) { device_printf(dev, " GPI%d:%s%s%s state=%d", i, (sticky & (1 << i)) ? " sticky" : "", (unsol & (1 << i)) ? " unsol" : "", (wake & (1 << i)) ? " wake" : "", (data >> i) & 1); } } } static void hdaa_dump_gpio(struct hdaa_devinfo *devinfo) { device_t dev = devinfo->dev; int i; uint32_t data, dir, enable, wake, unsol, sticky; if (HDA_PARAM_GPIO_COUNT_NUM_GPIO(devinfo->gpio_cap) > 0) { data = hda_command(dev, HDA_CMD_GET_GPIO_DATA(0, devinfo->nid)); enable = hda_command(dev, HDA_CMD_GET_GPIO_ENABLE_MASK(0, devinfo->nid)); dir = hda_command(dev, HDA_CMD_GET_GPIO_DIRECTION(0, devinfo->nid)); wake = hda_command(dev, HDA_CMD_GET_GPIO_WAKE_ENABLE_MASK(0, devinfo->nid)); unsol = hda_command(dev, HDA_CMD_GET_GPIO_UNSOLICITED_ENABLE_MASK(0, devinfo->nid)); sticky = hda_command(dev, HDA_CMD_GET_GPIO_STICKY_MASK(0, devinfo->nid)); for (i = 0; i < HDA_PARAM_GPIO_COUNT_NUM_GPIO(devinfo->gpio_cap); i++) { device_printf(dev, " GPIO%d: ", i); if ((enable & (1 << i)) == 0) { printf("disabled\n"); continue; } if ((dir & (1 << i)) == 0) { printf("input%s%s%s", (sticky & (1 << i)) ? " sticky" : "", (unsol & (1 << i)) ? " unsol" : "", (wake & (1 << i)) ? " wake" : ""); } else printf("output"); printf(" state=%d\n", (data >> i) & 1); } } } static void hdaa_dump_gpo(struct hdaa_devinfo *devinfo) { device_t dev = devinfo->dev; int i; uint32_t data; if (HDA_PARAM_GPIO_COUNT_NUM_GPO(devinfo->gpio_cap) > 0) { data = hda_command(dev, HDA_CMD_GET_GPO_DATA(0, devinfo->nid)); for (i = 0; i < HDA_PARAM_GPIO_COUNT_NUM_GPO(devinfo->gpio_cap); i++) { device_printf(dev, " GPO%d: state=%d", i, (data >> i) & 1); } } } static void hdaa_audio_parse(struct hdaa_devinfo *devinfo) { struct hdaa_widget *w; uint32_t res; int i; nid_t nid; nid = devinfo->nid; res = hda_command(devinfo->dev, HDA_CMD_GET_PARAMETER(0, nid, HDA_PARAM_GPIO_COUNT)); devinfo->gpio_cap = res; HDA_BOOTVERBOSE( device_printf(devinfo->dev, "NumGPIO=%d NumGPO=%d " "NumGPI=%d GPIWake=%d GPIUnsol=%d\n", HDA_PARAM_GPIO_COUNT_NUM_GPIO(devinfo->gpio_cap), HDA_PARAM_GPIO_COUNT_NUM_GPO(devinfo->gpio_cap), HDA_PARAM_GPIO_COUNT_NUM_GPI(devinfo->gpio_cap), HDA_PARAM_GPIO_COUNT_GPI_WAKE(devinfo->gpio_cap), HDA_PARAM_GPIO_COUNT_GPI_UNSOL(devinfo->gpio_cap)); hdaa_dump_gpi(devinfo); hdaa_dump_gpio(devinfo); hdaa_dump_gpo(devinfo); ); res = hda_command(devinfo->dev, HDA_CMD_GET_PARAMETER(0, nid, HDA_PARAM_SUPP_STREAM_FORMATS)); devinfo->supp_stream_formats = res; res = hda_command(devinfo->dev, HDA_CMD_GET_PARAMETER(0, nid, HDA_PARAM_SUPP_PCM_SIZE_RATE)); devinfo->supp_pcm_size_rate = res; res = hda_command(devinfo->dev, HDA_CMD_GET_PARAMETER(0, nid, HDA_PARAM_OUTPUT_AMP_CAP)); devinfo->outamp_cap = res; res = hda_command(devinfo->dev, HDA_CMD_GET_PARAMETER(0, nid, HDA_PARAM_INPUT_AMP_CAP)); devinfo->inamp_cap = res; for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL) device_printf(devinfo->dev, "Ghost widget! nid=%d!\n", i); else { w->devinfo = devinfo; w->nid = i; w->enable = 1; w->selconn = -1; w->pflags = 0; w->ossdev = -1; w->bindas = -1; w->param.eapdbtl = HDA_INVALID; hdaa_widget_parse(w); } } } static void hdaa_audio_postprocess(struct hdaa_devinfo *devinfo) { struct hdaa_widget *w; int i; for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL) continue; hdaa_widget_postprocess(w); } } static void hdaa_audio_ctl_parse(struct hdaa_devinfo *devinfo) { struct hdaa_audio_ctl *ctls; struct hdaa_widget *w, *cw; int i, j, cnt, max, ocap, icap; int mute, offset, step, size; /* XXX This is redundant */ max = 0; for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; if (w->param.outamp_cap != 0) max++; if (w->param.inamp_cap != 0) { switch (w->type) { case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_SELECTOR: case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_MIXER: for (j = 0; j < w->nconns; j++) { cw = hdaa_widget_get(devinfo, w->conns[j]); if (cw == NULL || cw->enable == 0) continue; max++; } break; default: max++; break; } } } devinfo->ctlcnt = max; if (max < 1) return; ctls = (struct hdaa_audio_ctl *)malloc( sizeof(*ctls) * max, M_HDAA, M_ZERO | M_NOWAIT); if (ctls == NULL) { /* Blekh! */ device_printf(devinfo->dev, "unable to allocate ctls!\n"); devinfo->ctlcnt = 0; return; } cnt = 0; for (i = devinfo->startnode; cnt < max && i < devinfo->endnode; i++) { if (cnt >= max) { device_printf(devinfo->dev, "%s: Ctl overflow!\n", __func__); break; } w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; ocap = w->param.outamp_cap; icap = w->param.inamp_cap; if (ocap != 0) { mute = HDA_PARAM_OUTPUT_AMP_CAP_MUTE_CAP(ocap); step = HDA_PARAM_OUTPUT_AMP_CAP_NUMSTEPS(ocap); size = HDA_PARAM_OUTPUT_AMP_CAP_STEPSIZE(ocap); offset = HDA_PARAM_OUTPUT_AMP_CAP_OFFSET(ocap); /*if (offset > step) { HDA_BOOTVERBOSE( device_printf(devinfo->dev, "BUGGY outamp: nid=%d " "[offset=%d > step=%d]\n", w->nid, offset, step); ); offset = step; }*/ ctls[cnt].enable = 1; ctls[cnt].widget = w; ctls[cnt].mute = mute; ctls[cnt].step = step; ctls[cnt].size = size; ctls[cnt].offset = offset; ctls[cnt].left = offset; ctls[cnt].right = offset; if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX || w->waspin) ctls[cnt].ndir = HDAA_CTL_IN; else ctls[cnt].ndir = HDAA_CTL_OUT; ctls[cnt++].dir = HDAA_CTL_OUT; } if (icap != 0) { mute = HDA_PARAM_OUTPUT_AMP_CAP_MUTE_CAP(icap); step = HDA_PARAM_OUTPUT_AMP_CAP_NUMSTEPS(icap); size = HDA_PARAM_OUTPUT_AMP_CAP_STEPSIZE(icap); offset = HDA_PARAM_OUTPUT_AMP_CAP_OFFSET(icap); /*if (offset > step) { HDA_BOOTVERBOSE( device_printf(devinfo->dev, "BUGGY inamp: nid=%d " "[offset=%d > step=%d]\n", w->nid, offset, step); ); offset = step; }*/ switch (w->type) { case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_SELECTOR: case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_MIXER: for (j = 0; j < w->nconns; j++) { if (cnt >= max) { device_printf(devinfo->dev, "%s: Ctl overflow!\n", __func__); break; } cw = hdaa_widget_get(devinfo, w->conns[j]); if (cw == NULL || cw->enable == 0) continue; ctls[cnt].enable = 1; ctls[cnt].widget = w; ctls[cnt].childwidget = cw; ctls[cnt].index = j; ctls[cnt].mute = mute; ctls[cnt].step = step; ctls[cnt].size = size; ctls[cnt].offset = offset; ctls[cnt].left = offset; ctls[cnt].right = offset; ctls[cnt].ndir = HDAA_CTL_IN; ctls[cnt++].dir = HDAA_CTL_IN; } break; default: if (cnt >= max) { device_printf(devinfo->dev, "%s: Ctl overflow!\n", __func__); break; } ctls[cnt].enable = 1; ctls[cnt].widget = w; ctls[cnt].mute = mute; ctls[cnt].step = step; ctls[cnt].size = size; ctls[cnt].offset = offset; ctls[cnt].left = offset; ctls[cnt].right = offset; if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) ctls[cnt].ndir = HDAA_CTL_OUT; else ctls[cnt].ndir = HDAA_CTL_IN; ctls[cnt++].dir = HDAA_CTL_IN; break; } } } devinfo->ctl = ctls; } static void hdaa_audio_as_parse(struct hdaa_devinfo *devinfo) { struct hdaa_audio_as *as; struct hdaa_widget *w; int i, j, cnt, max, type, dir, assoc, seq, first, hpredir; /* Count present associations */ max = 0; for (j = 1; j < 16; j++) { for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; if (w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) continue; if (HDA_CONFIG_DEFAULTCONF_ASSOCIATION(w->wclass.pin.config) != j) continue; max++; if (j != 15) /* There could be many 1-pin assocs #15 */ break; } } devinfo->ascnt = max; if (max < 1) return; as = (struct hdaa_audio_as *)malloc( sizeof(*as) * max, M_HDAA, M_ZERO | M_NOWAIT); if (as == NULL) { /* Blekh! */ device_printf(devinfo->dev, "unable to allocate assocs!\n"); devinfo->ascnt = 0; return; } for (i = 0; i < max; i++) { as[i].hpredir = -1; as[i].digital = 0; as[i].num_chans = 1; as[i].location = -1; } /* Scan associations skipping as=0. */ cnt = 0; for (j = 1; j < 16 && cnt < max; j++) { first = 16; hpredir = 0; for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; if (w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) continue; assoc = HDA_CONFIG_DEFAULTCONF_ASSOCIATION(w->wclass.pin.config); seq = HDA_CONFIG_DEFAULTCONF_SEQUENCE(w->wclass.pin.config); if (assoc != j) { continue; } KASSERT(cnt < max, ("%s: Associations owerflow (%d of %d)", __func__, cnt, max)); type = w->wclass.pin.config & HDA_CONFIG_DEFAULTCONF_DEVICE_MASK; /* Get pin direction. */ if (type == HDA_CONFIG_DEFAULTCONF_DEVICE_LINE_OUT || type == HDA_CONFIG_DEFAULTCONF_DEVICE_SPEAKER || type == HDA_CONFIG_DEFAULTCONF_DEVICE_HP_OUT || type == HDA_CONFIG_DEFAULTCONF_DEVICE_SPDIF_OUT || type == HDA_CONFIG_DEFAULTCONF_DEVICE_DIGITAL_OTHER_OUT) dir = HDAA_CTL_OUT; else dir = HDAA_CTL_IN; /* If this is a first pin - create new association. */ if (as[cnt].pincnt == 0) { as[cnt].enable = 1; as[cnt].index = j; as[cnt].dir = dir; } if (seq < first) first = seq; /* Check association correctness. */ if (as[cnt].pins[seq] != 0) { device_printf(devinfo->dev, "%s: Duplicate pin %d (%d) " "in association %d! Disabling association.\n", __func__, seq, w->nid, j); as[cnt].enable = 0; } if (dir != as[cnt].dir) { device_printf(devinfo->dev, "%s: Pin %d has wrong " "direction for association %d! Disabling " "association.\n", __func__, w->nid, j); as[cnt].enable = 0; } if (HDA_PARAM_AUDIO_WIDGET_CAP_DIGITAL(w->param.widget_cap)) { as[cnt].digital |= 0x1; if (HDA_PARAM_PIN_CAP_HDMI(w->wclass.pin.cap)) as[cnt].digital |= 0x2; if (HDA_PARAM_PIN_CAP_DP(w->wclass.pin.cap)) as[cnt].digital |= 0x4; } if (as[cnt].location == -1) { as[cnt].location = HDA_CONFIG_DEFAULTCONF_LOCATION(w->wclass.pin.config); } else if (as[cnt].location != HDA_CONFIG_DEFAULTCONF_LOCATION(w->wclass.pin.config)) { as[cnt].location = -2; } /* Headphones with seq=15 may mean redirection. */ if (type == HDA_CONFIG_DEFAULTCONF_DEVICE_HP_OUT && seq == 15) hpredir = 1; as[cnt].pins[seq] = w->nid; as[cnt].pincnt++; /* Association 15 is a multiple unassociated pins. */ if (j == 15) cnt++; } if (j != 15 && as[cnt].pincnt > 0) { if (hpredir && as[cnt].pincnt > 1) as[cnt].hpredir = first; cnt++; } } for (i = 0; i < max; i++) { if (as[i].dir == HDAA_CTL_IN && (as[i].pincnt == 1 || as[i].pins[14] > 0 || as[i].pins[15] > 0)) as[i].mixed = 1; } HDA_BOOTVERBOSE( device_printf(devinfo->dev, "%d associations found:\n", max); for (i = 0; i < max; i++) { device_printf(devinfo->dev, "Association %d (%d) %s%s:\n", i, as[i].index, (as[i].dir == HDAA_CTL_IN)?"in":"out", as[i].enable?"":" (disabled)"); for (j = 0; j < 16; j++) { if (as[i].pins[j] == 0) continue; device_printf(devinfo->dev, " Pin nid=%d seq=%d\n", as[i].pins[j], j); } } ); devinfo->as = as; } /* * Trace path from DAC to pin. */ static nid_t hdaa_audio_trace_dac(struct hdaa_devinfo *devinfo, int as, int seq, nid_t nid, int dupseq, int min, int only, int depth) { struct hdaa_widget *w; int i, im = -1; nid_t m = 0, ret; if (depth > HDA_PARSE_MAXDEPTH) return (0); w = hdaa_widget_get(devinfo, nid); if (w == NULL || w->enable == 0) return (0); HDA_BOOTHVERBOSE( if (!only) { device_printf(devinfo->dev, " %*stracing via nid %d\n", depth + 1, "", w->nid); } ); /* Use only unused widgets */ if (w->bindas >= 0 && w->bindas != as) { HDA_BOOTHVERBOSE( if (!only) { device_printf(devinfo->dev, " %*snid %d busy by association %d\n", depth + 1, "", w->nid, w->bindas); } ); return (0); } if (dupseq < 0) { if (w->bindseqmask != 0) { HDA_BOOTHVERBOSE( if (!only) { device_printf(devinfo->dev, " %*snid %d busy by seqmask %x\n", depth + 1, "", w->nid, w->bindseqmask); } ); return (0); } } else { /* If this is headphones - allow duplicate first pin. */ if (w->bindseqmask != 0 && (w->bindseqmask & (1 << dupseq)) == 0) { HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " %*snid %d busy by seqmask %x\n", depth + 1, "", w->nid, w->bindseqmask); ); return (0); } } switch (w->type) { case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_INPUT: /* Do not traverse input. AD1988 has digital monitor for which we are not ready. */ break; case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_OUTPUT: /* If we are tracing HP take only dac of first pin. */ if ((only == 0 || only == w->nid) && (w->nid >= min) && (dupseq < 0 || w->nid == devinfo->as[as].dacs[0][dupseq])) m = w->nid; break; case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX: if (depth > 0) break; /* Fall */ default: /* Find reachable DACs with smallest nid respecting constraints. */ for (i = 0; i < w->nconns; i++) { if (w->connsenable[i] == 0) continue; if (w->selconn != -1 && w->selconn != i) continue; if ((ret = hdaa_audio_trace_dac(devinfo, as, seq, w->conns[i], dupseq, min, only, depth + 1)) != 0) { if (m == 0 || ret < m) { m = ret; im = i; } if (only || dupseq >= 0) break; } } if (im >= 0 && only && ((w->nconns > 1 && w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_MIXER) || w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_SELECTOR)) w->selconn = im; break; } if (m && only) { w->bindas = as; w->bindseqmask |= (1 << seq); } HDA_BOOTHVERBOSE( if (!only) { device_printf(devinfo->dev, " %*snid %d returned %d\n", depth + 1, "", w->nid, m); } ); return (m); } /* * Trace path from widget to ADC. */ static nid_t hdaa_audio_trace_adc(struct hdaa_devinfo *devinfo, int as, int seq, nid_t nid, int mixed, int min, int only, int depth, int *length, int onlylength) { struct hdaa_widget *w, *wc; int i, j, im, lm = HDA_PARSE_MAXDEPTH; nid_t m = 0, ret; if (depth > HDA_PARSE_MAXDEPTH) return (0); w = hdaa_widget_get(devinfo, nid); if (w == NULL || w->enable == 0) return (0); HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " %*stracing via nid %d\n", depth + 1, "", w->nid); ); /* Use only unused widgets */ if (w->bindas >= 0 && w->bindas != as) { HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " %*snid %d busy by association %d\n", depth + 1, "", w->nid, w->bindas); ); return (0); } if (!mixed && w->bindseqmask != 0) { HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " %*snid %d busy by seqmask %x\n", depth + 1, "", w->nid, w->bindseqmask); ); return (0); } switch (w->type) { case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_INPUT: if ((only == 0 || only == w->nid) && (w->nid >= min) && (onlylength == 0 || onlylength == depth)) { m = w->nid; *length = depth; } break; case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX: if (depth > 0) break; /* Fall */ default: /* Try to find reachable ADCs with specified nid. */ for (j = devinfo->startnode; j < devinfo->endnode; j++) { wc = hdaa_widget_get(devinfo, j); if (wc == NULL || wc->enable == 0) continue; im = -1; for (i = 0; i < wc->nconns; i++) { if (wc->connsenable[i] == 0) continue; if (wc->conns[i] != nid) continue; if ((ret = hdaa_audio_trace_adc(devinfo, as, seq, j, mixed, min, only, depth + 1, length, onlylength)) != 0) { if (m == 0 || ret < m || (ret == m && *length < lm)) { m = ret; im = i; lm = *length; } else *length = lm; if (only) break; } } if (im >= 0 && only && ((wc->nconns > 1 && wc->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_MIXER) || wc->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_SELECTOR)) wc->selconn = im; } break; } if (m && only) { w->bindas = as; w->bindseqmask |= (1 << seq); } HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " %*snid %d returned %d\n", depth + 1, "", w->nid, m); ); return (m); } /* * Erase trace path of the specified association. */ static void hdaa_audio_undo_trace(struct hdaa_devinfo *devinfo, int as, int seq) { struct hdaa_widget *w; int i; for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; if (w->bindas == as) { if (seq >= 0) { w->bindseqmask &= ~(1 << seq); if (w->bindseqmask == 0) { w->bindas = -1; w->selconn = -1; } } else { w->bindas = -1; w->bindseqmask = 0; w->selconn = -1; } } } } /* * Trace association path from DAC to output */ static int hdaa_audio_trace_as_out(struct hdaa_devinfo *devinfo, int as, int seq) { struct hdaa_audio_as *ases = devinfo->as; int i, hpredir; nid_t min, res; /* Find next pin */ for (i = seq; i < 16 && ases[as].pins[i] == 0; i++) ; /* Check if there is no any left. If so - we succeeded. */ if (i == 16) return (1); hpredir = (i == 15 && ases[as].fakeredir == 0)?ases[as].hpredir:-1; min = 0; do { HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " Tracing pin %d with min nid %d", ases[as].pins[i], min); if (hpredir >= 0) printf(" and hpredir %d", hpredir); printf("\n"); ); /* Trace this pin taking min nid into account. */ res = hdaa_audio_trace_dac(devinfo, as, i, ases[as].pins[i], hpredir, min, 0, 0); if (res == 0) { /* If we failed - return to previous and redo it. */ HDA_BOOTVERBOSE( device_printf(devinfo->dev, " Unable to trace pin %d seq %d with min " "nid %d", ases[as].pins[i], i, min); if (hpredir >= 0) printf(" and hpredir %d", hpredir); printf("\n"); ); return (0); } HDA_BOOTVERBOSE( device_printf(devinfo->dev, " Pin %d traced to DAC %d", ases[as].pins[i], res); if (hpredir >= 0) printf(" and hpredir %d", hpredir); if (ases[as].fakeredir) printf(" with fake redirection"); printf("\n"); ); /* Trace again to mark the path */ hdaa_audio_trace_dac(devinfo, as, i, ases[as].pins[i], hpredir, min, res, 0); ases[as].dacs[0][i] = res; /* We succeeded, so call next. */ if (hdaa_audio_trace_as_out(devinfo, as, i + 1)) return (1); /* If next failed, we should retry with next min */ hdaa_audio_undo_trace(devinfo, as, i); ases[as].dacs[0][i] = 0; min = res + 1; } while (1); } /* * Check equivalency of two DACs. */ static int hdaa_audio_dacs_equal(struct hdaa_widget *w1, struct hdaa_widget *w2) { struct hdaa_devinfo *devinfo = w1->devinfo; struct hdaa_widget *w3; int i, j, c1, c2; if (memcmp(&w1->param, &w2->param, sizeof(w1->param))) return (0); for (i = devinfo->startnode; i < devinfo->endnode; i++) { w3 = hdaa_widget_get(devinfo, i); if (w3 == NULL || w3->enable == 0) continue; if (w3->bindas != w1->bindas) continue; if (w3->nconns == 0) continue; c1 = c2 = -1; for (j = 0; j < w3->nconns; j++) { if (w3->connsenable[j] == 0) continue; if (w3->conns[j] == w1->nid) c1 = j; if (w3->conns[j] == w2->nid) c2 = j; } if (c1 < 0) continue; if (c2 < 0) return (0); if (w3->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_MIXER) return (0); } return (1); } /* * Check equivalency of two ADCs. */ static int hdaa_audio_adcs_equal(struct hdaa_widget *w1, struct hdaa_widget *w2) { struct hdaa_devinfo *devinfo = w1->devinfo; struct hdaa_widget *w3, *w4; int i; if (memcmp(&w1->param, &w2->param, sizeof(w1->param))) return (0); if (w1->nconns != 1 || w2->nconns != 1) return (0); if (w1->conns[0] == w2->conns[0]) return (1); w3 = hdaa_widget_get(devinfo, w1->conns[0]); if (w3 == NULL || w3->enable == 0) return (0); w4 = hdaa_widget_get(devinfo, w2->conns[0]); if (w4 == NULL || w4->enable == 0) return (0); if (w3->bindas == w4->bindas && w3->bindseqmask == w4->bindseqmask) return (1); if (w4->bindas >= 0) return (0); if (w3->type != w4->type) return (0); if (memcmp(&w3->param, &w4->param, sizeof(w3->param))) return (0); if (w3->nconns != w4->nconns) return (0); for (i = 0; i < w3->nconns; i++) { if (w3->conns[i] != w4->conns[i]) return (0); } return (1); } /* * Look for equivalent DAC/ADC to implement second channel. */ static void hdaa_audio_adddac(struct hdaa_devinfo *devinfo, int asid) { struct hdaa_audio_as *as = &devinfo->as[asid]; struct hdaa_widget *w1, *w2; int i, pos; nid_t nid1, nid2; HDA_BOOTVERBOSE( device_printf(devinfo->dev, "Looking for additional %sC " "for association %d (%d)\n", (as->dir == HDAA_CTL_OUT) ? "DA" : "AD", asid, as->index); ); /* Find the exisitng DAC position and return if found more the one. */ pos = -1; for (i = 0; i < 16; i++) { if (as->dacs[0][i] <= 0) continue; if (pos >= 0 && as->dacs[0][i] != as->dacs[0][pos]) return; pos = i; } nid1 = as->dacs[0][pos]; w1 = hdaa_widget_get(devinfo, nid1); w2 = NULL; for (nid2 = devinfo->startnode; nid2 < devinfo->endnode; nid2++) { w2 = hdaa_widget_get(devinfo, nid2); if (w2 == NULL || w2->enable == 0) continue; if (w2->bindas >= 0) continue; if (w1->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_OUTPUT) { if (w2->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_OUTPUT) continue; if (hdaa_audio_dacs_equal(w1, w2)) break; } else { if (w2->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_INPUT) continue; if (hdaa_audio_adcs_equal(w1, w2)) break; } } if (nid2 >= devinfo->endnode) return; w2->bindas = w1->bindas; w2->bindseqmask = w1->bindseqmask; if (w1->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_INPUT) { HDA_BOOTVERBOSE( device_printf(devinfo->dev, " ADC %d considered equal to ADC %d\n", nid2, nid1); ); w1 = hdaa_widget_get(devinfo, w1->conns[0]); w2 = hdaa_widget_get(devinfo, w2->conns[0]); w2->bindas = w1->bindas; w2->bindseqmask = w1->bindseqmask; } else { HDA_BOOTVERBOSE( device_printf(devinfo->dev, " DAC %d considered equal to DAC %d\n", nid2, nid1); ); } for (i = 0; i < 16; i++) { if (as->dacs[0][i] <= 0) continue; as->dacs[as->num_chans][i] = nid2; } as->num_chans++; } /* * Trace association path from input to ADC */ static int hdaa_audio_trace_as_in(struct hdaa_devinfo *devinfo, int as) { struct hdaa_audio_as *ases = devinfo->as; struct hdaa_widget *w; int i, j, k, length; for (j = devinfo->startnode; j < devinfo->endnode; j++) { w = hdaa_widget_get(devinfo, j); if (w == NULL || w->enable == 0) continue; if (w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_INPUT) continue; if (w->bindas >= 0 && w->bindas != as) continue; /* Find next pin */ for (i = 0; i < 16; i++) { if (ases[as].pins[i] == 0) continue; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " Tracing pin %d to ADC %d\n", ases[as].pins[i], j); ); /* Trace this pin taking goal into account. */ if (hdaa_audio_trace_adc(devinfo, as, i, ases[as].pins[i], 1, 0, j, 0, &length, 0) == 0) { /* If we failed - return to previous and redo it. */ HDA_BOOTVERBOSE( device_printf(devinfo->dev, " Unable to trace pin %d to ADC %d, undo traces\n", ases[as].pins[i], j); ); hdaa_audio_undo_trace(devinfo, as, -1); for (k = 0; k < 16; k++) ases[as].dacs[0][k] = 0; break; } HDA_BOOTVERBOSE( device_printf(devinfo->dev, " Pin %d traced to ADC %d\n", ases[as].pins[i], j); ); ases[as].dacs[0][i] = j; } if (i == 16) return (1); } return (0); } /* * Trace association path from input to multiple ADCs */ static int hdaa_audio_trace_as_in_mch(struct hdaa_devinfo *devinfo, int as, int seq) { struct hdaa_audio_as *ases = devinfo->as; int i, length; nid_t min, res; /* Find next pin */ for (i = seq; i < 16 && ases[as].pins[i] == 0; i++) ; /* Check if there is no any left. If so - we succeeded. */ if (i == 16) return (1); min = 0; do { HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " Tracing pin %d with min nid %d", ases[as].pins[i], min); printf("\n"); ); /* Trace this pin taking min nid into account. */ res = hdaa_audio_trace_adc(devinfo, as, i, ases[as].pins[i], 0, min, 0, 0, &length, 0); if (res == 0) { /* If we failed - return to previous and redo it. */ HDA_BOOTVERBOSE( device_printf(devinfo->dev, " Unable to trace pin %d seq %d with min " "nid %d", ases[as].pins[i], i, min); printf("\n"); ); return (0); } HDA_BOOTVERBOSE( device_printf(devinfo->dev, " Pin %d traced to ADC %d\n", ases[as].pins[i], res); ); /* Trace again to mark the path */ hdaa_audio_trace_adc(devinfo, as, i, ases[as].pins[i], 0, min, res, 0, &length, length); ases[as].dacs[0][i] = res; /* We succeeded, so call next. */ if (hdaa_audio_trace_as_in_mch(devinfo, as, i + 1)) return (1); /* If next failed, we should retry with next min */ hdaa_audio_undo_trace(devinfo, as, i); ases[as].dacs[0][i] = 0; min = res + 1; } while (1); } /* * Trace input monitor path from mixer to output association. */ static int hdaa_audio_trace_to_out(struct hdaa_devinfo *devinfo, nid_t nid, int depth) { struct hdaa_audio_as *ases = devinfo->as; struct hdaa_widget *w, *wc; int i, j; nid_t res = 0; if (depth > HDA_PARSE_MAXDEPTH) return (0); w = hdaa_widget_get(devinfo, nid); if (w == NULL || w->enable == 0) return (0); HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " %*stracing via nid %d\n", depth + 1, "", w->nid); ); /* Use only unused widgets */ if (depth > 0 && w->bindas != -1) { if (w->bindas < 0 || ases[w->bindas].dir == HDAA_CTL_OUT) { HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " %*snid %d found output association %d\n", depth + 1, "", w->nid, w->bindas); ); if (w->bindas >= 0) w->pflags |= HDAA_ADC_MONITOR; return (1); } else { HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " %*snid %d busy by input association %d\n", depth + 1, "", w->nid, w->bindas); ); return (0); } } switch (w->type) { case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_INPUT: /* Do not traverse input. AD1988 has digital monitor for which we are not ready. */ break; case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX: if (depth > 0) break; /* Fall */ default: /* Try to find reachable ADCs with specified nid. */ for (j = devinfo->startnode; j < devinfo->endnode; j++) { wc = hdaa_widget_get(devinfo, j); if (wc == NULL || wc->enable == 0) continue; for (i = 0; i < wc->nconns; i++) { if (wc->connsenable[i] == 0) continue; if (wc->conns[i] != nid) continue; if (hdaa_audio_trace_to_out(devinfo, j, depth + 1) != 0) { res = 1; if (wc->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_SELECTOR && wc->selconn == -1) wc->selconn = i; } } } break; } if (res && w->bindas == -1) w->bindas = -2; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " %*snid %d returned %d\n", depth + 1, "", w->nid, res); ); return (res); } /* * Trace extra associations (beeper, monitor) */ static void hdaa_audio_trace_as_extra(struct hdaa_devinfo *devinfo) { struct hdaa_audio_as *as = devinfo->as; struct hdaa_widget *w; int j; /* Input monitor */ /* Find mixer associated with input, but supplying signal for output associations. Hope it will be input monitor. */ HDA_BOOTVERBOSE( device_printf(devinfo->dev, "Tracing input monitor\n"); ); for (j = devinfo->startnode; j < devinfo->endnode; j++) { w = hdaa_widget_get(devinfo, j); if (w == NULL || w->enable == 0) continue; if (w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_MIXER) continue; if (w->bindas < 0 || as[w->bindas].dir != HDAA_CTL_IN) continue; HDA_BOOTVERBOSE( device_printf(devinfo->dev, " Tracing nid %d to out\n", j); ); if (hdaa_audio_trace_to_out(devinfo, w->nid, 0)) { HDA_BOOTVERBOSE( device_printf(devinfo->dev, " nid %d is input monitor\n", w->nid); ); w->ossdev = SOUND_MIXER_IMIX; } } /* Other inputs monitor */ /* Find input pins supplying signal for output associations. Hope it will be input monitoring. */ HDA_BOOTVERBOSE( device_printf(devinfo->dev, "Tracing other input monitors\n"); ); for (j = devinfo->startnode; j < devinfo->endnode; j++) { w = hdaa_widget_get(devinfo, j); if (w == NULL || w->enable == 0) continue; if (w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) continue; if (w->bindas < 0 || as[w->bindas].dir != HDAA_CTL_IN) continue; HDA_BOOTVERBOSE( device_printf(devinfo->dev, " Tracing nid %d to out\n", j); ); if (hdaa_audio_trace_to_out(devinfo, w->nid, 0)) { HDA_BOOTVERBOSE( device_printf(devinfo->dev, " nid %d is input monitor\n", w->nid); ); } } /* Beeper */ HDA_BOOTVERBOSE( device_printf(devinfo->dev, "Tracing beeper\n"); ); for (j = devinfo->startnode; j < devinfo->endnode; j++) { w = hdaa_widget_get(devinfo, j); if (w == NULL || w->enable == 0) continue; if (w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_BEEP_WIDGET) continue; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " Tracing nid %d to out\n", j); ); if (hdaa_audio_trace_to_out(devinfo, w->nid, 0)) { HDA_BOOTVERBOSE( device_printf(devinfo->dev, " nid %d traced to out\n", j); ); } w->bindas = -2; } } /* * Bind assotiations to PCM channels */ static void hdaa_audio_bind_as(struct hdaa_devinfo *devinfo) { struct hdaa_audio_as *as = devinfo->as; int i, j, cnt = 0, free; for (j = 0; j < devinfo->ascnt; j++) { if (as[j].enable) cnt += as[j].num_chans; } if (devinfo->num_chans == 0) { devinfo->chans = (struct hdaa_chan *)malloc( sizeof(struct hdaa_chan) * cnt, M_HDAA, M_ZERO | M_NOWAIT); if (devinfo->chans == NULL) { device_printf(devinfo->dev, "Channels memory allocation failed!\n"); return; } } else { devinfo->chans = (struct hdaa_chan *)realloc(devinfo->chans, sizeof(struct hdaa_chan) * (devinfo->num_chans + cnt), M_HDAA, M_ZERO | M_NOWAIT); if (devinfo->chans == NULL) { devinfo->num_chans = 0; device_printf(devinfo->dev, "Channels memory allocation failed!\n"); return; } /* Fixup relative pointers after realloc */ for (j = 0; j < devinfo->num_chans; j++) devinfo->chans[j].caps.fmtlist = devinfo->chans[j].fmtlist; } free = devinfo->num_chans; devinfo->num_chans += cnt; for (j = free; j < free + cnt; j++) { devinfo->chans[j].devinfo = devinfo; devinfo->chans[j].as = -1; } /* Assign associations in order of their numbers, */ for (j = 0; j < devinfo->ascnt; j++) { if (as[j].enable == 0) continue; for (i = 0; i < as[j].num_chans; i++) { devinfo->chans[free].as = j; devinfo->chans[free].asindex = i; devinfo->chans[free].dir = (as[j].dir == HDAA_CTL_IN) ? PCMDIR_REC : PCMDIR_PLAY; hdaa_pcmchannel_setup(&devinfo->chans[free]); as[j].chans[i] = free; free++; } } } static void hdaa_audio_disable_nonaudio(struct hdaa_devinfo *devinfo) { struct hdaa_widget *w; int i; /* Disable power and volume widgets. */ for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_POWER_WIDGET || w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_VOLUME_WIDGET) { w->enable = 0; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " Disabling nid %d due to it's" " non-audio type.\n", w->nid); ); } } } static void hdaa_audio_disable_useless(struct hdaa_devinfo *devinfo) { struct hdaa_widget *w, *cw; struct hdaa_audio_ctl *ctl; int done, found, i, j, k; /* Disable useless pins. */ for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) { if ((w->wclass.pin.config & HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_MASK) == HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_NONE) { w->enable = 0; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " Disabling pin nid %d due" " to None connectivity.\n", w->nid); ); } else if ((w->wclass.pin.config & HDA_CONFIG_DEFAULTCONF_ASSOCIATION_MASK) == 0) { w->enable = 0; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " Disabling unassociated" " pin nid %d.\n", w->nid); ); } } } do { done = 1; /* Disable and mute controls for disabled widgets. */ i = 0; while ((ctl = hdaa_audio_ctl_each(devinfo, &i)) != NULL) { if (ctl->enable == 0) continue; if (ctl->widget->enable == 0 || (ctl->childwidget != NULL && ctl->childwidget->enable == 0)) { ctl->forcemute = 1; ctl->muted = HDAA_AMP_MUTE_ALL; ctl->left = 0; ctl->right = 0; ctl->enable = 0; if (ctl->ndir == HDAA_CTL_IN) ctl->widget->connsenable[ctl->index] = 0; done = 0; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " Disabling ctl %d nid %d cnid %d due" " to disabled widget.\n", i, ctl->widget->nid, (ctl->childwidget != NULL)? ctl->childwidget->nid:-1); ); } } /* Disable useless widgets. */ for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; /* Disable inputs with disabled child widgets. */ for (j = 0; j < w->nconns; j++) { if (w->connsenable[j]) { cw = hdaa_widget_get(devinfo, w->conns[j]); if (cw == NULL || cw->enable == 0) { w->connsenable[j] = 0; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " Disabling nid %d connection %d due" " to disabled child widget.\n", i, j); ); } } } if (w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_SELECTOR && w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_MIXER) continue; /* Disable mixers and selectors without inputs. */ found = 0; for (j = 0; j < w->nconns; j++) { if (w->connsenable[j]) { found = 1; break; } } if (found == 0) { w->enable = 0; done = 0; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " Disabling nid %d due to all it's" " inputs disabled.\n", w->nid); ); } /* Disable nodes without consumers. */ if (w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_SELECTOR && w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_MIXER) continue; found = 0; for (k = devinfo->startnode; k < devinfo->endnode; k++) { cw = hdaa_widget_get(devinfo, k); if (cw == NULL || cw->enable == 0) continue; for (j = 0; j < cw->nconns; j++) { if (cw->connsenable[j] && cw->conns[j] == i) { found = 1; break; } } } if (found == 0) { w->enable = 0; done = 0; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " Disabling nid %d due to all it's" " consumers disabled.\n", w->nid); ); } } } while (done == 0); } static void hdaa_audio_disable_unas(struct hdaa_devinfo *devinfo) { struct hdaa_audio_as *as = devinfo->as; struct hdaa_widget *w, *cw; struct hdaa_audio_ctl *ctl; int i, j, k; /* Disable unassosiated widgets. */ for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; if (w->bindas == -1) { w->enable = 0; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " Disabling unassociated nid %d.\n", w->nid); ); } } /* Disable input connections on input pin and * output on output. */ for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; if (w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) continue; if (w->bindas < 0) continue; if (as[w->bindas].dir == HDAA_CTL_IN) { for (j = 0; j < w->nconns; j++) { if (w->connsenable[j] == 0) continue; w->connsenable[j] = 0; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " Disabling connection to input pin " "nid %d conn %d.\n", i, j); ); } ctl = hdaa_audio_ctl_amp_get(devinfo, w->nid, HDAA_CTL_IN, -1, 1); if (ctl && ctl->enable) { ctl->forcemute = 1; ctl->muted = HDAA_AMP_MUTE_ALL; ctl->left = 0; ctl->right = 0; ctl->enable = 0; } } else { ctl = hdaa_audio_ctl_amp_get(devinfo, w->nid, HDAA_CTL_OUT, -1, 1); if (ctl && ctl->enable) { ctl->forcemute = 1; ctl->muted = HDAA_AMP_MUTE_ALL; ctl->left = 0; ctl->right = 0; ctl->enable = 0; } for (k = devinfo->startnode; k < devinfo->endnode; k++) { cw = hdaa_widget_get(devinfo, k); if (cw == NULL || cw->enable == 0) continue; for (j = 0; j < cw->nconns; j++) { if (cw->connsenable[j] && cw->conns[j] == i) { cw->connsenable[j] = 0; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " Disabling connection from output pin " "nid %d conn %d cnid %d.\n", k, j, i); ); if (cw->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX && cw->nconns > 1) continue; ctl = hdaa_audio_ctl_amp_get(devinfo, k, HDAA_CTL_IN, j, 1); if (ctl && ctl->enable) { ctl->forcemute = 1; ctl->muted = HDAA_AMP_MUTE_ALL; ctl->left = 0; ctl->right = 0; ctl->enable = 0; } } } } } } } static void hdaa_audio_disable_notselected(struct hdaa_devinfo *devinfo) { struct hdaa_audio_as *as = devinfo->as; struct hdaa_widget *w; int i, j; /* On playback path we can safely disable all unseleted inputs. */ for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; if (w->nconns <= 1) continue; if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_MIXER) continue; if (w->bindas < 0 || as[w->bindas].dir == HDAA_CTL_IN) continue; for (j = 0; j < w->nconns; j++) { if (w->connsenable[j] == 0) continue; if (w->selconn < 0 || w->selconn == j) continue; w->connsenable[j] = 0; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " Disabling unselected connection " "nid %d conn %d.\n", i, j); ); } } } static void hdaa_audio_disable_crossas(struct hdaa_devinfo *devinfo) { struct hdaa_audio_as *ases = devinfo->as; struct hdaa_widget *w, *cw; struct hdaa_audio_ctl *ctl; int i, j; /* Disable crossassociatement and unwanted crosschannel connections. */ /* ... using selectors */ for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; if (w->nconns <= 1) continue; if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_MIXER) continue; /* Allow any -> mix */ if (w->bindas == -2) continue; for (j = 0; j < w->nconns; j++) { if (w->connsenable[j] == 0) continue; cw = hdaa_widget_get(devinfo, w->conns[j]); if (cw == NULL || w->enable == 0) continue; /* Allow mix -> out. */ if (cw->bindas == -2 && w->bindas >= 0 && ases[w->bindas].dir == HDAA_CTL_OUT) continue; /* Allow mix -> mixed-in. */ if (cw->bindas == -2 && w->bindas >= 0 && ases[w->bindas].mixed) continue; /* Allow in -> mix. */ if ((w->pflags & HDAA_ADC_MONITOR) && cw->bindas >= 0 && ases[cw->bindas].dir == HDAA_CTL_IN) continue; /* Allow if have common as/seqs. */ if (w->bindas == cw->bindas && (w->bindseqmask & cw->bindseqmask) != 0) continue; w->connsenable[j] = 0; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " Disabling crossassociatement connection " "nid %d conn %d cnid %d.\n", i, j, cw->nid); ); } } /* ... using controls */ i = 0; while ((ctl = hdaa_audio_ctl_each(devinfo, &i)) != NULL) { if (ctl->enable == 0 || ctl->childwidget == NULL) continue; /* Allow any -> mix */ if (ctl->widget->bindas == -2) continue; /* Allow mix -> out. */ if (ctl->childwidget->bindas == -2 && ctl->widget->bindas >= 0 && ases[ctl->widget->bindas].dir == HDAA_CTL_OUT) continue; /* Allow mix -> mixed-in. */ if (ctl->childwidget->bindas == -2 && ctl->widget->bindas >= 0 && ases[ctl->widget->bindas].mixed) continue; /* Allow in -> mix. */ if ((ctl->widget->pflags & HDAA_ADC_MONITOR) && ctl->childwidget->bindas >= 0 && ases[ctl->childwidget->bindas].dir == HDAA_CTL_IN) continue; /* Allow if have common as/seqs. */ if (ctl->widget->bindas == ctl->childwidget->bindas && (ctl->widget->bindseqmask & ctl->childwidget->bindseqmask) != 0) continue; ctl->forcemute = 1; ctl->muted = HDAA_AMP_MUTE_ALL; ctl->left = 0; ctl->right = 0; ctl->enable = 0; if (ctl->ndir == HDAA_CTL_IN) ctl->widget->connsenable[ctl->index] = 0; HDA_BOOTHVERBOSE( device_printf(devinfo->dev, " Disabling crossassociatement connection " "ctl %d nid %d cnid %d.\n", i, ctl->widget->nid, ctl->childwidget->nid); ); } } /* * Find controls to control amplification for source and calculate possible * amplification range. */ static int hdaa_audio_ctl_source_amp(struct hdaa_devinfo *devinfo, nid_t nid, int index, int ossdev, int ctlable, int depth, int *minamp, int *maxamp) { struct hdaa_widget *w, *wc; struct hdaa_audio_ctl *ctl; int i, j, conns = 0, tminamp, tmaxamp, cminamp, cmaxamp, found = 0; if (depth > HDA_PARSE_MAXDEPTH) return (found); w = hdaa_widget_get(devinfo, nid); if (w == NULL || w->enable == 0) return (found); /* Count number of active inputs. */ if (depth > 0) { for (j = 0; j < w->nconns; j++) { if (!w->connsenable[j]) continue; conns++; } } /* If this is not a first step - use input mixer. Pins have common input ctl so care must be taken. */ if (depth > 0 && ctlable && (conns == 1 || w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX)) { ctl = hdaa_audio_ctl_amp_get(devinfo, w->nid, HDAA_CTL_IN, index, 1); if (ctl) { ctl->ossmask |= (1 << ossdev); found++; if (*minamp == *maxamp) { *minamp += MINQDB(ctl); *maxamp += MAXQDB(ctl); } } } /* If widget has own ossdev - not traverse it. - It will be traversed on it's own. */ + It will be traversed on its own. */ if (w->ossdev >= 0 && depth > 0) return (found); /* We must not traverse pin */ if ((w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_INPUT || w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) && depth > 0) return (found); /* record that this widget exports such signal, */ w->ossmask |= (1 << ossdev); /* * If signals mixed, we can't assign controls farther. * Ignore this on depth zero. Caller must knows why. */ if (conns > 1 && w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_MIXER) ctlable = 0; if (ctlable) { ctl = hdaa_audio_ctl_amp_get(devinfo, w->nid, HDAA_CTL_OUT, -1, 1); if (ctl) { ctl->ossmask |= (1 << ossdev); found++; if (*minamp == *maxamp) { *minamp += MINQDB(ctl); *maxamp += MAXQDB(ctl); } } } cminamp = cmaxamp = 0; for (i = devinfo->startnode; i < devinfo->endnode; i++) { wc = hdaa_widget_get(devinfo, i); if (wc == NULL || wc->enable == 0) continue; for (j = 0; j < wc->nconns; j++) { if (wc->connsenable[j] && wc->conns[j] == nid) { tminamp = tmaxamp = 0; found += hdaa_audio_ctl_source_amp(devinfo, wc->nid, j, ossdev, ctlable, depth + 1, &tminamp, &tmaxamp); if (cminamp == 0 && cmaxamp == 0) { cminamp = tminamp; cmaxamp = tmaxamp; } else if (tminamp != tmaxamp) { cminamp = imax(cminamp, tminamp); cmaxamp = imin(cmaxamp, tmaxamp); } } } } if (*minamp == *maxamp && cminamp < cmaxamp) { *minamp += cminamp; *maxamp += cmaxamp; } return (found); } /* * Find controls to control amplification for destination and calculate * possible amplification range. */ static int hdaa_audio_ctl_dest_amp(struct hdaa_devinfo *devinfo, nid_t nid, int index, int ossdev, int depth, int *minamp, int *maxamp) { struct hdaa_audio_as *as = devinfo->as; struct hdaa_widget *w, *wc; struct hdaa_audio_ctl *ctl; int i, j, consumers, tminamp, tmaxamp, cminamp, cmaxamp, found = 0; if (depth > HDA_PARSE_MAXDEPTH) return (found); w = hdaa_widget_get(devinfo, nid); if (w == NULL || w->enable == 0) return (found); if (depth > 0) { /* If this node produce output for several consumers, we can't touch it. */ consumers = 0; for (i = devinfo->startnode; i < devinfo->endnode; i++) { wc = hdaa_widget_get(devinfo, i); if (wc == NULL || wc->enable == 0) continue; for (j = 0; j < wc->nconns; j++) { if (wc->connsenable[j] && wc->conns[j] == nid) consumers++; } } /* The only exception is if real HP redirection is configured and this is a duplication point. XXX: Actually exception is not completely correct. XXX: Duplication point check is not perfect. */ if ((consumers == 2 && (w->bindas < 0 || as[w->bindas].hpredir < 0 || as[w->bindas].fakeredir || (w->bindseqmask & (1 << 15)) == 0)) || consumers > 2) return (found); /* Else use it's output mixer. */ ctl = hdaa_audio_ctl_amp_get(devinfo, w->nid, HDAA_CTL_OUT, -1, 1); if (ctl) { ctl->ossmask |= (1 << ossdev); found++; if (*minamp == *maxamp) { *minamp += MINQDB(ctl); *maxamp += MAXQDB(ctl); } } } /* We must not traverse pin */ if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX && depth > 0) return (found); cminamp = cmaxamp = 0; for (i = 0; i < w->nconns; i++) { if (w->connsenable[i] == 0) continue; if (index >= 0 && i != index) continue; tminamp = tmaxamp = 0; ctl = hdaa_audio_ctl_amp_get(devinfo, w->nid, HDAA_CTL_IN, i, 1); if (ctl) { ctl->ossmask |= (1 << ossdev); found++; if (*minamp == *maxamp) { tminamp += MINQDB(ctl); tmaxamp += MAXQDB(ctl); } } found += hdaa_audio_ctl_dest_amp(devinfo, w->conns[i], -1, ossdev, depth + 1, &tminamp, &tmaxamp); if (cminamp == 0 && cmaxamp == 0) { cminamp = tminamp; cmaxamp = tmaxamp; } else if (tminamp != tmaxamp) { cminamp = imax(cminamp, tminamp); cmaxamp = imin(cmaxamp, tmaxamp); } } if (*minamp == *maxamp && cminamp < cmaxamp) { *minamp += cminamp; *maxamp += cmaxamp; } return (found); } /* * Assign OSS names to sound sources */ static void hdaa_audio_assign_names(struct hdaa_devinfo *devinfo) { struct hdaa_audio_as *as = devinfo->as; struct hdaa_widget *w; int i, j; int type = -1, use, used = 0; static const int types[7][13] = { { SOUND_MIXER_LINE, SOUND_MIXER_LINE1, SOUND_MIXER_LINE2, SOUND_MIXER_LINE3, -1 }, /* line */ { SOUND_MIXER_MONITOR, SOUND_MIXER_MIC, -1 }, /* int mic */ { SOUND_MIXER_MIC, SOUND_MIXER_MONITOR, -1 }, /* ext mic */ { SOUND_MIXER_CD, -1 }, /* cd */ { SOUND_MIXER_SPEAKER, -1 }, /* speaker */ { SOUND_MIXER_DIGITAL1, SOUND_MIXER_DIGITAL2, SOUND_MIXER_DIGITAL3, -1 }, /* digital */ { SOUND_MIXER_LINE, SOUND_MIXER_LINE1, SOUND_MIXER_LINE2, SOUND_MIXER_LINE3, SOUND_MIXER_PHONEIN, SOUND_MIXER_PHONEOUT, SOUND_MIXER_VIDEO, SOUND_MIXER_RADIO, SOUND_MIXER_DIGITAL1, SOUND_MIXER_DIGITAL2, SOUND_MIXER_DIGITAL3, SOUND_MIXER_MONITOR, -1 } /* others */ }; /* Surely known names */ for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; if (w->bindas == -1) continue; use = -1; switch (w->type) { case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX: if (as[w->bindas].dir == HDAA_CTL_OUT) break; type = -1; switch (w->wclass.pin.config & HDA_CONFIG_DEFAULTCONF_DEVICE_MASK) { case HDA_CONFIG_DEFAULTCONF_DEVICE_LINE_IN: type = 0; break; case HDA_CONFIG_DEFAULTCONF_DEVICE_MIC_IN: if ((w->wclass.pin.config & HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_MASK) == HDA_CONFIG_DEFAULTCONF_CONNECTIVITY_JACK) break; type = 1; break; case HDA_CONFIG_DEFAULTCONF_DEVICE_CD: type = 3; break; case HDA_CONFIG_DEFAULTCONF_DEVICE_SPEAKER: type = 4; break; case HDA_CONFIG_DEFAULTCONF_DEVICE_SPDIF_IN: case HDA_CONFIG_DEFAULTCONF_DEVICE_DIGITAL_OTHER_IN: type = 5; break; } if (type == -1) break; j = 0; while (types[type][j] >= 0 && (used & (1 << types[type][j])) != 0) { j++; } if (types[type][j] >= 0) use = types[type][j]; break; case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_OUTPUT: use = SOUND_MIXER_PCM; break; case HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_BEEP_WIDGET: use = SOUND_MIXER_SPEAKER; break; default: break; } if (use >= 0) { w->ossdev = use; used |= (1 << use); } } /* Semi-known names */ for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; if (w->ossdev >= 0) continue; if (w->bindas == -1) continue; if (w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) continue; if (as[w->bindas].dir == HDAA_CTL_OUT) continue; type = -1; switch (w->wclass.pin.config & HDA_CONFIG_DEFAULTCONF_DEVICE_MASK) { case HDA_CONFIG_DEFAULTCONF_DEVICE_LINE_OUT: case HDA_CONFIG_DEFAULTCONF_DEVICE_SPEAKER: case HDA_CONFIG_DEFAULTCONF_DEVICE_HP_OUT: case HDA_CONFIG_DEFAULTCONF_DEVICE_AUX: type = 0; break; case HDA_CONFIG_DEFAULTCONF_DEVICE_MIC_IN: type = 2; break; case HDA_CONFIG_DEFAULTCONF_DEVICE_SPDIF_OUT: case HDA_CONFIG_DEFAULTCONF_DEVICE_DIGITAL_OTHER_OUT: type = 5; break; } if (type == -1) break; j = 0; while (types[type][j] >= 0 && (used & (1 << types[type][j])) != 0) { j++; } if (types[type][j] >= 0) { w->ossdev = types[type][j]; used |= (1 << types[type][j]); } } /* Others */ for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; if (w->ossdev >= 0) continue; if (w->bindas == -1) continue; if (w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) continue; if (as[w->bindas].dir == HDAA_CTL_OUT) continue; j = 0; while (types[6][j] >= 0 && (used & (1 << types[6][j])) != 0) { j++; } if (types[6][j] >= 0) { w->ossdev = types[6][j]; used |= (1 << types[6][j]); } } } static void hdaa_audio_build_tree(struct hdaa_devinfo *devinfo) { struct hdaa_audio_as *as = devinfo->as; int j, res; /* Trace all associations in order of their numbers. */ for (j = 0; j < devinfo->ascnt; j++) { if (as[j].enable == 0) continue; HDA_BOOTVERBOSE( device_printf(devinfo->dev, "Tracing association %d (%d)\n", j, as[j].index); ); if (as[j].dir == HDAA_CTL_OUT) { retry: res = hdaa_audio_trace_as_out(devinfo, j, 0); if (res == 0 && as[j].hpredir >= 0 && as[j].fakeredir == 0) { /* If CODEC can't do analog HP redirection try to make it using one more DAC. */ as[j].fakeredir = 1; goto retry; } } else if (as[j].mixed) res = hdaa_audio_trace_as_in(devinfo, j); else res = hdaa_audio_trace_as_in_mch(devinfo, j, 0); if (res) { HDA_BOOTVERBOSE( device_printf(devinfo->dev, "Association %d (%d) trace succeeded\n", j, as[j].index); ); } else { HDA_BOOTVERBOSE( device_printf(devinfo->dev, "Association %d (%d) trace failed\n", j, as[j].index); ); as[j].enable = 0; } } /* Look for additional DACs/ADCs. */ for (j = 0; j < devinfo->ascnt; j++) { if (as[j].enable == 0) continue; hdaa_audio_adddac(devinfo, j); } /* Trace mixer and beeper pseudo associations. */ hdaa_audio_trace_as_extra(devinfo); } /* * Store in pdevinfo new data about whether and how we can control signal * for OSS device to/from specified widget. */ static void hdaa_adjust_amp(struct hdaa_widget *w, int ossdev, int found, int minamp, int maxamp) { struct hdaa_devinfo *devinfo = w->devinfo; struct hdaa_pcm_devinfo *pdevinfo; if (w->bindas >= 0) pdevinfo = devinfo->as[w->bindas].pdevinfo; else pdevinfo = &devinfo->devs[0]; if (found) pdevinfo->ossmask |= (1 << ossdev); if (minamp == 0 && maxamp == 0) return; if (pdevinfo->minamp[ossdev] == 0 && pdevinfo->maxamp[ossdev] == 0) { pdevinfo->minamp[ossdev] = minamp; pdevinfo->maxamp[ossdev] = maxamp; } else { pdevinfo->minamp[ossdev] = imax(pdevinfo->minamp[ossdev], minamp); pdevinfo->maxamp[ossdev] = imin(pdevinfo->maxamp[ossdev], maxamp); } } /* * Trace signals from/to all possible sources/destionstions to find possible * recording sources, OSS device control ranges and to assign controls. */ static void hdaa_audio_assign_mixers(struct hdaa_devinfo *devinfo) { struct hdaa_audio_as *as = devinfo->as; struct hdaa_widget *w, *cw; int i, j, minamp, maxamp, found; /* Assign mixers to the tree. */ for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; minamp = maxamp = 0; if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_OUTPUT || w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_BEEP_WIDGET || (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX && as[w->bindas].dir == HDAA_CTL_IN)) { if (w->ossdev < 0) continue; found = hdaa_audio_ctl_source_amp(devinfo, w->nid, -1, w->ossdev, 1, 0, &minamp, &maxamp); hdaa_adjust_amp(w, w->ossdev, found, minamp, maxamp); } else if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_INPUT) { found = hdaa_audio_ctl_dest_amp(devinfo, w->nid, -1, SOUND_MIXER_RECLEV, 0, &minamp, &maxamp); hdaa_adjust_amp(w, SOUND_MIXER_RECLEV, found, minamp, maxamp); } else if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX && as[w->bindas].dir == HDAA_CTL_OUT) { found = hdaa_audio_ctl_dest_amp(devinfo, w->nid, -1, SOUND_MIXER_VOLUME, 0, &minamp, &maxamp); hdaa_adjust_amp(w, SOUND_MIXER_VOLUME, found, minamp, maxamp); } if (w->ossdev == SOUND_MIXER_IMIX) { minamp = maxamp = 0; found = hdaa_audio_ctl_source_amp(devinfo, w->nid, -1, w->ossdev, 1, 0, &minamp, &maxamp); if (minamp == maxamp) { /* If we are unable to control input monitor as source - try to control it as destination. */ found += hdaa_audio_ctl_dest_amp(devinfo, w->nid, -1, w->ossdev, 0, &minamp, &maxamp); w->pflags |= HDAA_IMIX_AS_DST; } hdaa_adjust_amp(w, w->ossdev, found, minamp, maxamp); } if (w->pflags & HDAA_ADC_MONITOR) { for (j = 0; j < w->nconns; j++) { if (!w->connsenable[j]) continue; cw = hdaa_widget_get(devinfo, w->conns[j]); if (cw == NULL || cw->enable == 0) continue; if (cw->bindas == -1) continue; if (cw->bindas >= 0 && as[cw->bindas].dir != HDAA_CTL_IN) continue; minamp = maxamp = 0; found = hdaa_audio_ctl_dest_amp(devinfo, w->nid, j, SOUND_MIXER_IGAIN, 0, &minamp, &maxamp); hdaa_adjust_amp(w, SOUND_MIXER_IGAIN, found, minamp, maxamp); } } } } static void hdaa_audio_prepare_pin_ctrl(struct hdaa_devinfo *devinfo) { struct hdaa_audio_as *as = devinfo->as; struct hdaa_widget *w; uint32_t pincap; int i; for (i = 0; i < devinfo->nodecnt; i++) { w = &devinfo->widget[i]; if (w == NULL) continue; if (w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX && w->waspin == 0) continue; pincap = w->wclass.pin.cap; /* Disable everything. */ w->wclass.pin.ctrl &= ~( HDA_CMD_SET_PIN_WIDGET_CTRL_HPHN_ENABLE | HDA_CMD_SET_PIN_WIDGET_CTRL_OUT_ENABLE | HDA_CMD_SET_PIN_WIDGET_CTRL_IN_ENABLE | HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE_MASK); if (w->enable == 0) { /* Pin is unused so left it disabled. */ continue; } else if (w->waspin) { /* Enable input for beeper input. */ w->wclass.pin.ctrl |= HDA_CMD_SET_PIN_WIDGET_CTRL_IN_ENABLE; } else if (w->bindas < 0 || as[w->bindas].enable == 0) { /* Pin is unused so left it disabled. */ continue; } else if (as[w->bindas].dir == HDAA_CTL_IN) { /* Input pin, configure for input. */ if (HDA_PARAM_PIN_CAP_INPUT_CAP(pincap)) w->wclass.pin.ctrl |= HDA_CMD_SET_PIN_WIDGET_CTRL_IN_ENABLE; if ((devinfo->quirks & HDAA_QUIRK_IVREF100) && HDA_PARAM_PIN_CAP_VREF_CTRL_100(pincap)) w->wclass.pin.ctrl |= HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE( HDA_CMD_PIN_WIDGET_CTRL_VREF_ENABLE_100); else if ((devinfo->quirks & HDAA_QUIRK_IVREF80) && HDA_PARAM_PIN_CAP_VREF_CTRL_80(pincap)) w->wclass.pin.ctrl |= HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE( HDA_CMD_PIN_WIDGET_CTRL_VREF_ENABLE_80); else if ((devinfo->quirks & HDAA_QUIRK_IVREF50) && HDA_PARAM_PIN_CAP_VREF_CTRL_50(pincap)) w->wclass.pin.ctrl |= HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE( HDA_CMD_PIN_WIDGET_CTRL_VREF_ENABLE_50); } else { /* Output pin, configure for output. */ if (HDA_PARAM_PIN_CAP_OUTPUT_CAP(pincap)) w->wclass.pin.ctrl |= HDA_CMD_SET_PIN_WIDGET_CTRL_OUT_ENABLE; if (HDA_PARAM_PIN_CAP_HEADPHONE_CAP(pincap) && (w->wclass.pin.config & HDA_CONFIG_DEFAULTCONF_DEVICE_MASK) == HDA_CONFIG_DEFAULTCONF_DEVICE_HP_OUT) w->wclass.pin.ctrl |= HDA_CMD_SET_PIN_WIDGET_CTRL_HPHN_ENABLE; if ((devinfo->quirks & HDAA_QUIRK_OVREF100) && HDA_PARAM_PIN_CAP_VREF_CTRL_100(pincap)) w->wclass.pin.ctrl |= HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE( HDA_CMD_PIN_WIDGET_CTRL_VREF_ENABLE_100); else if ((devinfo->quirks & HDAA_QUIRK_OVREF80) && HDA_PARAM_PIN_CAP_VREF_CTRL_80(pincap)) w->wclass.pin.ctrl |= HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE( HDA_CMD_PIN_WIDGET_CTRL_VREF_ENABLE_80); else if ((devinfo->quirks & HDAA_QUIRK_OVREF50) && HDA_PARAM_PIN_CAP_VREF_CTRL_50(pincap)) w->wclass.pin.ctrl |= HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE( HDA_CMD_PIN_WIDGET_CTRL_VREF_ENABLE_50); } } } static void hdaa_audio_ctl_commit(struct hdaa_devinfo *devinfo) { struct hdaa_audio_ctl *ctl; int i, z; i = 0; while ((ctl = hdaa_audio_ctl_each(devinfo, &i)) != NULL) { if (ctl->enable == 0 || ctl->ossmask != 0) { /* Mute disabled and mixer controllable controls. * Last will be initialized by mixer_init(). * This expected to reduce click on startup. */ hdaa_audio_ctl_amp_set(ctl, HDAA_AMP_MUTE_ALL, 0, 0); continue; } /* Init fixed controls to 0dB amplification. */ z = ctl->offset; if (z > ctl->step) z = ctl->step; hdaa_audio_ctl_amp_set(ctl, HDAA_AMP_MUTE_NONE, z, z); } } static void hdaa_gpio_commit(struct hdaa_devinfo *devinfo) { uint32_t gdata, gmask, gdir; int i, numgpio; numgpio = HDA_PARAM_GPIO_COUNT_NUM_GPIO(devinfo->gpio_cap); if (devinfo->gpio != 0 && numgpio != 0) { gdata = hda_command(devinfo->dev, HDA_CMD_GET_GPIO_DATA(0, devinfo->nid)); gmask = hda_command(devinfo->dev, HDA_CMD_GET_GPIO_ENABLE_MASK(0, devinfo->nid)); gdir = hda_command(devinfo->dev, HDA_CMD_GET_GPIO_DIRECTION(0, devinfo->nid)); for (i = 0; i < numgpio; i++) { if ((devinfo->gpio & HDAA_GPIO_MASK(i)) == HDAA_GPIO_SET(i)) { gdata |= (1 << i); gmask |= (1 << i); gdir |= (1 << i); } else if ((devinfo->gpio & HDAA_GPIO_MASK(i)) == HDAA_GPIO_CLEAR(i)) { gdata &= ~(1 << i); gmask |= (1 << i); gdir |= (1 << i); } else if ((devinfo->gpio & HDAA_GPIO_MASK(i)) == HDAA_GPIO_DISABLE(i)) { gmask &= ~(1 << i); } else if ((devinfo->gpio & HDAA_GPIO_MASK(i)) == HDAA_GPIO_INPUT(i)) { gmask |= (1 << i); gdir &= ~(1 << i); } } HDA_BOOTVERBOSE( device_printf(devinfo->dev, "GPIO commit\n"); ); hda_command(devinfo->dev, HDA_CMD_SET_GPIO_ENABLE_MASK(0, devinfo->nid, gmask)); hda_command(devinfo->dev, HDA_CMD_SET_GPIO_DIRECTION(0, devinfo->nid, gdir)); hda_command(devinfo->dev, HDA_CMD_SET_GPIO_DATA(0, devinfo->nid, gdata)); HDA_BOOTVERBOSE( hdaa_dump_gpio(devinfo); ); } } static void hdaa_gpo_commit(struct hdaa_devinfo *devinfo) { uint32_t gdata; int i, numgpo; numgpo = HDA_PARAM_GPIO_COUNT_NUM_GPO(devinfo->gpio_cap); if (devinfo->gpo != 0 && numgpo != 0) { gdata = hda_command(devinfo->dev, HDA_CMD_GET_GPO_DATA(0, devinfo->nid)); for (i = 0; i < numgpo; i++) { if ((devinfo->gpio & HDAA_GPIO_MASK(i)) == HDAA_GPIO_SET(i)) { gdata |= (1 << i); } else if ((devinfo->gpio & HDAA_GPIO_MASK(i)) == HDAA_GPIO_CLEAR(i)) { gdata &= ~(1 << i); } } HDA_BOOTVERBOSE( device_printf(devinfo->dev, "GPO commit\n"); ); hda_command(devinfo->dev, HDA_CMD_SET_GPO_DATA(0, devinfo->nid, gdata)); HDA_BOOTVERBOSE( hdaa_dump_gpo(devinfo); ); } } static void hdaa_audio_commit(struct hdaa_devinfo *devinfo) { struct hdaa_widget *w; int i; /* Commit controls. */ hdaa_audio_ctl_commit(devinfo); /* Commit selectors, pins and EAPD. */ for (i = 0; i < devinfo->nodecnt; i++) { w = &devinfo->widget[i]; if (w == NULL) continue; if (w->selconn == -1) w->selconn = 0; if (w->nconns > 0) hdaa_widget_connection_select(w, w->selconn); if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX || w->waspin) { hda_command(devinfo->dev, HDA_CMD_SET_PIN_WIDGET_CTRL(0, w->nid, w->wclass.pin.ctrl)); } if (w->param.eapdbtl != HDA_INVALID) { uint32_t val; val = w->param.eapdbtl; if (devinfo->quirks & HDAA_QUIRK_EAPDINV) val ^= HDA_CMD_SET_EAPD_BTL_ENABLE_EAPD; hda_command(devinfo->dev, HDA_CMD_SET_EAPD_BTL_ENABLE(0, w->nid, val)); } } hdaa_gpio_commit(devinfo); hdaa_gpo_commit(devinfo); } static void hdaa_powerup(struct hdaa_devinfo *devinfo) { int i; hda_command(devinfo->dev, HDA_CMD_SET_POWER_STATE(0, devinfo->nid, HDA_CMD_POWER_STATE_D0)); DELAY(100); for (i = devinfo->startnode; i < devinfo->endnode; i++) { hda_command(devinfo->dev, HDA_CMD_SET_POWER_STATE(0, i, HDA_CMD_POWER_STATE_D0)); } DELAY(1000); } static int hdaa_pcmchannel_setup(struct hdaa_chan *ch) { struct hdaa_devinfo *devinfo = ch->devinfo; struct hdaa_audio_as *as = devinfo->as; struct hdaa_widget *w; uint32_t cap, fmtcap, pcmcap; int i, j, ret, channels, onlystereo; uint16_t pinset; ch->caps = hdaa_caps; ch->caps.fmtlist = ch->fmtlist; ch->bit16 = 1; ch->bit32 = 0; ch->pcmrates[0] = 48000; ch->pcmrates[1] = 0; ch->stripecap = 0xff; ret = 0; channels = 0; onlystereo = 1; pinset = 0; fmtcap = devinfo->supp_stream_formats; pcmcap = devinfo->supp_pcm_size_rate; for (i = 0; i < 16; i++) { /* Check as is correct */ if (ch->as < 0) break; /* Cound only present DACs */ if (as[ch->as].dacs[ch->asindex][i] <= 0) continue; /* Ignore duplicates */ for (j = 0; j < ret; j++) { if (ch->io[j] == as[ch->as].dacs[ch->asindex][i]) break; } if (j < ret) continue; w = hdaa_widget_get(devinfo, as[ch->as].dacs[ch->asindex][i]); if (w == NULL || w->enable == 0) continue; cap = w->param.supp_stream_formats; if (!HDA_PARAM_SUPP_STREAM_FORMATS_PCM(cap) && !HDA_PARAM_SUPP_STREAM_FORMATS_AC3(cap)) continue; /* Many CODECs does not declare AC3 support on SPDIF. I don't beleave that they doesn't support it! */ if (HDA_PARAM_AUDIO_WIDGET_CAP_DIGITAL(w->param.widget_cap)) cap |= HDA_PARAM_SUPP_STREAM_FORMATS_AC3_MASK; if (ret == 0) { fmtcap = cap; pcmcap = w->param.supp_pcm_size_rate; } else { fmtcap &= cap; pcmcap &= w->param.supp_pcm_size_rate; } ch->io[ret++] = as[ch->as].dacs[ch->asindex][i]; ch->stripecap &= w->wclass.conv.stripecap; /* Do not count redirection pin/dac channels. */ if (i == 15 && as[ch->as].hpredir >= 0) continue; channels += HDA_PARAM_AUDIO_WIDGET_CAP_CC(w->param.widget_cap) + 1; if (HDA_PARAM_AUDIO_WIDGET_CAP_CC(w->param.widget_cap) != 1) onlystereo = 0; pinset |= (1 << i); } ch->io[ret] = -1; ch->channels = channels; if (as[ch->as].fakeredir) ret--; /* Standard speaks only about stereo pins and playback, ... */ if ((!onlystereo) || as[ch->as].mixed) pinset = 0; /* ..., but there it gives us info about speakers layout. */ as[ch->as].pinset = pinset; ch->supp_stream_formats = fmtcap; ch->supp_pcm_size_rate = pcmcap; /* * 8bit = 0 * 16bit = 1 * 20bit = 2 * 24bit = 3 * 32bit = 4 */ if (ret > 0) { i = 0; if (HDA_PARAM_SUPP_STREAM_FORMATS_PCM(fmtcap)) { if (HDA_PARAM_SUPP_PCM_SIZE_RATE_16BIT(pcmcap)) ch->bit16 = 1; else if (HDA_PARAM_SUPP_PCM_SIZE_RATE_8BIT(pcmcap)) ch->bit16 = 0; if (HDA_PARAM_SUPP_PCM_SIZE_RATE_24BIT(pcmcap)) ch->bit32 = 3; else if (HDA_PARAM_SUPP_PCM_SIZE_RATE_20BIT(pcmcap)) ch->bit32 = 2; else if (HDA_PARAM_SUPP_PCM_SIZE_RATE_32BIT(pcmcap)) ch->bit32 = 4; if (!(devinfo->quirks & HDAA_QUIRK_FORCESTEREO)) { ch->fmtlist[i++] = SND_FORMAT(AFMT_S16_LE, 1, 0); if (ch->bit32) ch->fmtlist[i++] = SND_FORMAT(AFMT_S32_LE, 1, 0); } if (channels >= 2) { ch->fmtlist[i++] = SND_FORMAT(AFMT_S16_LE, 2, 0); if (ch->bit32) ch->fmtlist[i++] = SND_FORMAT(AFMT_S32_LE, 2, 0); } if (channels >= 3 && !onlystereo) { ch->fmtlist[i++] = SND_FORMAT(AFMT_S16_LE, 3, 0); if (ch->bit32) ch->fmtlist[i++] = SND_FORMAT(AFMT_S32_LE, 3, 0); ch->fmtlist[i++] = SND_FORMAT(AFMT_S16_LE, 3, 1); if (ch->bit32) ch->fmtlist[i++] = SND_FORMAT(AFMT_S32_LE, 3, 1); } if (channels >= 4) { ch->fmtlist[i++] = SND_FORMAT(AFMT_S16_LE, 4, 0); if (ch->bit32) ch->fmtlist[i++] = SND_FORMAT(AFMT_S32_LE, 4, 0); if (!onlystereo) { ch->fmtlist[i++] = SND_FORMAT(AFMT_S16_LE, 4, 1); if (ch->bit32) ch->fmtlist[i++] = SND_FORMAT(AFMT_S32_LE, 4, 1); } } if (channels >= 5 && !onlystereo) { ch->fmtlist[i++] = SND_FORMAT(AFMT_S16_LE, 5, 0); if (ch->bit32) ch->fmtlist[i++] = SND_FORMAT(AFMT_S32_LE, 5, 0); ch->fmtlist[i++] = SND_FORMAT(AFMT_S16_LE, 5, 1); if (ch->bit32) ch->fmtlist[i++] = SND_FORMAT(AFMT_S32_LE, 5, 1); } if (channels >= 6) { ch->fmtlist[i++] = SND_FORMAT(AFMT_S16_LE, 6, 1); if (ch->bit32) ch->fmtlist[i++] = SND_FORMAT(AFMT_S32_LE, 6, 1); if (!onlystereo) { ch->fmtlist[i++] = SND_FORMAT(AFMT_S16_LE, 6, 0); if (ch->bit32) ch->fmtlist[i++] = SND_FORMAT(AFMT_S32_LE, 6, 0); } } if (channels >= 7 && !onlystereo) { ch->fmtlist[i++] = SND_FORMAT(AFMT_S16_LE, 7, 0); if (ch->bit32) ch->fmtlist[i++] = SND_FORMAT(AFMT_S32_LE, 7, 0); ch->fmtlist[i++] = SND_FORMAT(AFMT_S16_LE, 7, 1); if (ch->bit32) ch->fmtlist[i++] = SND_FORMAT(AFMT_S32_LE, 7, 1); } if (channels >= 8) { ch->fmtlist[i++] = SND_FORMAT(AFMT_S16_LE, 8, 1); if (ch->bit32) ch->fmtlist[i++] = SND_FORMAT(AFMT_S32_LE, 8, 1); } } if (HDA_PARAM_SUPP_STREAM_FORMATS_AC3(fmtcap)) { ch->fmtlist[i++] = SND_FORMAT(AFMT_AC3, 2, 0); if (channels >= 8) { ch->fmtlist[i++] = SND_FORMAT(AFMT_AC3, 8, 0); ch->fmtlist[i++] = SND_FORMAT(AFMT_AC3, 8, 1); } } ch->fmtlist[i] = 0; i = 0; if (HDA_PARAM_SUPP_PCM_SIZE_RATE_8KHZ(pcmcap)) ch->pcmrates[i++] = 8000; if (HDA_PARAM_SUPP_PCM_SIZE_RATE_11KHZ(pcmcap)) ch->pcmrates[i++] = 11025; if (HDA_PARAM_SUPP_PCM_SIZE_RATE_16KHZ(pcmcap)) ch->pcmrates[i++] = 16000; if (HDA_PARAM_SUPP_PCM_SIZE_RATE_22KHZ(pcmcap)) ch->pcmrates[i++] = 22050; if (HDA_PARAM_SUPP_PCM_SIZE_RATE_32KHZ(pcmcap)) ch->pcmrates[i++] = 32000; if (HDA_PARAM_SUPP_PCM_SIZE_RATE_44KHZ(pcmcap)) ch->pcmrates[i++] = 44100; /* if (HDA_PARAM_SUPP_PCM_SIZE_RATE_48KHZ(pcmcap)) */ ch->pcmrates[i++] = 48000; if (HDA_PARAM_SUPP_PCM_SIZE_RATE_88KHZ(pcmcap)) ch->pcmrates[i++] = 88200; if (HDA_PARAM_SUPP_PCM_SIZE_RATE_96KHZ(pcmcap)) ch->pcmrates[i++] = 96000; if (HDA_PARAM_SUPP_PCM_SIZE_RATE_176KHZ(pcmcap)) ch->pcmrates[i++] = 176400; if (HDA_PARAM_SUPP_PCM_SIZE_RATE_192KHZ(pcmcap)) ch->pcmrates[i++] = 192000; /* if (HDA_PARAM_SUPP_PCM_SIZE_RATE_384KHZ(pcmcap)) */ ch->pcmrates[i] = 0; if (i > 0) { ch->caps.minspeed = ch->pcmrates[0]; ch->caps.maxspeed = ch->pcmrates[i - 1]; } } return (ret); } static void hdaa_prepare_pcms(struct hdaa_devinfo *devinfo) { struct hdaa_audio_as *as = devinfo->as; int i, j, k, apdev = 0, ardev = 0, dpdev = 0, drdev = 0; for (i = 0; i < devinfo->ascnt; i++) { if (as[i].enable == 0) continue; if (as[i].dir == HDAA_CTL_IN) { if (as[i].digital) drdev++; else ardev++; } else { if (as[i].digital) dpdev++; else apdev++; } } devinfo->num_devs = max(ardev, apdev) + max(drdev, dpdev); devinfo->devs = (struct hdaa_pcm_devinfo *)malloc( devinfo->num_devs * sizeof(struct hdaa_pcm_devinfo), M_HDAA, M_ZERO | M_NOWAIT); if (devinfo->devs == NULL) { device_printf(devinfo->dev, "Unable to allocate memory for devices\n"); return; } for (i = 0; i < devinfo->num_devs; i++) { devinfo->devs[i].index = i; devinfo->devs[i].devinfo = devinfo; devinfo->devs[i].playas = -1; devinfo->devs[i].recas = -1; devinfo->devs[i].digital = 255; } for (i = 0; i < devinfo->ascnt; i++) { if (as[i].enable == 0) continue; for (j = 0; j < devinfo->num_devs; j++) { if (devinfo->devs[j].digital != 255 && (!devinfo->devs[j].digital) != (!as[i].digital)) continue; if (as[i].dir == HDAA_CTL_IN) { if (devinfo->devs[j].recas >= 0) continue; devinfo->devs[j].recas = i; } else { if (devinfo->devs[j].playas >= 0) continue; devinfo->devs[j].playas = i; } as[i].pdevinfo = &devinfo->devs[j]; for (k = 0; k < as[i].num_chans; k++) { devinfo->chans[as[i].chans[k]].pdevinfo = &devinfo->devs[j]; } devinfo->devs[j].digital = as[i].digital; break; } } } static void hdaa_create_pcms(struct hdaa_devinfo *devinfo) { int i; for (i = 0; i < devinfo->num_devs; i++) { struct hdaa_pcm_devinfo *pdevinfo = &devinfo->devs[i]; pdevinfo->dev = device_add_child(devinfo->dev, "pcm", -1); device_set_ivars(pdevinfo->dev, (void *)pdevinfo); } } static void hdaa_dump_ctls(struct hdaa_pcm_devinfo *pdevinfo, const char *banner, uint32_t flag) { struct hdaa_devinfo *devinfo = pdevinfo->devinfo; struct hdaa_audio_ctl *ctl; char buf[64]; int i, j, printed = 0; if (flag == 0) { flag = ~(SOUND_MASK_VOLUME | SOUND_MASK_PCM | SOUND_MASK_CD | SOUND_MASK_LINE | SOUND_MASK_RECLEV | SOUND_MASK_MIC | SOUND_MASK_SPEAKER | SOUND_MASK_IGAIN | SOUND_MASK_OGAIN | SOUND_MASK_IMIX | SOUND_MASK_MONITOR); } for (j = 0; j < SOUND_MIXER_NRDEVICES; j++) { if ((flag & (1 << j)) == 0) continue; i = 0; printed = 0; while ((ctl = hdaa_audio_ctl_each(devinfo, &i)) != NULL) { if (ctl->enable == 0 || ctl->widget->enable == 0) continue; if (!((pdevinfo->playas >= 0 && ctl->widget->bindas == pdevinfo->playas) || (pdevinfo->recas >= 0 && ctl->widget->bindas == pdevinfo->recas) || (ctl->widget->bindas == -2 && pdevinfo->index == 0))) continue; if ((ctl->ossmask & (1 << j)) == 0) continue; if (printed == 0) { if (banner != NULL) { device_printf(pdevinfo->dev, "%s", banner); } else { device_printf(pdevinfo->dev, "Unknown Ctl"); } printf(" (OSS: %s)", hdaa_audio_ctl_ossmixer_mask2allname(1 << j, buf, sizeof(buf))); if (pdevinfo->ossmask & (1 << j)) { printf(": %+d/%+ddB\n", pdevinfo->minamp[j] / 4, pdevinfo->maxamp[j] / 4); } else printf("\n"); printed = 1; } device_printf(pdevinfo->dev, " +- ctl %2d (nid %3d %s", i, ctl->widget->nid, (ctl->ndir == HDAA_CTL_IN)?"in ":"out"); if (ctl->ndir == HDAA_CTL_IN && ctl->ndir == ctl->dir) printf(" %2d): ", ctl->index); else printf("): "); if (ctl->step > 0) { printf("%+d/%+ddB (%d steps)%s\n", MINQDB(ctl) / 4, MAXQDB(ctl) / 4, ctl->step + 1, ctl->mute?" + mute":""); } else printf("%s\n", ctl->mute?"mute":""); } } if (printed) device_printf(pdevinfo->dev, "\n"); } static void hdaa_dump_audio_formats(device_t dev, uint32_t fcap, uint32_t pcmcap) { uint32_t cap; cap = fcap; if (cap != 0) { device_printf(dev, " Stream cap: 0x%08x", cap); if (HDA_PARAM_SUPP_STREAM_FORMATS_AC3(cap)) printf(" AC3"); if (HDA_PARAM_SUPP_STREAM_FORMATS_FLOAT32(cap)) printf(" FLOAT32"); if (HDA_PARAM_SUPP_STREAM_FORMATS_PCM(cap)) printf(" PCM"); printf("\n"); } cap = pcmcap; if (cap != 0) { device_printf(dev, " PCM cap: 0x%08x", cap); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_8BIT(cap)) printf(" 8"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_16BIT(cap)) printf(" 16"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_20BIT(cap)) printf(" 20"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_24BIT(cap)) printf(" 24"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_32BIT(cap)) printf(" 32"); printf(" bits,"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_8KHZ(cap)) printf(" 8"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_11KHZ(cap)) printf(" 11"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_16KHZ(cap)) printf(" 16"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_22KHZ(cap)) printf(" 22"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_32KHZ(cap)) printf(" 32"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_44KHZ(cap)) printf(" 44"); printf(" 48"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_88KHZ(cap)) printf(" 88"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_96KHZ(cap)) printf(" 96"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_176KHZ(cap)) printf(" 176"); if (HDA_PARAM_SUPP_PCM_SIZE_RATE_192KHZ(cap)) printf(" 192"); printf(" KHz\n"); } } static void hdaa_dump_pin(struct hdaa_widget *w) { uint32_t pincap; pincap = w->wclass.pin.cap; device_printf(w->devinfo->dev, " Pin cap: 0x%08x", pincap); if (HDA_PARAM_PIN_CAP_IMP_SENSE_CAP(pincap)) printf(" ISC"); if (HDA_PARAM_PIN_CAP_TRIGGER_REQD(pincap)) printf(" TRQD"); if (HDA_PARAM_PIN_CAP_PRESENCE_DETECT_CAP(pincap)) printf(" PDC"); if (HDA_PARAM_PIN_CAP_HEADPHONE_CAP(pincap)) printf(" HP"); if (HDA_PARAM_PIN_CAP_OUTPUT_CAP(pincap)) printf(" OUT"); if (HDA_PARAM_PIN_CAP_INPUT_CAP(pincap)) printf(" IN"); if (HDA_PARAM_PIN_CAP_BALANCED_IO_PINS(pincap)) printf(" BAL"); if (HDA_PARAM_PIN_CAP_HDMI(pincap)) printf(" HDMI"); if (HDA_PARAM_PIN_CAP_VREF_CTRL(pincap)) { printf(" VREF["); if (HDA_PARAM_PIN_CAP_VREF_CTRL_50(pincap)) printf(" 50"); if (HDA_PARAM_PIN_CAP_VREF_CTRL_80(pincap)) printf(" 80"); if (HDA_PARAM_PIN_CAP_VREF_CTRL_100(pincap)) printf(" 100"); if (HDA_PARAM_PIN_CAP_VREF_CTRL_GROUND(pincap)) printf(" GROUND"); if (HDA_PARAM_PIN_CAP_VREF_CTRL_HIZ(pincap)) printf(" HIZ"); printf(" ]"); } if (HDA_PARAM_PIN_CAP_EAPD_CAP(pincap)) printf(" EAPD"); if (HDA_PARAM_PIN_CAP_DP(pincap)) printf(" DP"); if (HDA_PARAM_PIN_CAP_HBR(pincap)) printf(" HBR"); printf("\n"); device_printf(w->devinfo->dev, " Pin config: 0x%08x\n", w->wclass.pin.config); device_printf(w->devinfo->dev, " Pin control: 0x%08x", w->wclass.pin.ctrl); if (w->wclass.pin.ctrl & HDA_CMD_SET_PIN_WIDGET_CTRL_HPHN_ENABLE) printf(" HP"); if (w->wclass.pin.ctrl & HDA_CMD_SET_PIN_WIDGET_CTRL_IN_ENABLE) printf(" IN"); if (w->wclass.pin.ctrl & HDA_CMD_SET_PIN_WIDGET_CTRL_OUT_ENABLE) printf(" OUT"); if (HDA_PARAM_AUDIO_WIDGET_CAP_DIGITAL(w->param.widget_cap)) { if ((w->wclass.pin.ctrl & HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE_MASK) == 0x03) printf(" HBR"); else if ((w->wclass.pin.ctrl & HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE_MASK) != 0) printf(" EPTs"); } else { if ((w->wclass.pin.ctrl & HDA_CMD_SET_PIN_WIDGET_CTRL_VREF_ENABLE_MASK) != 0) printf(" VREFs"); } printf("\n"); } static void hdaa_dump_pin_config(struct hdaa_widget *w, uint32_t conf) { device_printf(w->devinfo->dev, "%2d %08x %-2d %-2d " "%-13s %-5s %-7s %-10s %-7s %d%s\n", w->nid, conf, HDA_CONFIG_DEFAULTCONF_ASSOCIATION(conf), HDA_CONFIG_DEFAULTCONF_SEQUENCE(conf), HDA_DEVS[HDA_CONFIG_DEFAULTCONF_DEVICE(conf)], HDA_CONNS[HDA_CONFIG_DEFAULTCONF_CONNECTIVITY(conf)], HDA_CONNECTORS[HDA_CONFIG_DEFAULTCONF_CONNECTION_TYPE(conf)], HDA_LOCS[HDA_CONFIG_DEFAULTCONF_LOCATION(conf)], HDA_COLORS[HDA_CONFIG_DEFAULTCONF_COLOR(conf)], HDA_CONFIG_DEFAULTCONF_MISC(conf), (w->enable == 0)?" DISA":""); } static void hdaa_dump_pin_configs(struct hdaa_devinfo *devinfo) { struct hdaa_widget *w; int i; device_printf(devinfo->dev, "nid 0x as seq " "device conn jack loc color misc\n"); for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL) continue; if (w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) continue; hdaa_dump_pin_config(w, w->wclass.pin.config); } } static void hdaa_dump_amp(device_t dev, uint32_t cap, const char *banner) { int offset, size, step; offset = HDA_PARAM_OUTPUT_AMP_CAP_OFFSET(cap); size = HDA_PARAM_OUTPUT_AMP_CAP_STEPSIZE(cap); step = HDA_PARAM_OUTPUT_AMP_CAP_NUMSTEPS(cap); device_printf(dev, " %s amp: 0x%08x " "mute=%d step=%d size=%d offset=%d (%+d/%+ddB)\n", banner, cap, HDA_PARAM_OUTPUT_AMP_CAP_MUTE_CAP(cap), step, size, offset, ((0 - offset) * (size + 1)) / 4, ((step - offset) * (size + 1)) / 4); } static void hdaa_dump_nodes(struct hdaa_devinfo *devinfo) { struct hdaa_widget *w, *cw; char buf[64]; int i, j; device_printf(devinfo->dev, "\n"); device_printf(devinfo->dev, "Default parameters:\n"); hdaa_dump_audio_formats(devinfo->dev, devinfo->supp_stream_formats, devinfo->supp_pcm_size_rate); hdaa_dump_amp(devinfo->dev, devinfo->inamp_cap, " Input"); hdaa_dump_amp(devinfo->dev, devinfo->outamp_cap, "Output"); for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL) { device_printf(devinfo->dev, "Ghost widget nid=%d\n", i); continue; } device_printf(devinfo->dev, "\n"); device_printf(devinfo->dev, " nid: %d%s\n", w->nid, (w->enable == 0) ? " [DISABLED]" : ""); device_printf(devinfo->dev, " Name: %s\n", w->name); device_printf(devinfo->dev, " Widget cap: 0x%08x", w->param.widget_cap); if (w->param.widget_cap & 0x0ee1) { if (HDA_PARAM_AUDIO_WIDGET_CAP_LR_SWAP(w->param.widget_cap)) printf(" LRSWAP"); if (HDA_PARAM_AUDIO_WIDGET_CAP_POWER_CTRL(w->param.widget_cap)) printf(" PWR"); if (HDA_PARAM_AUDIO_WIDGET_CAP_DIGITAL(w->param.widget_cap)) printf(" DIGITAL"); if (HDA_PARAM_AUDIO_WIDGET_CAP_UNSOL_CAP(w->param.widget_cap)) printf(" UNSOL"); if (HDA_PARAM_AUDIO_WIDGET_CAP_PROC_WIDGET(w->param.widget_cap)) printf(" PROC"); if (HDA_PARAM_AUDIO_WIDGET_CAP_STRIPE(w->param.widget_cap)) printf(" STRIPE(x%d)", 1 << (fls(w->wclass.conv.stripecap) - 1)); j = HDA_PARAM_AUDIO_WIDGET_CAP_CC(w->param.widget_cap); if (j == 1) printf(" STEREO"); else if (j > 1) printf(" %dCH", j + 1); } printf("\n"); if (w->bindas != -1) { device_printf(devinfo->dev, " Association: %d (0x%04x)\n", w->bindas, w->bindseqmask); } if (w->ossmask != 0 || w->ossdev >= 0) { device_printf(devinfo->dev, " OSS: %s", hdaa_audio_ctl_ossmixer_mask2allname(w->ossmask, buf, sizeof(buf))); if (w->ossdev >= 0) printf(" (%s)", ossnames[w->ossdev]); printf("\n"); } if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_OUTPUT || w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_INPUT) { hdaa_dump_audio_formats(devinfo->dev, w->param.supp_stream_formats, w->param.supp_pcm_size_rate); } else if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX || w->waspin) hdaa_dump_pin(w); if (w->param.eapdbtl != HDA_INVALID) device_printf(devinfo->dev, " EAPD: 0x%08x\n", w->param.eapdbtl); if (HDA_PARAM_AUDIO_WIDGET_CAP_OUT_AMP(w->param.widget_cap) && w->param.outamp_cap != 0) hdaa_dump_amp(devinfo->dev, w->param.outamp_cap, "Output"); if (HDA_PARAM_AUDIO_WIDGET_CAP_IN_AMP(w->param.widget_cap) && w->param.inamp_cap != 0) hdaa_dump_amp(devinfo->dev, w->param.inamp_cap, " Input"); if (w->nconns > 0) device_printf(devinfo->dev, " Connections: %d\n", w->nconns); for (j = 0; j < w->nconns; j++) { cw = hdaa_widget_get(devinfo, w->conns[j]); device_printf(devinfo->dev, " + %s<- nid=%d [%s]", (w->connsenable[j] == 0)?"[DISABLED] ":"", w->conns[j], (cw == NULL) ? "GHOST!" : cw->name); if (cw == NULL) printf(" [UNKNOWN]"); else if (cw->enable == 0) printf(" [DISABLED]"); if (w->nconns > 1 && w->selconn == j && w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_MIXER) printf(" (selected)"); printf("\n"); } } } static void hdaa_dump_dst_nid(struct hdaa_pcm_devinfo *pdevinfo, nid_t nid, int depth) { struct hdaa_devinfo *devinfo = pdevinfo->devinfo; struct hdaa_widget *w, *cw; char buf[64]; int i; if (depth > HDA_PARSE_MAXDEPTH) return; w = hdaa_widget_get(devinfo, nid); if (w == NULL || w->enable == 0) return; if (depth == 0) device_printf(pdevinfo->dev, "%*s", 4, ""); else device_printf(pdevinfo->dev, "%*s + <- ", 4 + (depth - 1) * 7, ""); printf("nid=%d [%s]", w->nid, w->name); if (depth > 0) { if (w->ossmask == 0) { printf("\n"); return; } printf(" [src: %s]", hdaa_audio_ctl_ossmixer_mask2allname( w->ossmask, buf, sizeof(buf))); if (w->ossdev >= 0) { printf("\n"); return; } } printf("\n"); for (i = 0; i < w->nconns; i++) { if (w->connsenable[i] == 0) continue; cw = hdaa_widget_get(devinfo, w->conns[i]); if (cw == NULL || cw->enable == 0 || cw->bindas == -1) continue; hdaa_dump_dst_nid(pdevinfo, w->conns[i], depth + 1); } } static void hdaa_dump_dac(struct hdaa_pcm_devinfo *pdevinfo) { struct hdaa_devinfo *devinfo = pdevinfo->devinfo; struct hdaa_audio_as *as; struct hdaa_widget *w; nid_t *nids; int chid, i; if (pdevinfo->playas < 0) return; device_printf(pdevinfo->dev, "Playback:\n"); chid = devinfo->as[pdevinfo->playas].chans[0]; hdaa_dump_audio_formats(pdevinfo->dev, devinfo->chans[chid].supp_stream_formats, devinfo->chans[chid].supp_pcm_size_rate); for (i = 0; i < devinfo->as[pdevinfo->playas].num_chans; i++) { chid = devinfo->as[pdevinfo->playas].chans[i]; device_printf(pdevinfo->dev, " DAC:"); for (nids = devinfo->chans[chid].io; *nids != -1; nids++) printf(" %d", *nids); printf("\n"); } as = &devinfo->as[pdevinfo->playas]; for (i = 0; i < 16; i++) { if (as->pins[i] <= 0) continue; w = hdaa_widget_get(devinfo, as->pins[i]); if (w == NULL || w->enable == 0) continue; device_printf(pdevinfo->dev, "\n"); hdaa_dump_dst_nid(pdevinfo, as->pins[i], 0); } device_printf(pdevinfo->dev, "\n"); } static void hdaa_dump_adc(struct hdaa_pcm_devinfo *pdevinfo) { struct hdaa_devinfo *devinfo = pdevinfo->devinfo; struct hdaa_widget *w; nid_t *nids; int chid, i; if (pdevinfo->recas < 0) return; device_printf(pdevinfo->dev, "Record:\n"); chid = devinfo->as[pdevinfo->recas].chans[0]; hdaa_dump_audio_formats(pdevinfo->dev, devinfo->chans[chid].supp_stream_formats, devinfo->chans[chid].supp_pcm_size_rate); for (i = 0; i < devinfo->as[pdevinfo->recas].num_chans; i++) { chid = devinfo->as[pdevinfo->recas].chans[i]; device_printf(pdevinfo->dev, " ADC:"); for (nids = devinfo->chans[chid].io; *nids != -1; nids++) printf(" %d", *nids); printf("\n"); } for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; if (w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_AUDIO_INPUT) continue; if (w->bindas != pdevinfo->recas) continue; device_printf(pdevinfo->dev, "\n"); hdaa_dump_dst_nid(pdevinfo, i, 0); } device_printf(pdevinfo->dev, "\n"); } static void hdaa_dump_mix(struct hdaa_pcm_devinfo *pdevinfo) { struct hdaa_devinfo *devinfo = pdevinfo->devinfo; struct hdaa_widget *w; int i; int printed = 0; for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0) continue; if (w->ossdev != SOUND_MIXER_IMIX) continue; if (w->bindas != pdevinfo->recas) continue; if (printed == 0) { printed = 1; device_printf(pdevinfo->dev, "Input Mix:\n"); } device_printf(pdevinfo->dev, "\n"); hdaa_dump_dst_nid(pdevinfo, i, 0); } if (printed) device_printf(pdevinfo->dev, "\n"); } static void hdaa_pindump(device_t dev) { struct hdaa_devinfo *devinfo = device_get_softc(dev); struct hdaa_widget *w; uint32_t res, pincap, delay; int i; device_printf(dev, "Dumping AFG pins:\n"); device_printf(dev, "nid 0x as seq " "device conn jack loc color misc\n"); for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) continue; hdaa_dump_pin_config(w, w->wclass.pin.config); pincap = w->wclass.pin.cap; device_printf(dev, " Caps: %2s %3s %2s %4s %4s", HDA_PARAM_PIN_CAP_INPUT_CAP(pincap)?"IN":"", HDA_PARAM_PIN_CAP_OUTPUT_CAP(pincap)?"OUT":"", HDA_PARAM_PIN_CAP_HEADPHONE_CAP(pincap)?"HP":"", HDA_PARAM_PIN_CAP_EAPD_CAP(pincap)?"EAPD":"", HDA_PARAM_PIN_CAP_VREF_CTRL(pincap)?"VREF":""); if (HDA_PARAM_PIN_CAP_IMP_SENSE_CAP(pincap) || HDA_PARAM_PIN_CAP_PRESENCE_DETECT_CAP(pincap)) { if (HDA_PARAM_PIN_CAP_TRIGGER_REQD(pincap)) { delay = 0; hda_command(dev, HDA_CMD_SET_PIN_SENSE(0, w->nid, 0)); do { res = hda_command(dev, HDA_CMD_GET_PIN_SENSE(0, w->nid)); if (res != 0x7fffffff && res != 0xffffffff) break; DELAY(10); } while (++delay < 10000); } else { delay = 0; res = hda_command(dev, HDA_CMD_GET_PIN_SENSE(0, w->nid)); } printf(" Sense: 0x%08x (%sconnected%s)", res, (res & HDA_CMD_GET_PIN_SENSE_PRESENCE_DETECT) ? "" : "dis", (HDA_PARAM_AUDIO_WIDGET_CAP_DIGITAL(w->param.widget_cap) && (res & HDA_CMD_GET_PIN_SENSE_ELD_VALID)) ? ", ELD valid" : ""); if (delay > 0) printf(" delay %dus", delay * 10); } printf("\n"); } device_printf(dev, "NumGPIO=%d NumGPO=%d NumGPI=%d GPIWake=%d GPIUnsol=%d\n", HDA_PARAM_GPIO_COUNT_NUM_GPIO(devinfo->gpio_cap), HDA_PARAM_GPIO_COUNT_NUM_GPO(devinfo->gpio_cap), HDA_PARAM_GPIO_COUNT_NUM_GPI(devinfo->gpio_cap), HDA_PARAM_GPIO_COUNT_GPI_WAKE(devinfo->gpio_cap), HDA_PARAM_GPIO_COUNT_GPI_UNSOL(devinfo->gpio_cap)); hdaa_dump_gpi(devinfo); hdaa_dump_gpio(devinfo); hdaa_dump_gpo(devinfo); } static void hdaa_configure(device_t dev) { struct hdaa_devinfo *devinfo = device_get_softc(dev); struct hdaa_audio_ctl *ctl; int i; HDA_BOOTHVERBOSE( device_printf(dev, "Applying built-in patches...\n"); ); hdaa_patch(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Applying local patches...\n"); ); hdaa_local_patch(devinfo); hdaa_audio_postprocess(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Parsing Ctls...\n"); ); hdaa_audio_ctl_parse(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Disabling nonaudio...\n"); ); hdaa_audio_disable_nonaudio(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Disabling useless...\n"); ); hdaa_audio_disable_useless(devinfo); HDA_BOOTVERBOSE( device_printf(dev, "Patched pins configuration:\n"); hdaa_dump_pin_configs(devinfo); ); HDA_BOOTHVERBOSE( device_printf(dev, "Parsing pin associations...\n"); ); hdaa_audio_as_parse(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Building AFG tree...\n"); ); hdaa_audio_build_tree(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Disabling unassociated " "widgets...\n"); ); hdaa_audio_disable_unas(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Disabling nonselected " "inputs...\n"); ); hdaa_audio_disable_notselected(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Disabling useless...\n"); ); hdaa_audio_disable_useless(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Disabling " "crossassociatement connections...\n"); ); hdaa_audio_disable_crossas(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Disabling useless...\n"); ); hdaa_audio_disable_useless(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Binding associations to channels...\n"); ); hdaa_audio_bind_as(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Assigning names to signal sources...\n"); ); hdaa_audio_assign_names(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Preparing PCM devices...\n"); ); hdaa_prepare_pcms(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Assigning mixers to the tree...\n"); ); hdaa_audio_assign_mixers(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Preparing pin controls...\n"); ); hdaa_audio_prepare_pin_ctrl(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "AFG commit...\n"); ); hdaa_audio_commit(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Applying direct built-in patches...\n"); ); hdaa_patch_direct(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Pin sense init...\n"); ); hdaa_sense_init(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Creating PCM devices...\n"); ); hdaa_create_pcms(devinfo); HDA_BOOTVERBOSE( if (devinfo->quirks != 0) { device_printf(dev, "FG config/quirks:"); for (i = 0; i < nitems(hdaa_quirks_tab); i++) { if ((devinfo->quirks & hdaa_quirks_tab[i].value) == hdaa_quirks_tab[i].value) printf(" %s", hdaa_quirks_tab[i].key); } printf("\n"); } ); HDA_BOOTHVERBOSE( device_printf(dev, "\n"); device_printf(dev, "+-----------+\n"); device_printf(dev, "| HDA NODES |\n"); device_printf(dev, "+-----------+\n"); hdaa_dump_nodes(devinfo); device_printf(dev, "\n"); device_printf(dev, "+----------------+\n"); device_printf(dev, "| HDA AMPLIFIERS |\n"); device_printf(dev, "+----------------+\n"); device_printf(dev, "\n"); i = 0; while ((ctl = hdaa_audio_ctl_each(devinfo, &i)) != NULL) { device_printf(dev, "%3d: nid %3d %s (%s) index %d", i, (ctl->widget != NULL) ? ctl->widget->nid : -1, (ctl->ndir == HDAA_CTL_IN)?"in ":"out", (ctl->dir == HDAA_CTL_IN)?"in ":"out", ctl->index); if (ctl->childwidget != NULL) printf(" cnid %3d", ctl->childwidget->nid); else printf(" "); printf(" ossmask=0x%08x\n", ctl->ossmask); device_printf(dev, " mute: %d step: %3d size: %3d off: %3d%s\n", ctl->mute, ctl->step, ctl->size, ctl->offset, (ctl->enable == 0) ? " [DISABLED]" : ((ctl->ossmask == 0) ? " [UNUSED]" : "")); } device_printf(dev, "\n"); ); } static void hdaa_unconfigure(device_t dev) { struct hdaa_devinfo *devinfo = device_get_softc(dev); struct hdaa_widget *w; int i, j; HDA_BOOTHVERBOSE( device_printf(dev, "Pin sense deinit...\n"); ); hdaa_sense_deinit(devinfo); free(devinfo->ctl, M_HDAA); devinfo->ctl = NULL; devinfo->ctlcnt = 0; free(devinfo->as, M_HDAA); devinfo->as = NULL; devinfo->ascnt = 0; free(devinfo->devs, M_HDAA); devinfo->devs = NULL; devinfo->num_devs = 0; free(devinfo->chans, M_HDAA); devinfo->chans = NULL; devinfo->num_chans = 0; for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL) continue; w->enable = 1; w->selconn = -1; w->pflags = 0; w->bindas = -1; w->bindseqmask = 0; w->ossdev = -1; w->ossmask = 0; for (j = 0; j < w->nconns; j++) w->connsenable[j] = 1; if (w->type == HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) w->wclass.pin.config = w->wclass.pin.newconf; if (w->eld != NULL) { w->eld_len = 0; free(w->eld, M_HDAA); w->eld = NULL; } } } static int hdaa_sysctl_gpi_state(SYSCTL_HANDLER_ARGS) { struct hdaa_devinfo *devinfo = oidp->oid_arg1; device_t dev = devinfo->dev; char buf[256]; int n = 0, i, numgpi; uint32_t data = 0; buf[0] = 0; hdaa_lock(devinfo); numgpi = HDA_PARAM_GPIO_COUNT_NUM_GPI(devinfo->gpio_cap); if (numgpi > 0) { data = hda_command(dev, HDA_CMD_GET_GPI_DATA(0, devinfo->nid)); } hdaa_unlock(devinfo); for (i = 0; i < numgpi; i++) { n += snprintf(buf + n, sizeof(buf) - n, "%s%d=%d", n != 0 ? " " : "", i, ((data >> i) & 1)); } return (sysctl_handle_string(oidp, buf, sizeof(buf), req)); } static int hdaa_sysctl_gpio_state(SYSCTL_HANDLER_ARGS) { struct hdaa_devinfo *devinfo = oidp->oid_arg1; device_t dev = devinfo->dev; char buf[256]; int n = 0, i, numgpio; uint32_t data = 0, enable = 0, dir = 0; buf[0] = 0; hdaa_lock(devinfo); numgpio = HDA_PARAM_GPIO_COUNT_NUM_GPIO(devinfo->gpio_cap); if (numgpio > 0) { data = hda_command(dev, HDA_CMD_GET_GPIO_DATA(0, devinfo->nid)); enable = hda_command(dev, HDA_CMD_GET_GPIO_ENABLE_MASK(0, devinfo->nid)); dir = hda_command(dev, HDA_CMD_GET_GPIO_DIRECTION(0, devinfo->nid)); } hdaa_unlock(devinfo); for (i = 0; i < numgpio; i++) { n += snprintf(buf + n, sizeof(buf) - n, "%s%d=", n != 0 ? " " : "", i); if ((enable & (1 << i)) == 0) { n += snprintf(buf + n, sizeof(buf) - n, "disabled"); continue; } n += snprintf(buf + n, sizeof(buf) - n, "%sput(%d)", ((dir >> i) & 1) ? "out" : "in", ((data >> i) & 1)); } return (sysctl_handle_string(oidp, buf, sizeof(buf), req)); } static int hdaa_sysctl_gpio_config(SYSCTL_HANDLER_ARGS) { struct hdaa_devinfo *devinfo = oidp->oid_arg1; char buf[256]; int error, n = 0, i, numgpio; uint32_t gpio, x; gpio = devinfo->newgpio; numgpio = HDA_PARAM_GPIO_COUNT_NUM_GPIO(devinfo->gpio_cap); buf[0] = 0; for (i = 0; i < numgpio; i++) { x = (gpio & HDAA_GPIO_MASK(i)) >> HDAA_GPIO_SHIFT(i); n += snprintf(buf + n, sizeof(buf) - n, "%s%d=%s", n != 0 ? " " : "", i, HDA_GPIO_ACTIONS[x]); } error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); if (strncmp(buf, "0x", 2) == 0) gpio = strtol(buf + 2, NULL, 16); else gpio = hdaa_gpio_patch(gpio, buf); hdaa_lock(devinfo); devinfo->newgpio = devinfo->gpio = gpio; hdaa_gpio_commit(devinfo); hdaa_unlock(devinfo); return (0); } static int hdaa_sysctl_gpo_state(SYSCTL_HANDLER_ARGS) { struct hdaa_devinfo *devinfo = oidp->oid_arg1; device_t dev = devinfo->dev; char buf[256]; int n = 0, i, numgpo; uint32_t data = 0; buf[0] = 0; hdaa_lock(devinfo); numgpo = HDA_PARAM_GPIO_COUNT_NUM_GPO(devinfo->gpio_cap); if (numgpo > 0) { data = hda_command(dev, HDA_CMD_GET_GPO_DATA(0, devinfo->nid)); } hdaa_unlock(devinfo); for (i = 0; i < numgpo; i++) { n += snprintf(buf + n, sizeof(buf) - n, "%s%d=%d", n != 0 ? " " : "", i, ((data >> i) & 1)); } return (sysctl_handle_string(oidp, buf, sizeof(buf), req)); } static int hdaa_sysctl_gpo_config(SYSCTL_HANDLER_ARGS) { struct hdaa_devinfo *devinfo = oidp->oid_arg1; char buf[256]; int error, n = 0, i, numgpo; uint32_t gpo, x; gpo = devinfo->newgpo; numgpo = HDA_PARAM_GPIO_COUNT_NUM_GPO(devinfo->gpio_cap); buf[0] = 0; for (i = 0; i < numgpo; i++) { x = (gpo & HDAA_GPIO_MASK(i)) >> HDAA_GPIO_SHIFT(i); n += snprintf(buf + n, sizeof(buf) - n, "%s%d=%s", n != 0 ? " " : "", i, HDA_GPIO_ACTIONS[x]); } error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); if (strncmp(buf, "0x", 2) == 0) gpo = strtol(buf + 2, NULL, 16); else gpo = hdaa_gpio_patch(gpo, buf); hdaa_lock(devinfo); devinfo->newgpo = devinfo->gpo = gpo; hdaa_gpo_commit(devinfo); hdaa_unlock(devinfo); return (0); } static int hdaa_sysctl_reconfig(SYSCTL_HANDLER_ARGS) { device_t dev; struct hdaa_devinfo *devinfo; int error, val; dev = oidp->oid_arg1; devinfo = device_get_softc(dev); if (devinfo == NULL) return (EINVAL); val = 0; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL || val == 0) return (error); HDA_BOOTHVERBOSE( device_printf(dev, "Reconfiguration...\n"); ); if ((error = device_delete_children(dev)) != 0) return (error); hdaa_lock(devinfo); hdaa_unconfigure(dev); hdaa_configure(dev); hdaa_unlock(devinfo); bus_generic_attach(dev); HDA_BOOTHVERBOSE( device_printf(dev, "Reconfiguration done\n"); ); return (0); } static int hdaa_suspend(device_t dev) { struct hdaa_devinfo *devinfo = device_get_softc(dev); int i; HDA_BOOTHVERBOSE( device_printf(dev, "Suspend...\n"); ); hdaa_lock(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Stop streams...\n"); ); for (i = 0; i < devinfo->num_chans; i++) { if (devinfo->chans[i].flags & HDAA_CHN_RUNNING) { devinfo->chans[i].flags |= HDAA_CHN_SUSPEND; hdaa_channel_stop(&devinfo->chans[i]); } } HDA_BOOTHVERBOSE( device_printf(dev, "Power down FG" " nid=%d to the D3 state...\n", devinfo->nid); ); hda_command(devinfo->dev, HDA_CMD_SET_POWER_STATE(0, devinfo->nid, HDA_CMD_POWER_STATE_D3)); callout_stop(&devinfo->poll_jack); hdaa_unlock(devinfo); callout_drain(&devinfo->poll_jack); HDA_BOOTHVERBOSE( device_printf(dev, "Suspend done\n"); ); return (0); } static int hdaa_resume(device_t dev) { struct hdaa_devinfo *devinfo = device_get_softc(dev); int i; HDA_BOOTHVERBOSE( device_printf(dev, "Resume...\n"); ); hdaa_lock(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Power up audio FG nid=%d...\n", devinfo->nid); ); hdaa_powerup(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "AFG commit...\n"); ); hdaa_audio_commit(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Applying direct built-in patches...\n"); ); hdaa_patch_direct(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Pin sense init...\n"); ); hdaa_sense_init(devinfo); hdaa_unlock(devinfo); for (i = 0; i < devinfo->num_devs; i++) { struct hdaa_pcm_devinfo *pdevinfo = &devinfo->devs[i]; HDA_BOOTHVERBOSE( device_printf(pdevinfo->dev, "OSS mixer reinitialization...\n"); ); if (mixer_reinit(pdevinfo->dev) == -1) device_printf(pdevinfo->dev, "unable to reinitialize the mixer\n"); } hdaa_lock(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Start streams...\n"); ); for (i = 0; i < devinfo->num_chans; i++) { if (devinfo->chans[i].flags & HDAA_CHN_SUSPEND) { devinfo->chans[i].flags &= ~HDAA_CHN_SUSPEND; hdaa_channel_start(&devinfo->chans[i]); } } hdaa_unlock(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Resume done\n"); ); return (0); } static int hdaa_probe(device_t dev) { const char *pdesc; char buf[128]; if (hda_get_node_type(dev) != HDA_PARAM_FCT_GRP_TYPE_NODE_TYPE_AUDIO) return (ENXIO); pdesc = device_get_desc(device_get_parent(dev)); snprintf(buf, sizeof(buf), "%.*s Audio Function Group", (int)(strlen(pdesc) - 10), pdesc); device_set_desc_copy(dev, buf); return (BUS_PROBE_DEFAULT); } static int hdaa_attach(device_t dev) { struct hdaa_devinfo *devinfo = device_get_softc(dev); uint32_t res; nid_t nid = hda_get_node_id(dev); devinfo->dev = dev; devinfo->lock = HDAC_GET_MTX(device_get_parent(dev), dev); devinfo->nid = nid; devinfo->newquirks = -1; devinfo->newgpio = -1; devinfo->newgpo = -1; callout_init(&devinfo->poll_jack, 1); devinfo->poll_ival = hz; hdaa_lock(devinfo); res = hda_command(dev, HDA_CMD_GET_PARAMETER(0 , nid, HDA_PARAM_SUB_NODE_COUNT)); hdaa_unlock(devinfo); devinfo->nodecnt = HDA_PARAM_SUB_NODE_COUNT_TOTAL(res); devinfo->startnode = HDA_PARAM_SUB_NODE_COUNT_START(res); devinfo->endnode = devinfo->startnode + devinfo->nodecnt; HDA_BOOTVERBOSE( device_printf(dev, "Subsystem ID: 0x%08x\n", hda_get_subsystem_id(dev)); ); HDA_BOOTHVERBOSE( device_printf(dev, "Audio Function Group at nid=%d: %d subnodes %d-%d\n", nid, devinfo->nodecnt, devinfo->startnode, devinfo->endnode - 1); ); if (devinfo->nodecnt > 0) devinfo->widget = (struct hdaa_widget *)malloc( sizeof(*(devinfo->widget)) * devinfo->nodecnt, M_HDAA, M_WAITOK | M_ZERO); else devinfo->widget = NULL; hdaa_lock(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Powering up...\n"); ); hdaa_powerup(devinfo); HDA_BOOTHVERBOSE( device_printf(dev, "Parsing audio FG...\n"); ); hdaa_audio_parse(devinfo); HDA_BOOTVERBOSE( device_printf(dev, "Original pins configuration:\n"); hdaa_dump_pin_configs(devinfo); ); hdaa_configure(dev); hdaa_unlock(devinfo); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "config", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, &devinfo->newquirks, 0, hdaa_sysctl_quirks, "A", "Configuration options"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "gpi_state", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, devinfo, 0, hdaa_sysctl_gpi_state, "A", "GPI state"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "gpio_state", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, devinfo, 0, hdaa_sysctl_gpio_state, "A", "GPIO state"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "gpio_config", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, devinfo, 0, hdaa_sysctl_gpio_config, "A", "GPIO configuration"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "gpo_state", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, devinfo, 0, hdaa_sysctl_gpo_state, "A", "GPO state"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "gpo_config", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, devinfo, 0, hdaa_sysctl_gpo_config, "A", "GPO configuration"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "reconfig", CTLTYPE_INT | CTLFLAG_RW, dev, 0, hdaa_sysctl_reconfig, "I", "Reprocess configuration"); bus_generic_attach(dev); return (0); } static int hdaa_detach(device_t dev) { struct hdaa_devinfo *devinfo = device_get_softc(dev); int error; if ((error = device_delete_children(dev)) != 0) return (error); hdaa_lock(devinfo); hdaa_unconfigure(dev); devinfo->poll_ival = 0; callout_stop(&devinfo->poll_jack); hdaa_unlock(devinfo); callout_drain(&devinfo->poll_jack); free(devinfo->widget, M_HDAA); return (0); } static int hdaa_print_child(device_t dev, device_t child) { struct hdaa_devinfo *devinfo = device_get_softc(dev); struct hdaa_pcm_devinfo *pdevinfo = (struct hdaa_pcm_devinfo *)device_get_ivars(child); struct hdaa_audio_as *as; int retval, first = 1, i; retval = bus_print_child_header(dev, child); retval += printf(" at nid "); if (pdevinfo->playas >= 0) { as = &devinfo->as[pdevinfo->playas]; for (i = 0; i < 16; i++) { if (as->pins[i] <= 0) continue; retval += printf("%s%d", first ? "" : ",", as->pins[i]); first = 0; } } if (pdevinfo->recas >= 0) { if (pdevinfo->playas >= 0) { retval += printf(" and "); first = 1; } as = &devinfo->as[pdevinfo->recas]; for (i = 0; i < 16; i++) { if (as->pins[i] <= 0) continue; retval += printf("%s%d", first ? "" : ",", as->pins[i]); first = 0; } } retval += bus_print_child_footer(dev, child); return (retval); } static int hdaa_child_location_str(device_t dev, device_t child, char *buf, size_t buflen) { struct hdaa_devinfo *devinfo = device_get_softc(dev); struct hdaa_pcm_devinfo *pdevinfo = (struct hdaa_pcm_devinfo *)device_get_ivars(child); struct hdaa_audio_as *as; int first = 1, i, len = 0; len += snprintf(buf + len, buflen - len, "nid="); if (pdevinfo->playas >= 0) { as = &devinfo->as[pdevinfo->playas]; for (i = 0; i < 16; i++) { if (as->pins[i] <= 0) continue; len += snprintf(buf + len, buflen - len, "%s%d", first ? "" : ",", as->pins[i]); first = 0; } } if (pdevinfo->recas >= 0) { as = &devinfo->as[pdevinfo->recas]; for (i = 0; i < 16; i++) { if (as->pins[i] <= 0) continue; len += snprintf(buf + len, buflen - len, "%s%d", first ? "" : ",", as->pins[i]); first = 0; } } return (0); } static void hdaa_stream_intr(device_t dev, int dir, int stream) { struct hdaa_devinfo *devinfo = device_get_softc(dev); struct hdaa_chan *ch; int i; for (i = 0; i < devinfo->num_chans; i++) { ch = &devinfo->chans[i]; if (!(ch->flags & HDAA_CHN_RUNNING)) continue; if (ch->dir == ((dir == 1) ? PCMDIR_PLAY : PCMDIR_REC) && ch->sid == stream) { hdaa_unlock(devinfo); chn_intr(ch->c); hdaa_lock(devinfo); } } } static void hdaa_unsol_intr(device_t dev, uint32_t resp) { struct hdaa_devinfo *devinfo = device_get_softc(dev); struct hdaa_widget *w; int i, tag, flags; HDA_BOOTHVERBOSE( device_printf(dev, "Unsolicited response %08x\n", resp); ); tag = resp >> 26; for (i = devinfo->startnode; i < devinfo->endnode; i++) { w = hdaa_widget_get(devinfo, i); if (w == NULL || w->enable == 0 || w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) continue; if (w->unsol != tag) continue; if (HDA_PARAM_PIN_CAP_DP(w->wclass.pin.cap) || HDA_PARAM_PIN_CAP_HDMI(w->wclass.pin.cap)) flags = resp & 0x03; else flags = 0x01; if (flags & 0x01) hdaa_presence_handler(w); if (flags & 0x02) hdaa_eld_handler(w); } } static device_method_t hdaa_methods[] = { /* device interface */ DEVMETHOD(device_probe, hdaa_probe), DEVMETHOD(device_attach, hdaa_attach), DEVMETHOD(device_detach, hdaa_detach), DEVMETHOD(device_suspend, hdaa_suspend), DEVMETHOD(device_resume, hdaa_resume), /* Bus interface */ DEVMETHOD(bus_print_child, hdaa_print_child), DEVMETHOD(bus_child_location_str, hdaa_child_location_str), DEVMETHOD(hdac_stream_intr, hdaa_stream_intr), DEVMETHOD(hdac_unsol_intr, hdaa_unsol_intr), DEVMETHOD(hdac_pindump, hdaa_pindump), DEVMETHOD_END }; static driver_t hdaa_driver = { "hdaa", hdaa_methods, sizeof(struct hdaa_devinfo), }; static devclass_t hdaa_devclass; DRIVER_MODULE(snd_hda, hdacc, hdaa_driver, hdaa_devclass, NULL, NULL); static void hdaa_chan_formula(struct hdaa_devinfo *devinfo, int asid, char *buf, int buflen) { struct hdaa_audio_as *as; int c; as = &devinfo->as[asid]; c = devinfo->chans[as->chans[0]].channels; if (c == 1) snprintf(buf, buflen, "mono"); else if (c == 2) { if (as->hpredir < 0) buf[0] = 0; else snprintf(buf, buflen, "2.0"); } else if (as->pinset == 0x0003) snprintf(buf, buflen, "3.1"); else if (as->pinset == 0x0005 || as->pinset == 0x0011) snprintf(buf, buflen, "4.0"); else if (as->pinset == 0x0007 || as->pinset == 0x0013) snprintf(buf, buflen, "5.1"); else if (as->pinset == 0x0017) snprintf(buf, buflen, "7.1"); else snprintf(buf, buflen, "%dch", c); if (as->hpredir >= 0) strlcat(buf, "+HP", buflen); } static int hdaa_chan_type(struct hdaa_devinfo *devinfo, int asid) { struct hdaa_audio_as *as; struct hdaa_widget *w; int i, t = -1, t1; as = &devinfo->as[asid]; for (i = 0; i < 16; i++) { w = hdaa_widget_get(devinfo, as->pins[i]); if (w == NULL || w->enable == 0 || w->type != HDA_PARAM_AUDIO_WIDGET_CAP_TYPE_PIN_COMPLEX) continue; t1 = HDA_CONFIG_DEFAULTCONF_DEVICE(w->wclass.pin.config); if (t == -1) t = t1; else if (t != t1) { t = -2; break; } } return (t); } static int hdaa_sysctl_32bit(SYSCTL_HANDLER_ARGS) { struct hdaa_audio_as *as = (struct hdaa_audio_as *)oidp->oid_arg1; struct hdaa_pcm_devinfo *pdevinfo = as->pdevinfo; struct hdaa_devinfo *devinfo = pdevinfo->devinfo; struct hdaa_chan *ch; int error, val, i; uint32_t pcmcap; ch = &devinfo->chans[as->chans[0]]; val = (ch->bit32 == 4) ? 32 : ((ch->bit32 == 3) ? 24 : ((ch->bit32 == 2) ? 20 : 0)); error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); pcmcap = ch->supp_pcm_size_rate; if (val == 32 && HDA_PARAM_SUPP_PCM_SIZE_RATE_32BIT(pcmcap)) ch->bit32 = 4; else if (val == 24 && HDA_PARAM_SUPP_PCM_SIZE_RATE_24BIT(pcmcap)) ch->bit32 = 3; else if (val == 20 && HDA_PARAM_SUPP_PCM_SIZE_RATE_20BIT(pcmcap)) ch->bit32 = 2; else return (EINVAL); for (i = 1; i < as->num_chans; i++) devinfo->chans[as->chans[i]].bit32 = ch->bit32; return (0); } static int hdaa_pcm_probe(device_t dev) { struct hdaa_pcm_devinfo *pdevinfo = (struct hdaa_pcm_devinfo *)device_get_ivars(dev); struct hdaa_devinfo *devinfo = pdevinfo->devinfo; const char *pdesc; char chans1[8], chans2[8]; char buf[128]; int loc1, loc2, t1, t2; if (pdevinfo->playas >= 0) loc1 = devinfo->as[pdevinfo->playas].location; else loc1 = devinfo->as[pdevinfo->recas].location; if (pdevinfo->recas >= 0) loc2 = devinfo->as[pdevinfo->recas].location; else loc2 = loc1; if (loc1 != loc2) loc1 = -2; if (loc1 >= 0 && HDA_LOCS[loc1][0] == '0') loc1 = -2; chans1[0] = 0; chans2[0] = 0; t1 = t2 = -1; if (pdevinfo->playas >= 0) { hdaa_chan_formula(devinfo, pdevinfo->playas, chans1, sizeof(chans1)); t1 = hdaa_chan_type(devinfo, pdevinfo->playas); } if (pdevinfo->recas >= 0) { hdaa_chan_formula(devinfo, pdevinfo->recas, chans2, sizeof(chans2)); t2 = hdaa_chan_type(devinfo, pdevinfo->recas); } if (chans1[0] != 0 || chans2[0] != 0) { if (chans1[0] == 0 && pdevinfo->playas >= 0) snprintf(chans1, sizeof(chans1), "2.0"); else if (chans2[0] == 0 && pdevinfo->recas >= 0) snprintf(chans2, sizeof(chans2), "2.0"); if (strcmp(chans1, chans2) == 0) chans2[0] = 0; } if (t1 == -1) t1 = t2; else if (t2 == -1) t2 = t1; if (t1 != t2) t1 = -2; if (pdevinfo->digital) t1 = -2; pdesc = device_get_desc(device_get_parent(dev)); snprintf(buf, sizeof(buf), "%.*s (%s%s%s%s%s%s%s%s%s)", (int)(strlen(pdesc) - 21), pdesc, loc1 >= 0 ? HDA_LOCS[loc1] : "", loc1 >= 0 ? " " : "", (pdevinfo->digital == 0x7)?"HDMI/DP": ((pdevinfo->digital == 0x5)?"DisplayPort": ((pdevinfo->digital == 0x3)?"HDMI": ((pdevinfo->digital)?"Digital":"Analog"))), chans1[0] ? " " : "", chans1, chans2[0] ? "/" : "", chans2, t1 >= 0 ? " " : "", t1 >= 0 ? HDA_DEVS[t1] : ""); device_set_desc_copy(dev, buf); return (BUS_PROBE_SPECIFIC); } static int hdaa_pcm_attach(device_t dev) { struct hdaa_pcm_devinfo *pdevinfo = (struct hdaa_pcm_devinfo *)device_get_ivars(dev); struct hdaa_devinfo *devinfo = pdevinfo->devinfo; struct hdaa_audio_as *as; struct snddev_info *d; char status[SND_STATUSLEN]; int i; pdevinfo->chan_size = pcm_getbuffersize(dev, HDA_BUFSZ_MIN, HDA_BUFSZ_DEFAULT, HDA_BUFSZ_MAX); HDA_BOOTVERBOSE( hdaa_dump_dac(pdevinfo); hdaa_dump_adc(pdevinfo); hdaa_dump_mix(pdevinfo); hdaa_dump_ctls(pdevinfo, "Master Volume", SOUND_MASK_VOLUME); hdaa_dump_ctls(pdevinfo, "PCM Volume", SOUND_MASK_PCM); hdaa_dump_ctls(pdevinfo, "CD Volume", SOUND_MASK_CD); hdaa_dump_ctls(pdevinfo, "Microphone Volume", SOUND_MASK_MIC); hdaa_dump_ctls(pdevinfo, "Microphone2 Volume", SOUND_MASK_MONITOR); hdaa_dump_ctls(pdevinfo, "Line-in Volume", SOUND_MASK_LINE); hdaa_dump_ctls(pdevinfo, "Speaker/Beep Volume", SOUND_MASK_SPEAKER); hdaa_dump_ctls(pdevinfo, "Recording Level", SOUND_MASK_RECLEV); hdaa_dump_ctls(pdevinfo, "Input Mix Level", SOUND_MASK_IMIX); hdaa_dump_ctls(pdevinfo, "Input Monitoring Level", SOUND_MASK_IGAIN); hdaa_dump_ctls(pdevinfo, NULL, 0); ); if (resource_int_value(device_get_name(dev), device_get_unit(dev), "blocksize", &i) == 0 && i > 0) { i &= HDA_BLK_ALIGN; if (i < HDA_BLK_MIN) i = HDA_BLK_MIN; pdevinfo->chan_blkcnt = pdevinfo->chan_size / i; i = 0; while (pdevinfo->chan_blkcnt >> i) i++; pdevinfo->chan_blkcnt = 1 << (i - 1); if (pdevinfo->chan_blkcnt < HDA_BDL_MIN) pdevinfo->chan_blkcnt = HDA_BDL_MIN; else if (pdevinfo->chan_blkcnt > HDA_BDL_MAX) pdevinfo->chan_blkcnt = HDA_BDL_MAX; } else pdevinfo->chan_blkcnt = HDA_BDL_DEFAULT; /* * We don't register interrupt handler with snd_setup_intr * in pcm device. Mark pcm device as MPSAFE manually. */ pcm_setflags(dev, pcm_getflags(dev) | SD_F_MPSAFE); HDA_BOOTHVERBOSE( device_printf(dev, "OSS mixer initialization...\n"); ); if (mixer_init(dev, &hdaa_audio_ctl_ossmixer_class, pdevinfo) != 0) device_printf(dev, "Can't register mixer\n"); HDA_BOOTHVERBOSE( device_printf(dev, "Registering PCM channels...\n"); ); if (pcm_register(dev, pdevinfo, (pdevinfo->playas >= 0)?1:0, (pdevinfo->recas >= 0)?1:0) != 0) device_printf(dev, "Can't register PCM\n"); pdevinfo->registered++; d = device_get_softc(dev); if (pdevinfo->playas >= 0) { as = &devinfo->as[pdevinfo->playas]; for (i = 0; i < as->num_chans; i++) pcm_addchan(dev, PCMDIR_PLAY, &hdaa_channel_class, &devinfo->chans[as->chans[i]]); SYSCTL_ADD_PROC(&d->play_sysctl_ctx, SYSCTL_CHILDREN(d->play_sysctl_tree), OID_AUTO, "32bit", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, as, sizeof(as), hdaa_sysctl_32bit, "I", "Resolution of 32bit samples (20/24/32bit)"); } if (pdevinfo->recas >= 0) { as = &devinfo->as[pdevinfo->recas]; for (i = 0; i < as->num_chans; i++) pcm_addchan(dev, PCMDIR_REC, &hdaa_channel_class, &devinfo->chans[as->chans[i]]); SYSCTL_ADD_PROC(&d->rec_sysctl_ctx, SYSCTL_CHILDREN(d->rec_sysctl_tree), OID_AUTO, "32bit", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, as, sizeof(as), hdaa_sysctl_32bit, "I", "Resolution of 32bit samples (20/24/32bit)"); pdevinfo->autorecsrc = 2; resource_int_value(device_get_name(dev), device_get_unit(dev), "rec.autosrc", &pdevinfo->autorecsrc); SYSCTL_ADD_INT(&d->rec_sysctl_ctx, SYSCTL_CHILDREN(d->rec_sysctl_tree), OID_AUTO, "autosrc", CTLFLAG_RW, &pdevinfo->autorecsrc, 0, "Automatic recording source selection"); } if (pdevinfo->mixer != NULL) { hdaa_audio_ctl_set_defaults(pdevinfo); hdaa_lock(devinfo); if (pdevinfo->playas >= 0) { as = &devinfo->as[pdevinfo->playas]; hdaa_channels_handler(as); } if (pdevinfo->recas >= 0) { as = &devinfo->as[pdevinfo->recas]; hdaa_autorecsrc_handler(as, NULL); hdaa_channels_handler(as); } hdaa_unlock(devinfo); } snprintf(status, SND_STATUSLEN, "on %s %s", device_get_nameunit(device_get_parent(dev)), PCM_KLDSTRING(snd_hda)); pcm_setstatus(dev, status); return (0); } static int hdaa_pcm_detach(device_t dev) { struct hdaa_pcm_devinfo *pdevinfo = (struct hdaa_pcm_devinfo *)device_get_ivars(dev); int err; if (pdevinfo->registered > 0) { err = pcm_unregister(dev); if (err != 0) return (err); } return (0); } static device_method_t hdaa_pcm_methods[] = { /* device interface */ DEVMETHOD(device_probe, hdaa_pcm_probe), DEVMETHOD(device_attach, hdaa_pcm_attach), DEVMETHOD(device_detach, hdaa_pcm_detach), DEVMETHOD_END }; static driver_t hdaa_pcm_driver = { "pcm", hdaa_pcm_methods, PCM_SOFTC_SIZE, }; DRIVER_MODULE(snd_hda_pcm, hdaa, hdaa_pcm_driver, pcm_devclass, NULL, NULL); MODULE_DEPEND(snd_hda, sound, SOUND_MINVER, SOUND_PREFVER, SOUND_MAXVER); MODULE_VERSION(snd_hda, 1); Index: head/sys/dev/sound/pci/hda/hdac.c =================================================================== --- head/sys/dev/sound/pci/hda/hdac.c (revision 308456) +++ head/sys/dev/sound/pci/hda/hdac.c (revision 308457) @@ -1,2094 +1,2094 @@ /*- * Copyright (c) 2006 Stephane E. Potvin * Copyright (c) 2006 Ariff Abdullah * Copyright (c) 2008-2012 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Intel High Definition Audio (Controller) driver for FreeBSD. */ #ifdef HAVE_KERNEL_OPTION_HEADERS #include "opt_snd.h" #endif #include #include #include #include #include #include #include #include #include #define HDA_DRV_TEST_REV "20120126_0002" SND_DECLARE_FILE("$FreeBSD$"); #define hdac_lock(sc) snd_mtxlock((sc)->lock) #define hdac_unlock(sc) snd_mtxunlock((sc)->lock) #define hdac_lockassert(sc) snd_mtxassert((sc)->lock) #define hdac_lockowned(sc) mtx_owned((sc)->lock) #define HDAC_QUIRK_64BIT (1 << 0) #define HDAC_QUIRK_DMAPOS (1 << 1) #define HDAC_QUIRK_MSI (1 << 2) static const struct { const char *key; uint32_t value; } hdac_quirks_tab[] = { { "64bit", HDAC_QUIRK_DMAPOS }, { "dmapos", HDAC_QUIRK_DMAPOS }, { "msi", HDAC_QUIRK_MSI }, }; MALLOC_DEFINE(M_HDAC, "hdac", "HDA Controller"); static const struct { uint32_t model; const char *desc; char quirks_on; char quirks_off; } hdac_devices[] = { { HDA_INTEL_OAK, "Intel Oaktrail", 0, 0 }, { HDA_INTEL_BAY, "Intel BayTrail", 0, 0 }, { HDA_INTEL_HSW1, "Intel Haswell", 0, 0 }, { HDA_INTEL_HSW2, "Intel Haswell", 0, 0 }, { HDA_INTEL_HSW3, "Intel Haswell", 0, 0 }, { HDA_INTEL_BDW1, "Intel Broadwell", 0, 0 }, { HDA_INTEL_BDW2, "Intel Broadwell", 0, 0 }, { HDA_INTEL_CPT, "Intel Cougar Point", 0, 0 }, { HDA_INTEL_PATSBURG,"Intel Patsburg", 0, 0 }, { HDA_INTEL_PPT1, "Intel Panther Point", 0, 0 }, { HDA_INTEL_LPT1, "Intel Lynx Point", 0, 0 }, { HDA_INTEL_LPT2, "Intel Lynx Point", 0, 0 }, { HDA_INTEL_WCPT, "Intel Wildcat Point", 0, 0 }, { HDA_INTEL_WELLS1, "Intel Wellsburg", 0, 0 }, { HDA_INTEL_WELLS2, "Intel Wellsburg", 0, 0 }, { HDA_INTEL_LPTLP1, "Intel Lynx Point-LP", 0, 0 }, { HDA_INTEL_LPTLP2, "Intel Lynx Point-LP", 0, 0 }, { HDA_INTEL_SRPTLP, "Intel Sunrise Point-LP", 0, 0 }, { HDA_INTEL_SRPT, "Intel Sunrise Point", 0, 0 }, { HDA_INTEL_82801F, "Intel 82801F", 0, 0 }, { HDA_INTEL_63XXESB, "Intel 631x/632xESB", 0, 0 }, { HDA_INTEL_82801G, "Intel 82801G", 0, 0 }, { HDA_INTEL_82801H, "Intel 82801H", 0, 0 }, { HDA_INTEL_82801I, "Intel 82801I", 0, 0 }, { HDA_INTEL_82801JI, "Intel 82801JI", 0, 0 }, { HDA_INTEL_82801JD, "Intel 82801JD", 0, 0 }, { HDA_INTEL_PCH, "Intel 5 Series/3400 Series", 0, 0 }, { HDA_INTEL_PCH2, "Intel 5 Series/3400 Series", 0, 0 }, { HDA_INTEL_SCH, "Intel SCH", 0, 0 }, { HDA_NVIDIA_MCP51, "NVIDIA MCP51", 0, HDAC_QUIRK_MSI }, { HDA_NVIDIA_MCP55, "NVIDIA MCP55", 0, HDAC_QUIRK_MSI }, { HDA_NVIDIA_MCP61_1, "NVIDIA MCP61", 0, 0 }, { HDA_NVIDIA_MCP61_2, "NVIDIA MCP61", 0, 0 }, { HDA_NVIDIA_MCP65_1, "NVIDIA MCP65", 0, 0 }, { HDA_NVIDIA_MCP65_2, "NVIDIA MCP65", 0, 0 }, { HDA_NVIDIA_MCP67_1, "NVIDIA MCP67", 0, 0 }, { HDA_NVIDIA_MCP67_2, "NVIDIA MCP67", 0, 0 }, { HDA_NVIDIA_MCP73_1, "NVIDIA MCP73", 0, 0 }, { HDA_NVIDIA_MCP73_2, "NVIDIA MCP73", 0, 0 }, { HDA_NVIDIA_MCP78_1, "NVIDIA MCP78", 0, HDAC_QUIRK_64BIT }, { HDA_NVIDIA_MCP78_2, "NVIDIA MCP78", 0, HDAC_QUIRK_64BIT }, { HDA_NVIDIA_MCP78_3, "NVIDIA MCP78", 0, HDAC_QUIRK_64BIT }, { HDA_NVIDIA_MCP78_4, "NVIDIA MCP78", 0, HDAC_QUIRK_64BIT }, { HDA_NVIDIA_MCP79_1, "NVIDIA MCP79", 0, 0 }, { HDA_NVIDIA_MCP79_2, "NVIDIA MCP79", 0, 0 }, { HDA_NVIDIA_MCP79_3, "NVIDIA MCP79", 0, 0 }, { HDA_NVIDIA_MCP79_4, "NVIDIA MCP79", 0, 0 }, { HDA_NVIDIA_MCP89_1, "NVIDIA MCP89", 0, 0 }, { HDA_NVIDIA_MCP89_2, "NVIDIA MCP89", 0, 0 }, { HDA_NVIDIA_MCP89_3, "NVIDIA MCP89", 0, 0 }, { HDA_NVIDIA_MCP89_4, "NVIDIA MCP89", 0, 0 }, { HDA_NVIDIA_0BE2, "NVIDIA (0x0be2)", 0, HDAC_QUIRK_MSI }, { HDA_NVIDIA_0BE3, "NVIDIA (0x0be3)", 0, HDAC_QUIRK_MSI }, { HDA_NVIDIA_0BE4, "NVIDIA (0x0be4)", 0, HDAC_QUIRK_MSI }, { HDA_NVIDIA_GT100, "NVIDIA GT100", 0, HDAC_QUIRK_MSI }, { HDA_NVIDIA_GT104, "NVIDIA GT104", 0, HDAC_QUIRK_MSI }, { HDA_NVIDIA_GT106, "NVIDIA GT106", 0, HDAC_QUIRK_MSI }, { HDA_NVIDIA_GT108, "NVIDIA GT108", 0, HDAC_QUIRK_MSI }, { HDA_NVIDIA_GT116, "NVIDIA GT116", 0, HDAC_QUIRK_MSI }, { HDA_NVIDIA_GF119, "NVIDIA GF119", 0, 0 }, { HDA_NVIDIA_GF110_1, "NVIDIA GF110", 0, HDAC_QUIRK_MSI }, { HDA_NVIDIA_GF110_2, "NVIDIA GF110", 0, HDAC_QUIRK_MSI }, { HDA_ATI_SB450, "ATI SB450", 0, 0 }, { HDA_ATI_SB600, "ATI SB600", 0, 0 }, { HDA_ATI_RS600, "ATI RS600", 0, 0 }, { HDA_ATI_RS690, "ATI RS690", 0, 0 }, { HDA_ATI_RS780, "ATI RS780", 0, 0 }, { HDA_ATI_R600, "ATI R600", 0, 0 }, { HDA_ATI_RV610, "ATI RV610", 0, 0 }, { HDA_ATI_RV620, "ATI RV620", 0, 0 }, { HDA_ATI_RV630, "ATI RV630", 0, 0 }, { HDA_ATI_RV635, "ATI RV635", 0, 0 }, { HDA_ATI_RV710, "ATI RV710", 0, 0 }, { HDA_ATI_RV730, "ATI RV730", 0, 0 }, { HDA_ATI_RV740, "ATI RV740", 0, 0 }, { HDA_ATI_RV770, "ATI RV770", 0, 0 }, { HDA_ATI_RV810, "ATI RV810", 0, 0 }, { HDA_ATI_RV830, "ATI RV830", 0, 0 }, { HDA_ATI_RV840, "ATI RV840", 0, 0 }, { HDA_ATI_RV870, "ATI RV870", 0, 0 }, { HDA_ATI_RV910, "ATI RV910", 0, 0 }, { HDA_ATI_RV930, "ATI RV930", 0, 0 }, { HDA_ATI_RV940, "ATI RV940", 0, 0 }, { HDA_ATI_RV970, "ATI RV970", 0, 0 }, { HDA_ATI_R1000, "ATI R1000", 0, 0 }, { HDA_AMD_HUDSON2, "AMD Hudson-2", 0, 0 }, { HDA_RDC_M3010, "RDC M3010", 0, 0 }, { HDA_VIA_VT82XX, "VIA VT8251/8237A",0, 0 }, { HDA_SIS_966, "SiS 966", 0, 0 }, { HDA_ULI_M5461, "ULI M5461", 0, 0 }, /* Unknown */ { HDA_INTEL_ALL, "Intel", 0, 0 }, { HDA_NVIDIA_ALL, "NVIDIA", 0, 0 }, { HDA_ATI_ALL, "ATI", 0, 0 }, { HDA_AMD_ALL, "AMD", 0, 0 }, { HDA_VIA_ALL, "VIA", 0, 0 }, { HDA_SIS_ALL, "SiS", 0, 0 }, { HDA_ULI_ALL, "ULI", 0, 0 }, }; static const struct { uint16_t vendor; uint8_t reg; uint8_t mask; uint8_t enable; } hdac_pcie_snoop[] = { { INTEL_VENDORID, 0x00, 0x00, 0x00 }, { ATI_VENDORID, 0x42, 0xf8, 0x02 }, { NVIDIA_VENDORID, 0x4e, 0xf0, 0x0f }, }; /**************************************************************************** * Function prototypes ****************************************************************************/ static void hdac_intr_handler(void *); static int hdac_reset(struct hdac_softc *, int); static int hdac_get_capabilities(struct hdac_softc *); static void hdac_dma_cb(void *, bus_dma_segment_t *, int, int); static int hdac_dma_alloc(struct hdac_softc *, struct hdac_dma *, bus_size_t); static void hdac_dma_free(struct hdac_softc *, struct hdac_dma *); static int hdac_mem_alloc(struct hdac_softc *); static void hdac_mem_free(struct hdac_softc *); static int hdac_irq_alloc(struct hdac_softc *); static void hdac_irq_free(struct hdac_softc *); static void hdac_corb_init(struct hdac_softc *); static void hdac_rirb_init(struct hdac_softc *); static void hdac_corb_start(struct hdac_softc *); static void hdac_rirb_start(struct hdac_softc *); static void hdac_attach2(void *); static uint32_t hdac_send_command(struct hdac_softc *, nid_t, uint32_t); static int hdac_probe(device_t); static int hdac_attach(device_t); static int hdac_detach(device_t); static int hdac_suspend(device_t); static int hdac_resume(device_t); static int hdac_rirb_flush(struct hdac_softc *sc); static int hdac_unsolq_flush(struct hdac_softc *sc); #define hdac_command(a1, a2, a3) \ hdac_send_command(a1, a3, a2) /* This function surely going to make its way into upper level someday. */ static void hdac_config_fetch(struct hdac_softc *sc, uint32_t *on, uint32_t *off) { const char *res = NULL; int i = 0, j, k, len, inv; if (resource_string_value(device_get_name(sc->dev), device_get_unit(sc->dev), "config", &res) != 0) return; if (!(res != NULL && strlen(res) > 0)) return; HDA_BOOTVERBOSE( device_printf(sc->dev, "Config options:"); ); for (;;) { while (res[i] != '\0' && (res[i] == ',' || isspace(res[i]) != 0)) i++; if (res[i] == '\0') { HDA_BOOTVERBOSE( printf("\n"); ); return; } j = i; while (res[j] != '\0' && !(res[j] == ',' || isspace(res[j]) != 0)) j++; len = j - i; if (len > 2 && strncmp(res + i, "no", 2) == 0) inv = 2; else inv = 0; for (k = 0; len > inv && k < nitems(hdac_quirks_tab); k++) { if (strncmp(res + i + inv, hdac_quirks_tab[k].key, len - inv) != 0) continue; if (len - inv != strlen(hdac_quirks_tab[k].key)) continue; HDA_BOOTVERBOSE( printf(" %s%s", (inv != 0) ? "no" : "", hdac_quirks_tab[k].key); ); if (inv == 0) { *on |= hdac_quirks_tab[k].value; *on &= ~hdac_quirks_tab[k].value; } else if (inv != 0) { *off |= hdac_quirks_tab[k].value; *off &= ~hdac_quirks_tab[k].value; } break; } i = j; } } /**************************************************************************** * void hdac_intr_handler(void *) * * Interrupt handler. Processes interrupts received from the hdac. ****************************************************************************/ static void hdac_intr_handler(void *context) { struct hdac_softc *sc; device_t dev; uint32_t intsts; uint8_t rirbsts; int i; sc = (struct hdac_softc *)context; hdac_lock(sc); /* Do we have anything to do? */ intsts = HDAC_READ_4(&sc->mem, HDAC_INTSTS); if ((intsts & HDAC_INTSTS_GIS) == 0) { hdac_unlock(sc); return; } /* Was this a controller interrupt? */ if (intsts & HDAC_INTSTS_CIS) { rirbsts = HDAC_READ_1(&sc->mem, HDAC_RIRBSTS); /* Get as many responses that we can */ while (rirbsts & HDAC_RIRBSTS_RINTFL) { HDAC_WRITE_1(&sc->mem, HDAC_RIRBSTS, HDAC_RIRBSTS_RINTFL); hdac_rirb_flush(sc); rirbsts = HDAC_READ_1(&sc->mem, HDAC_RIRBSTS); } if (sc->unsolq_rp != sc->unsolq_wp) taskqueue_enqueue(taskqueue_thread, &sc->unsolq_task); } if (intsts & HDAC_INTSTS_SIS_MASK) { for (i = 0; i < sc->num_ss; i++) { if ((intsts & (1 << i)) == 0) continue; HDAC_WRITE_1(&sc->mem, (i << 5) + HDAC_SDSTS, HDAC_SDSTS_DESE | HDAC_SDSTS_FIFOE | HDAC_SDSTS_BCIS ); if ((dev = sc->streams[i].dev) != NULL) { HDAC_STREAM_INTR(dev, sc->streams[i].dir, sc->streams[i].stream); } } } HDAC_WRITE_4(&sc->mem, HDAC_INTSTS, intsts); hdac_unlock(sc); } static void hdac_poll_callback(void *arg) { struct hdac_softc *sc = arg; if (sc == NULL) return; hdac_lock(sc); if (sc->polling == 0) { hdac_unlock(sc); return; } callout_reset(&sc->poll_callout, sc->poll_ival, hdac_poll_callback, sc); hdac_unlock(sc); hdac_intr_handler(sc); } /**************************************************************************** * int hdac_reset(hdac_softc *, int) * * Reset the hdac to a quiescent and known state. ****************************************************************************/ static int hdac_reset(struct hdac_softc *sc, int wakeup) { uint32_t gctl; int count, i; /* * Stop all Streams DMA engine */ for (i = 0; i < sc->num_iss; i++) HDAC_WRITE_4(&sc->mem, HDAC_ISDCTL(sc, i), 0x0); for (i = 0; i < sc->num_oss; i++) HDAC_WRITE_4(&sc->mem, HDAC_OSDCTL(sc, i), 0x0); for (i = 0; i < sc->num_bss; i++) HDAC_WRITE_4(&sc->mem, HDAC_BSDCTL(sc, i), 0x0); /* * Stop Control DMA engines. */ HDAC_WRITE_1(&sc->mem, HDAC_CORBCTL, 0x0); HDAC_WRITE_1(&sc->mem, HDAC_RIRBCTL, 0x0); /* * Reset DMA position buffer. */ HDAC_WRITE_4(&sc->mem, HDAC_DPIBLBASE, 0x0); HDAC_WRITE_4(&sc->mem, HDAC_DPIBUBASE, 0x0); /* * Reset the controller. The reset must remain asserted for * a minimum of 100us. */ gctl = HDAC_READ_4(&sc->mem, HDAC_GCTL); HDAC_WRITE_4(&sc->mem, HDAC_GCTL, gctl & ~HDAC_GCTL_CRST); count = 10000; do { gctl = HDAC_READ_4(&sc->mem, HDAC_GCTL); if (!(gctl & HDAC_GCTL_CRST)) break; DELAY(10); } while (--count); if (gctl & HDAC_GCTL_CRST) { device_printf(sc->dev, "Unable to put hdac in reset\n"); return (ENXIO); } /* If wakeup is not requested - leave the controller in reset state. */ if (!wakeup) return (0); DELAY(100); gctl = HDAC_READ_4(&sc->mem, HDAC_GCTL); HDAC_WRITE_4(&sc->mem, HDAC_GCTL, gctl | HDAC_GCTL_CRST); count = 10000; do { gctl = HDAC_READ_4(&sc->mem, HDAC_GCTL); if (gctl & HDAC_GCTL_CRST) break; DELAY(10); } while (--count); if (!(gctl & HDAC_GCTL_CRST)) { device_printf(sc->dev, "Device stuck in reset\n"); return (ENXIO); } /* * Wait for codecs to finish their own reset sequence. The delay here - * should be of 250us but for some reasons, on it's not enough on my + * should be of 250us but for some reasons, it's not enough on my * computer. Let's use twice as much as necessary to make sure that * it's reset properly. */ DELAY(1000); return (0); } /**************************************************************************** * int hdac_get_capabilities(struct hdac_softc *); * * Retreive the general capabilities of the hdac; * Number of Input Streams * Number of Output Streams * Number of bidirectional Streams * 64bit ready * CORB and RIRB sizes ****************************************************************************/ static int hdac_get_capabilities(struct hdac_softc *sc) { uint16_t gcap; uint8_t corbsize, rirbsize; gcap = HDAC_READ_2(&sc->mem, HDAC_GCAP); sc->num_iss = HDAC_GCAP_ISS(gcap); sc->num_oss = HDAC_GCAP_OSS(gcap); sc->num_bss = HDAC_GCAP_BSS(gcap); sc->num_ss = sc->num_iss + sc->num_oss + sc->num_bss; sc->num_sdo = HDAC_GCAP_NSDO(gcap); sc->support_64bit = (gcap & HDAC_GCAP_64OK) != 0; if (sc->quirks_on & HDAC_QUIRK_64BIT) sc->support_64bit = 1; else if (sc->quirks_off & HDAC_QUIRK_64BIT) sc->support_64bit = 0; corbsize = HDAC_READ_1(&sc->mem, HDAC_CORBSIZE); if ((corbsize & HDAC_CORBSIZE_CORBSZCAP_256) == HDAC_CORBSIZE_CORBSZCAP_256) sc->corb_size = 256; else if ((corbsize & HDAC_CORBSIZE_CORBSZCAP_16) == HDAC_CORBSIZE_CORBSZCAP_16) sc->corb_size = 16; else if ((corbsize & HDAC_CORBSIZE_CORBSZCAP_2) == HDAC_CORBSIZE_CORBSZCAP_2) sc->corb_size = 2; else { device_printf(sc->dev, "%s: Invalid corb size (%x)\n", __func__, corbsize); return (ENXIO); } rirbsize = HDAC_READ_1(&sc->mem, HDAC_RIRBSIZE); if ((rirbsize & HDAC_RIRBSIZE_RIRBSZCAP_256) == HDAC_RIRBSIZE_RIRBSZCAP_256) sc->rirb_size = 256; else if ((rirbsize & HDAC_RIRBSIZE_RIRBSZCAP_16) == HDAC_RIRBSIZE_RIRBSZCAP_16) sc->rirb_size = 16; else if ((rirbsize & HDAC_RIRBSIZE_RIRBSZCAP_2) == HDAC_RIRBSIZE_RIRBSZCAP_2) sc->rirb_size = 2; else { device_printf(sc->dev, "%s: Invalid rirb size (%x)\n", __func__, rirbsize); return (ENXIO); } HDA_BOOTVERBOSE( device_printf(sc->dev, "Caps: OSS %d, ISS %d, BSS %d, " "NSDO %d%s, CORB %d, RIRB %d\n", sc->num_oss, sc->num_iss, sc->num_bss, 1 << sc->num_sdo, sc->support_64bit ? ", 64bit" : "", sc->corb_size, sc->rirb_size); ); return (0); } /**************************************************************************** * void hdac_dma_cb * * This function is called by bus_dmamap_load when the mapping has been * established. We just record the physical address of the mapping into * the struct hdac_dma passed in. ****************************************************************************/ static void hdac_dma_cb(void *callback_arg, bus_dma_segment_t *segs, int nseg, int error) { struct hdac_dma *dma; if (error == 0) { dma = (struct hdac_dma *)callback_arg; dma->dma_paddr = segs[0].ds_addr; } } /**************************************************************************** * int hdac_dma_alloc * * This function allocate and setup a dma region (struct hdac_dma). * It must be freed by a corresponding hdac_dma_free. ****************************************************************************/ static int hdac_dma_alloc(struct hdac_softc *sc, struct hdac_dma *dma, bus_size_t size) { bus_size_t roundsz; int result; roundsz = roundup2(size, HDA_DMA_ALIGNMENT); bzero(dma, sizeof(*dma)); /* * Create a DMA tag */ result = bus_dma_tag_create( bus_get_dma_tag(sc->dev), /* parent */ HDA_DMA_ALIGNMENT, /* alignment */ 0, /* boundary */ (sc->support_64bit) ? BUS_SPACE_MAXADDR : BUS_SPACE_MAXADDR_32BIT, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, /* filtfunc */ NULL, /* fistfuncarg */ roundsz, /* maxsize */ 1, /* nsegments */ roundsz, /* maxsegsz */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &dma->dma_tag); /* dmat */ if (result != 0) { device_printf(sc->dev, "%s: bus_dma_tag_create failed (%x)\n", __func__, result); goto hdac_dma_alloc_fail; } /* * Allocate DMA memory */ result = bus_dmamem_alloc(dma->dma_tag, (void **)&dma->dma_vaddr, BUS_DMA_NOWAIT | BUS_DMA_ZERO | ((sc->flags & HDAC_F_DMA_NOCACHE) ? BUS_DMA_NOCACHE : 0), &dma->dma_map); if (result != 0) { device_printf(sc->dev, "%s: bus_dmamem_alloc failed (%x)\n", __func__, result); goto hdac_dma_alloc_fail; } dma->dma_size = roundsz; /* * Map the memory */ result = bus_dmamap_load(dma->dma_tag, dma->dma_map, (void *)dma->dma_vaddr, roundsz, hdac_dma_cb, (void *)dma, 0); if (result != 0 || dma->dma_paddr == 0) { if (result == 0) result = ENOMEM; device_printf(sc->dev, "%s: bus_dmamem_load failed (%x)\n", __func__, result); goto hdac_dma_alloc_fail; } HDA_BOOTHVERBOSE( device_printf(sc->dev, "%s: size=%ju -> roundsz=%ju\n", __func__, (uintmax_t)size, (uintmax_t)roundsz); ); return (0); hdac_dma_alloc_fail: hdac_dma_free(sc, dma); return (result); } /**************************************************************************** * void hdac_dma_free(struct hdac_softc *, struct hdac_dma *) * * Free a struct dhac_dma that has been previously allocated via the * hdac_dma_alloc function. ****************************************************************************/ static void hdac_dma_free(struct hdac_softc *sc, struct hdac_dma *dma) { if (dma->dma_paddr != 0) { #if 0 /* Flush caches */ bus_dmamap_sync(dma->dma_tag, dma->dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); #endif bus_dmamap_unload(dma->dma_tag, dma->dma_map); dma->dma_paddr = 0; } if (dma->dma_vaddr != NULL) { bus_dmamem_free(dma->dma_tag, dma->dma_vaddr, dma->dma_map); dma->dma_vaddr = NULL; } if (dma->dma_tag != NULL) { bus_dma_tag_destroy(dma->dma_tag); dma->dma_tag = NULL; } dma->dma_size = 0; } /**************************************************************************** * int hdac_mem_alloc(struct hdac_softc *) * * Allocate all the bus resources necessary to speak with the physical * controller. ****************************************************************************/ static int hdac_mem_alloc(struct hdac_softc *sc) { struct hdac_mem *mem; mem = &sc->mem; mem->mem_rid = PCIR_BAR(0); mem->mem_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY, &mem->mem_rid, RF_ACTIVE); if (mem->mem_res == NULL) { device_printf(sc->dev, "%s: Unable to allocate memory resource\n", __func__); return (ENOMEM); } mem->mem_tag = rman_get_bustag(mem->mem_res); mem->mem_handle = rman_get_bushandle(mem->mem_res); return (0); } /**************************************************************************** * void hdac_mem_free(struct hdac_softc *) * * Free up resources previously allocated by hdac_mem_alloc. ****************************************************************************/ static void hdac_mem_free(struct hdac_softc *sc) { struct hdac_mem *mem; mem = &sc->mem; if (mem->mem_res != NULL) bus_release_resource(sc->dev, SYS_RES_MEMORY, mem->mem_rid, mem->mem_res); mem->mem_res = NULL; } /**************************************************************************** * int hdac_irq_alloc(struct hdac_softc *) * * Allocate and setup the resources necessary for interrupt handling. ****************************************************************************/ static int hdac_irq_alloc(struct hdac_softc *sc) { struct hdac_irq *irq; int result; irq = &sc->irq; irq->irq_rid = 0x0; if ((sc->quirks_off & HDAC_QUIRK_MSI) == 0 && (result = pci_msi_count(sc->dev)) == 1 && pci_alloc_msi(sc->dev, &result) == 0) irq->irq_rid = 0x1; irq->irq_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ, &irq->irq_rid, RF_SHAREABLE | RF_ACTIVE); if (irq->irq_res == NULL) { device_printf(sc->dev, "%s: Unable to allocate irq\n", __func__); goto hdac_irq_alloc_fail; } result = bus_setup_intr(sc->dev, irq->irq_res, INTR_MPSAFE | INTR_TYPE_AV, NULL, hdac_intr_handler, sc, &irq->irq_handle); if (result != 0) { device_printf(sc->dev, "%s: Unable to setup interrupt handler (%x)\n", __func__, result); goto hdac_irq_alloc_fail; } return (0); hdac_irq_alloc_fail: hdac_irq_free(sc); return (ENXIO); } /**************************************************************************** * void hdac_irq_free(struct hdac_softc *) * * Free up resources previously allocated by hdac_irq_alloc. ****************************************************************************/ static void hdac_irq_free(struct hdac_softc *sc) { struct hdac_irq *irq; irq = &sc->irq; if (irq->irq_res != NULL && irq->irq_handle != NULL) bus_teardown_intr(sc->dev, irq->irq_res, irq->irq_handle); if (irq->irq_res != NULL) bus_release_resource(sc->dev, SYS_RES_IRQ, irq->irq_rid, irq->irq_res); if (irq->irq_rid == 0x1) pci_release_msi(sc->dev); irq->irq_handle = NULL; irq->irq_res = NULL; irq->irq_rid = 0x0; } /**************************************************************************** * void hdac_corb_init(struct hdac_softc *) * * Initialize the corb registers for operations but do not start it up yet. * The CORB engine must not be running when this function is called. ****************************************************************************/ static void hdac_corb_init(struct hdac_softc *sc) { uint8_t corbsize; uint64_t corbpaddr; /* Setup the CORB size. */ switch (sc->corb_size) { case 256: corbsize = HDAC_CORBSIZE_CORBSIZE(HDAC_CORBSIZE_CORBSIZE_256); break; case 16: corbsize = HDAC_CORBSIZE_CORBSIZE(HDAC_CORBSIZE_CORBSIZE_16); break; case 2: corbsize = HDAC_CORBSIZE_CORBSIZE(HDAC_CORBSIZE_CORBSIZE_2); break; default: panic("%s: Invalid CORB size (%x)\n", __func__, sc->corb_size); } HDAC_WRITE_1(&sc->mem, HDAC_CORBSIZE, corbsize); /* Setup the CORB Address in the hdac */ corbpaddr = (uint64_t)sc->corb_dma.dma_paddr; HDAC_WRITE_4(&sc->mem, HDAC_CORBLBASE, (uint32_t)corbpaddr); HDAC_WRITE_4(&sc->mem, HDAC_CORBUBASE, (uint32_t)(corbpaddr >> 32)); /* Set the WP and RP */ sc->corb_wp = 0; HDAC_WRITE_2(&sc->mem, HDAC_CORBWP, sc->corb_wp); HDAC_WRITE_2(&sc->mem, HDAC_CORBRP, HDAC_CORBRP_CORBRPRST); /* * The HDA specification indicates that the CORBRPRST bit will always * read as zero. Unfortunately, it seems that at least the 82801G * doesn't reset the bit to zero, which stalls the corb engine. * manually reset the bit to zero before continuing. */ HDAC_WRITE_2(&sc->mem, HDAC_CORBRP, 0x0); /* Enable CORB error reporting */ #if 0 HDAC_WRITE_1(&sc->mem, HDAC_CORBCTL, HDAC_CORBCTL_CMEIE); #endif } /**************************************************************************** * void hdac_rirb_init(struct hdac_softc *) * * Initialize the rirb registers for operations but do not start it up yet. * The RIRB engine must not be running when this function is called. ****************************************************************************/ static void hdac_rirb_init(struct hdac_softc *sc) { uint8_t rirbsize; uint64_t rirbpaddr; /* Setup the RIRB size. */ switch (sc->rirb_size) { case 256: rirbsize = HDAC_RIRBSIZE_RIRBSIZE(HDAC_RIRBSIZE_RIRBSIZE_256); break; case 16: rirbsize = HDAC_RIRBSIZE_RIRBSIZE(HDAC_RIRBSIZE_RIRBSIZE_16); break; case 2: rirbsize = HDAC_RIRBSIZE_RIRBSIZE(HDAC_RIRBSIZE_RIRBSIZE_2); break; default: panic("%s: Invalid RIRB size (%x)\n", __func__, sc->rirb_size); } HDAC_WRITE_1(&sc->mem, HDAC_RIRBSIZE, rirbsize); /* Setup the RIRB Address in the hdac */ rirbpaddr = (uint64_t)sc->rirb_dma.dma_paddr; HDAC_WRITE_4(&sc->mem, HDAC_RIRBLBASE, (uint32_t)rirbpaddr); HDAC_WRITE_4(&sc->mem, HDAC_RIRBUBASE, (uint32_t)(rirbpaddr >> 32)); /* Setup the WP and RP */ sc->rirb_rp = 0; HDAC_WRITE_2(&sc->mem, HDAC_RIRBWP, HDAC_RIRBWP_RIRBWPRST); /* Setup the interrupt threshold */ HDAC_WRITE_2(&sc->mem, HDAC_RINTCNT, sc->rirb_size / 2); /* Enable Overrun and response received reporting */ #if 0 HDAC_WRITE_1(&sc->mem, HDAC_RIRBCTL, HDAC_RIRBCTL_RIRBOIC | HDAC_RIRBCTL_RINTCTL); #else HDAC_WRITE_1(&sc->mem, HDAC_RIRBCTL, HDAC_RIRBCTL_RINTCTL); #endif #if 0 /* * Make sure that the Host CPU cache doesn't contain any dirty * cache lines that falls in the rirb. If I understood correctly, it * should be sufficient to do this only once as the rirb is purely * read-only from now on. */ bus_dmamap_sync(sc->rirb_dma.dma_tag, sc->rirb_dma.dma_map, BUS_DMASYNC_PREREAD); #endif } /**************************************************************************** * void hdac_corb_start(hdac_softc *) * * Startup the corb DMA engine ****************************************************************************/ static void hdac_corb_start(struct hdac_softc *sc) { uint32_t corbctl; corbctl = HDAC_READ_1(&sc->mem, HDAC_CORBCTL); corbctl |= HDAC_CORBCTL_CORBRUN; HDAC_WRITE_1(&sc->mem, HDAC_CORBCTL, corbctl); } /**************************************************************************** * void hdac_rirb_start(hdac_softc *) * * Startup the rirb DMA engine ****************************************************************************/ static void hdac_rirb_start(struct hdac_softc *sc) { uint32_t rirbctl; rirbctl = HDAC_READ_1(&sc->mem, HDAC_RIRBCTL); rirbctl |= HDAC_RIRBCTL_RIRBDMAEN; HDAC_WRITE_1(&sc->mem, HDAC_RIRBCTL, rirbctl); } static int hdac_rirb_flush(struct hdac_softc *sc) { struct hdac_rirb *rirb_base, *rirb; nid_t cad; uint32_t resp; uint8_t rirbwp; int ret; rirb_base = (struct hdac_rirb *)sc->rirb_dma.dma_vaddr; rirbwp = HDAC_READ_1(&sc->mem, HDAC_RIRBWP); #if 0 bus_dmamap_sync(sc->rirb_dma.dma_tag, sc->rirb_dma.dma_map, BUS_DMASYNC_POSTREAD); #endif ret = 0; while (sc->rirb_rp != rirbwp) { sc->rirb_rp++; sc->rirb_rp %= sc->rirb_size; rirb = &rirb_base[sc->rirb_rp]; cad = HDAC_RIRB_RESPONSE_EX_SDATA_IN(rirb->response_ex); resp = rirb->response; if (rirb->response_ex & HDAC_RIRB_RESPONSE_EX_UNSOLICITED) { sc->unsolq[sc->unsolq_wp++] = resp; sc->unsolq_wp %= HDAC_UNSOLQ_MAX; sc->unsolq[sc->unsolq_wp++] = cad; sc->unsolq_wp %= HDAC_UNSOLQ_MAX; } else if (sc->codecs[cad].pending <= 0) { device_printf(sc->dev, "Unexpected unsolicited " "response from address %d: %08x\n", cad, resp); } else { sc->codecs[cad].response = resp; sc->codecs[cad].pending--; } ret++; } return (ret); } static int hdac_unsolq_flush(struct hdac_softc *sc) { device_t child; nid_t cad; uint32_t resp; int ret = 0; if (sc->unsolq_st == HDAC_UNSOLQ_READY) { sc->unsolq_st = HDAC_UNSOLQ_BUSY; while (sc->unsolq_rp != sc->unsolq_wp) { resp = sc->unsolq[sc->unsolq_rp++]; sc->unsolq_rp %= HDAC_UNSOLQ_MAX; cad = sc->unsolq[sc->unsolq_rp++]; sc->unsolq_rp %= HDAC_UNSOLQ_MAX; if ((child = sc->codecs[cad].dev) != NULL) HDAC_UNSOL_INTR(child, resp); ret++; } sc->unsolq_st = HDAC_UNSOLQ_READY; } return (ret); } /**************************************************************************** * uint32_t hdac_command_sendone_internal * * Wrapper function that sends only one command to a given codec ****************************************************************************/ static uint32_t hdac_send_command(struct hdac_softc *sc, nid_t cad, uint32_t verb) { int timeout; uint32_t *corb; if (!hdac_lockowned(sc)) device_printf(sc->dev, "WARNING!!!! mtx not owned!!!!\n"); verb &= ~HDA_CMD_CAD_MASK; verb |= ((uint32_t)cad) << HDA_CMD_CAD_SHIFT; sc->codecs[cad].response = HDA_INVALID; sc->codecs[cad].pending++; sc->corb_wp++; sc->corb_wp %= sc->corb_size; corb = (uint32_t *)sc->corb_dma.dma_vaddr; #if 0 bus_dmamap_sync(sc->corb_dma.dma_tag, sc->corb_dma.dma_map, BUS_DMASYNC_PREWRITE); #endif corb[sc->corb_wp] = verb; #if 0 bus_dmamap_sync(sc->corb_dma.dma_tag, sc->corb_dma.dma_map, BUS_DMASYNC_POSTWRITE); #endif HDAC_WRITE_2(&sc->mem, HDAC_CORBWP, sc->corb_wp); timeout = 10000; do { if (hdac_rirb_flush(sc) == 0) DELAY(10); } while (sc->codecs[cad].pending != 0 && --timeout); if (sc->codecs[cad].pending != 0) { device_printf(sc->dev, "Command timeout on address %d\n", cad); sc->codecs[cad].pending = 0; } if (sc->unsolq_rp != sc->unsolq_wp) taskqueue_enqueue(taskqueue_thread, &sc->unsolq_task); return (sc->codecs[cad].response); } /**************************************************************************** * Device Methods ****************************************************************************/ /**************************************************************************** * int hdac_probe(device_t) * * Probe for the presence of an hdac. If none is found, check for a generic * match using the subclass of the device. ****************************************************************************/ static int hdac_probe(device_t dev) { int i, result; uint32_t model; uint16_t class, subclass; char desc[64]; model = (uint32_t)pci_get_device(dev) << 16; model |= (uint32_t)pci_get_vendor(dev) & 0x0000ffff; class = pci_get_class(dev); subclass = pci_get_subclass(dev); bzero(desc, sizeof(desc)); result = ENXIO; for (i = 0; i < nitems(hdac_devices); i++) { if (hdac_devices[i].model == model) { strlcpy(desc, hdac_devices[i].desc, sizeof(desc)); result = BUS_PROBE_DEFAULT; break; } if (HDA_DEV_MATCH(hdac_devices[i].model, model) && class == PCIC_MULTIMEDIA && subclass == PCIS_MULTIMEDIA_HDA) { snprintf(desc, sizeof(desc), "%s (0x%04x)", hdac_devices[i].desc, pci_get_device(dev)); result = BUS_PROBE_GENERIC; break; } } if (result == ENXIO && class == PCIC_MULTIMEDIA && subclass == PCIS_MULTIMEDIA_HDA) { snprintf(desc, sizeof(desc), "Generic (0x%08x)", model); result = BUS_PROBE_GENERIC; } if (result != ENXIO) { strlcat(desc, " HDA Controller", sizeof(desc)); device_set_desc_copy(dev, desc); } return (result); } static void hdac_unsolq_task(void *context, int pending) { struct hdac_softc *sc; sc = (struct hdac_softc *)context; hdac_lock(sc); hdac_unsolq_flush(sc); hdac_unlock(sc); } /**************************************************************************** * int hdac_attach(device_t) * * Attach the device into the kernel. Interrupts usually won't be enabled * when this function is called. Setup everything that doesn't require * interrupts and defer probing of codecs until interrupts are enabled. ****************************************************************************/ static int hdac_attach(device_t dev) { struct hdac_softc *sc; int result; int i, devid = -1; uint32_t model; uint16_t class, subclass; uint16_t vendor; uint8_t v; sc = device_get_softc(dev); HDA_BOOTVERBOSE( device_printf(dev, "PCI card vendor: 0x%04x, device: 0x%04x\n", pci_get_subvendor(dev), pci_get_subdevice(dev)); device_printf(dev, "HDA Driver Revision: %s\n", HDA_DRV_TEST_REV); ); model = (uint32_t)pci_get_device(dev) << 16; model |= (uint32_t)pci_get_vendor(dev) & 0x0000ffff; class = pci_get_class(dev); subclass = pci_get_subclass(dev); for (i = 0; i < nitems(hdac_devices); i++) { if (hdac_devices[i].model == model) { devid = i; break; } if (HDA_DEV_MATCH(hdac_devices[i].model, model) && class == PCIC_MULTIMEDIA && subclass == PCIS_MULTIMEDIA_HDA) { devid = i; break; } } sc->lock = snd_mtxcreate(device_get_nameunit(dev), "HDA driver mutex"); sc->dev = dev; TASK_INIT(&sc->unsolq_task, 0, hdac_unsolq_task, sc); callout_init(&sc->poll_callout, 1); for (i = 0; i < HDAC_CODEC_MAX; i++) sc->codecs[i].dev = NULL; if (devid >= 0) { sc->quirks_on = hdac_devices[devid].quirks_on; sc->quirks_off = hdac_devices[devid].quirks_off; } else { sc->quirks_on = 0; sc->quirks_off = 0; } if (resource_int_value(device_get_name(dev), device_get_unit(dev), "msi", &i) == 0) { if (i == 0) sc->quirks_off |= HDAC_QUIRK_MSI; else { sc->quirks_on |= HDAC_QUIRK_MSI; sc->quirks_off |= ~HDAC_QUIRK_MSI; } } hdac_config_fetch(sc, &sc->quirks_on, &sc->quirks_off); HDA_BOOTVERBOSE( device_printf(sc->dev, "Config options: on=0x%08x off=0x%08x\n", sc->quirks_on, sc->quirks_off); ); sc->poll_ival = hz; if (resource_int_value(device_get_name(dev), device_get_unit(dev), "polling", &i) == 0 && i != 0) sc->polling = 1; else sc->polling = 0; pci_enable_busmaster(dev); vendor = pci_get_vendor(dev); if (vendor == INTEL_VENDORID) { /* TCSEL -> TC0 */ v = pci_read_config(dev, 0x44, 1); pci_write_config(dev, 0x44, v & 0xf8, 1); HDA_BOOTHVERBOSE( device_printf(dev, "TCSEL: 0x%02d -> 0x%02d\n", v, pci_read_config(dev, 0x44, 1)); ); } #if defined(__i386__) || defined(__amd64__) sc->flags |= HDAC_F_DMA_NOCACHE; if (resource_int_value(device_get_name(dev), device_get_unit(dev), "snoop", &i) == 0 && i != 0) { #else sc->flags &= ~HDAC_F_DMA_NOCACHE; #endif /* * Try to enable PCIe snoop to avoid messing around with * uncacheable DMA attribute. Since PCIe snoop register * config is pretty much vendor specific, there are no * general solutions on how to enable it, forcing us (even * Microsoft) to enable uncacheable or write combined DMA * by default. * * http://msdn2.microsoft.com/en-us/library/ms790324.aspx */ for (i = 0; i < nitems(hdac_pcie_snoop); i++) { if (hdac_pcie_snoop[i].vendor != vendor) continue; sc->flags &= ~HDAC_F_DMA_NOCACHE; if (hdac_pcie_snoop[i].reg == 0x00) break; v = pci_read_config(dev, hdac_pcie_snoop[i].reg, 1); if ((v & hdac_pcie_snoop[i].enable) == hdac_pcie_snoop[i].enable) break; v &= hdac_pcie_snoop[i].mask; v |= hdac_pcie_snoop[i].enable; pci_write_config(dev, hdac_pcie_snoop[i].reg, v, 1); v = pci_read_config(dev, hdac_pcie_snoop[i].reg, 1); if ((v & hdac_pcie_snoop[i].enable) != hdac_pcie_snoop[i].enable) { HDA_BOOTVERBOSE( device_printf(dev, "WARNING: Failed to enable PCIe " "snoop!\n"); ); #if defined(__i386__) || defined(__amd64__) sc->flags |= HDAC_F_DMA_NOCACHE; #endif } break; } #if defined(__i386__) || defined(__amd64__) } #endif HDA_BOOTHVERBOSE( device_printf(dev, "DMA Coherency: %s / vendor=0x%04x\n", (sc->flags & HDAC_F_DMA_NOCACHE) ? "Uncacheable" : "PCIe snoop", vendor); ); /* Allocate resources */ result = hdac_mem_alloc(sc); if (result != 0) goto hdac_attach_fail; result = hdac_irq_alloc(sc); if (result != 0) goto hdac_attach_fail; /* Get Capabilities */ result = hdac_get_capabilities(sc); if (result != 0) goto hdac_attach_fail; /* Allocate CORB, RIRB, POS and BDLs dma memory */ result = hdac_dma_alloc(sc, &sc->corb_dma, sc->corb_size * sizeof(uint32_t)); if (result != 0) goto hdac_attach_fail; result = hdac_dma_alloc(sc, &sc->rirb_dma, sc->rirb_size * sizeof(struct hdac_rirb)); if (result != 0) goto hdac_attach_fail; sc->streams = malloc(sizeof(struct hdac_stream) * sc->num_ss, M_HDAC, M_ZERO | M_WAITOK); for (i = 0; i < sc->num_ss; i++) { result = hdac_dma_alloc(sc, &sc->streams[i].bdl, sizeof(struct hdac_bdle) * HDA_BDL_MAX); if (result != 0) goto hdac_attach_fail; } if (sc->quirks_on & HDAC_QUIRK_DMAPOS) { if (hdac_dma_alloc(sc, &sc->pos_dma, (sc->num_ss) * 8) != 0) { HDA_BOOTVERBOSE( device_printf(dev, "Failed to " "allocate DMA pos buffer " "(non-fatal)\n"); ); } else { uint64_t addr = sc->pos_dma.dma_paddr; HDAC_WRITE_4(&sc->mem, HDAC_DPIBUBASE, addr >> 32); HDAC_WRITE_4(&sc->mem, HDAC_DPIBLBASE, (addr & HDAC_DPLBASE_DPLBASE_MASK) | HDAC_DPLBASE_DPLBASE_DMAPBE); } } result = bus_dma_tag_create( bus_get_dma_tag(sc->dev), /* parent */ HDA_DMA_ALIGNMENT, /* alignment */ 0, /* boundary */ (sc->support_64bit) ? BUS_SPACE_MAXADDR : BUS_SPACE_MAXADDR_32BIT, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, /* filtfunc */ NULL, /* fistfuncarg */ HDA_BUFSZ_MAX, /* maxsize */ 1, /* nsegments */ HDA_BUFSZ_MAX, /* maxsegsz */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &sc->chan_dmat); /* dmat */ if (result != 0) { device_printf(dev, "%s: bus_dma_tag_create failed (%x)\n", __func__, result); goto hdac_attach_fail; } /* Quiesce everything */ HDA_BOOTHVERBOSE( device_printf(dev, "Reset controller...\n"); ); hdac_reset(sc, 1); /* Initialize the CORB and RIRB */ hdac_corb_init(sc); hdac_rirb_init(sc); /* Defer remaining of initialization until interrupts are enabled */ sc->intrhook.ich_func = hdac_attach2; sc->intrhook.ich_arg = (void *)sc; if (cold == 0 || config_intrhook_establish(&sc->intrhook) != 0) { sc->intrhook.ich_func = NULL; hdac_attach2((void *)sc); } return (0); hdac_attach_fail: hdac_irq_free(sc); for (i = 0; i < sc->num_ss; i++) hdac_dma_free(sc, &sc->streams[i].bdl); free(sc->streams, M_HDAC); hdac_dma_free(sc, &sc->rirb_dma); hdac_dma_free(sc, &sc->corb_dma); hdac_mem_free(sc); snd_mtxfree(sc->lock); return (ENXIO); } static int sysctl_hdac_pindump(SYSCTL_HANDLER_ARGS) { struct hdac_softc *sc; device_t *devlist; device_t dev; int devcount, i, err, val; dev = oidp->oid_arg1; sc = device_get_softc(dev); if (sc == NULL) return (EINVAL); val = 0; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL || val == 0) return (err); /* XXX: Temporary. For debugging. */ if (val == 100) { hdac_suspend(dev); return (0); } else if (val == 101) { hdac_resume(dev); return (0); } if ((err = device_get_children(dev, &devlist, &devcount)) != 0) return (err); hdac_lock(sc); for (i = 0; i < devcount; i++) HDAC_PINDUMP(devlist[i]); hdac_unlock(sc); free(devlist, M_TEMP); return (0); } static int hdac_mdata_rate(uint16_t fmt) { static const int mbits[8] = { 8, 16, 32, 32, 32, 32, 32, 32 }; int rate, bits; if (fmt & (1 << 14)) rate = 44100; else rate = 48000; rate *= ((fmt >> 11) & 0x07) + 1; rate /= ((fmt >> 8) & 0x07) + 1; bits = mbits[(fmt >> 4) & 0x03]; bits *= (fmt & 0x0f) + 1; return (rate * bits); } static int hdac_bdata_rate(uint16_t fmt, int output) { static const int bbits[8] = { 8, 16, 20, 24, 32, 32, 32, 32 }; int rate, bits; rate = 48000; rate *= ((fmt >> 11) & 0x07) + 1; bits = bbits[(fmt >> 4) & 0x03]; bits *= (fmt & 0x0f) + 1; if (!output) bits = ((bits + 7) & ~0x07) + 10; return (rate * bits); } static void hdac_poll_reinit(struct hdac_softc *sc) { int i, pollticks, min = 1000000; struct hdac_stream *s; if (sc->polling == 0) return; if (sc->unsol_registered > 0) min = hz / 2; for (i = 0; i < sc->num_ss; i++) { s = &sc->streams[i]; if (s->running == 0) continue; pollticks = ((uint64_t)hz * s->blksz) / (hdac_mdata_rate(s->format) / 8); pollticks >>= 1; if (pollticks > hz) pollticks = hz; if (pollticks < 1) { HDA_BOOTVERBOSE( device_printf(sc->dev, "poll interval < 1 tick !\n"); ); pollticks = 1; } if (min > pollticks) min = pollticks; } HDA_BOOTVERBOSE( device_printf(sc->dev, "poll interval %d -> %d ticks\n", sc->poll_ival, min); ); sc->poll_ival = min; if (min == 1000000) callout_stop(&sc->poll_callout); else callout_reset(&sc->poll_callout, 1, hdac_poll_callback, sc); } static int sysctl_hdac_polling(SYSCTL_HANDLER_ARGS) { struct hdac_softc *sc; device_t dev; uint32_t ctl; int err, val; dev = oidp->oid_arg1; sc = device_get_softc(dev); if (sc == NULL) return (EINVAL); hdac_lock(sc); val = sc->polling; hdac_unlock(sc); err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < 0 || val > 1) return (EINVAL); hdac_lock(sc); if (val != sc->polling) { if (val == 0) { callout_stop(&sc->poll_callout); hdac_unlock(sc); callout_drain(&sc->poll_callout); hdac_lock(sc); sc->polling = 0; ctl = HDAC_READ_4(&sc->mem, HDAC_INTCTL); ctl |= HDAC_INTCTL_GIE; HDAC_WRITE_4(&sc->mem, HDAC_INTCTL, ctl); } else { ctl = HDAC_READ_4(&sc->mem, HDAC_INTCTL); ctl &= ~HDAC_INTCTL_GIE; HDAC_WRITE_4(&sc->mem, HDAC_INTCTL, ctl); sc->polling = 1; hdac_poll_reinit(sc); } } hdac_unlock(sc); return (err); } static void hdac_attach2(void *arg) { struct hdac_softc *sc; device_t child; uint32_t vendorid, revisionid; int i; uint16_t statests; sc = (struct hdac_softc *)arg; hdac_lock(sc); /* Remove ourselves from the config hooks */ if (sc->intrhook.ich_func != NULL) { config_intrhook_disestablish(&sc->intrhook); sc->intrhook.ich_func = NULL; } HDA_BOOTHVERBOSE( device_printf(sc->dev, "Starting CORB Engine...\n"); ); hdac_corb_start(sc); HDA_BOOTHVERBOSE( device_printf(sc->dev, "Starting RIRB Engine...\n"); ); hdac_rirb_start(sc); HDA_BOOTHVERBOSE( device_printf(sc->dev, "Enabling controller interrupt...\n"); ); HDAC_WRITE_4(&sc->mem, HDAC_GCTL, HDAC_READ_4(&sc->mem, HDAC_GCTL) | HDAC_GCTL_UNSOL); if (sc->polling == 0) { HDAC_WRITE_4(&sc->mem, HDAC_INTCTL, HDAC_INTCTL_CIE | HDAC_INTCTL_GIE); } DELAY(1000); HDA_BOOTHVERBOSE( device_printf(sc->dev, "Scanning HDA codecs ...\n"); ); statests = HDAC_READ_2(&sc->mem, HDAC_STATESTS); hdac_unlock(sc); for (i = 0; i < HDAC_CODEC_MAX; i++) { if (HDAC_STATESTS_SDIWAKE(statests, i)) { HDA_BOOTHVERBOSE( device_printf(sc->dev, "Found CODEC at address %d\n", i); ); hdac_lock(sc); vendorid = hdac_send_command(sc, i, HDA_CMD_GET_PARAMETER(0, 0x0, HDA_PARAM_VENDOR_ID)); revisionid = hdac_send_command(sc, i, HDA_CMD_GET_PARAMETER(0, 0x0, HDA_PARAM_REVISION_ID)); hdac_unlock(sc); if (vendorid == HDA_INVALID && revisionid == HDA_INVALID) { device_printf(sc->dev, "CODEC is not responding!\n"); continue; } sc->codecs[i].vendor_id = HDA_PARAM_VENDOR_ID_VENDOR_ID(vendorid); sc->codecs[i].device_id = HDA_PARAM_VENDOR_ID_DEVICE_ID(vendorid); sc->codecs[i].revision_id = HDA_PARAM_REVISION_ID_REVISION_ID(revisionid); sc->codecs[i].stepping_id = HDA_PARAM_REVISION_ID_STEPPING_ID(revisionid); child = device_add_child(sc->dev, "hdacc", -1); if (child == NULL) { device_printf(sc->dev, "Failed to add CODEC device\n"); continue; } device_set_ivars(child, (void *)(intptr_t)i); sc->codecs[i].dev = child; } } bus_generic_attach(sc->dev); SYSCTL_ADD_PROC(device_get_sysctl_ctx(sc->dev), SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO, "pindump", CTLTYPE_INT | CTLFLAG_RW, sc->dev, sizeof(sc->dev), sysctl_hdac_pindump, "I", "Dump pin states/data"); SYSCTL_ADD_PROC(device_get_sysctl_ctx(sc->dev), SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO, "polling", CTLTYPE_INT | CTLFLAG_RW, sc->dev, sizeof(sc->dev), sysctl_hdac_polling, "I", "Enable polling mode"); } /**************************************************************************** * int hdac_suspend(device_t) * * Suspend and power down HDA bus and codecs. ****************************************************************************/ static int hdac_suspend(device_t dev) { struct hdac_softc *sc = device_get_softc(dev); HDA_BOOTHVERBOSE( device_printf(dev, "Suspend...\n"); ); bus_generic_suspend(dev); hdac_lock(sc); HDA_BOOTHVERBOSE( device_printf(dev, "Reset controller...\n"); ); callout_stop(&sc->poll_callout); hdac_reset(sc, 0); hdac_unlock(sc); callout_drain(&sc->poll_callout); taskqueue_drain(taskqueue_thread, &sc->unsolq_task); HDA_BOOTHVERBOSE( device_printf(dev, "Suspend done\n"); ); return (0); } /**************************************************************************** * int hdac_resume(device_t) * * Powerup and restore HDA bus and codecs state. ****************************************************************************/ static int hdac_resume(device_t dev) { struct hdac_softc *sc = device_get_softc(dev); int error; HDA_BOOTHVERBOSE( device_printf(dev, "Resume...\n"); ); hdac_lock(sc); /* Quiesce everything */ HDA_BOOTHVERBOSE( device_printf(dev, "Reset controller...\n"); ); hdac_reset(sc, 1); /* Initialize the CORB and RIRB */ hdac_corb_init(sc); hdac_rirb_init(sc); HDA_BOOTHVERBOSE( device_printf(dev, "Starting CORB Engine...\n"); ); hdac_corb_start(sc); HDA_BOOTHVERBOSE( device_printf(dev, "Starting RIRB Engine...\n"); ); hdac_rirb_start(sc); HDA_BOOTHVERBOSE( device_printf(dev, "Enabling controller interrupt...\n"); ); HDAC_WRITE_4(&sc->mem, HDAC_GCTL, HDAC_READ_4(&sc->mem, HDAC_GCTL) | HDAC_GCTL_UNSOL); HDAC_WRITE_4(&sc->mem, HDAC_INTCTL, HDAC_INTCTL_CIE | HDAC_INTCTL_GIE); DELAY(1000); hdac_poll_reinit(sc); hdac_unlock(sc); error = bus_generic_resume(dev); HDA_BOOTHVERBOSE( device_printf(dev, "Resume done\n"); ); return (error); } /**************************************************************************** * int hdac_detach(device_t) * * Detach and free up resources utilized by the hdac device. ****************************************************************************/ static int hdac_detach(device_t dev) { struct hdac_softc *sc = device_get_softc(dev); device_t *devlist; int cad, i, devcount, error; if ((error = device_get_children(dev, &devlist, &devcount)) != 0) return (error); for (i = 0; i < devcount; i++) { cad = (intptr_t)device_get_ivars(devlist[i]); if ((error = device_delete_child(dev, devlist[i])) != 0) { free(devlist, M_TEMP); return (error); } sc->codecs[cad].dev = NULL; } free(devlist, M_TEMP); hdac_lock(sc); hdac_reset(sc, 0); hdac_unlock(sc); taskqueue_drain(taskqueue_thread, &sc->unsolq_task); hdac_irq_free(sc); for (i = 0; i < sc->num_ss; i++) hdac_dma_free(sc, &sc->streams[i].bdl); free(sc->streams, M_HDAC); hdac_dma_free(sc, &sc->pos_dma); hdac_dma_free(sc, &sc->rirb_dma); hdac_dma_free(sc, &sc->corb_dma); if (sc->chan_dmat != NULL) { bus_dma_tag_destroy(sc->chan_dmat); sc->chan_dmat = NULL; } hdac_mem_free(sc); snd_mtxfree(sc->lock); return (0); } static bus_dma_tag_t hdac_get_dma_tag(device_t dev, device_t child) { struct hdac_softc *sc = device_get_softc(dev); return (sc->chan_dmat); } static int hdac_print_child(device_t dev, device_t child) { int retval; retval = bus_print_child_header(dev, child); retval += printf(" at cad %d", (int)(intptr_t)device_get_ivars(child)); retval += bus_print_child_footer(dev, child); return (retval); } static int hdac_child_location_str(device_t dev, device_t child, char *buf, size_t buflen) { snprintf(buf, buflen, "cad=%d", (int)(intptr_t)device_get_ivars(child)); return (0); } static int hdac_child_pnpinfo_str_method(device_t dev, device_t child, char *buf, size_t buflen) { struct hdac_softc *sc = device_get_softc(dev); nid_t cad = (uintptr_t)device_get_ivars(child); snprintf(buf, buflen, "vendor=0x%04x device=0x%04x revision=0x%02x " "stepping=0x%02x", sc->codecs[cad].vendor_id, sc->codecs[cad].device_id, sc->codecs[cad].revision_id, sc->codecs[cad].stepping_id); return (0); } static int hdac_read_ivar(device_t dev, device_t child, int which, uintptr_t *result) { struct hdac_softc *sc = device_get_softc(dev); nid_t cad = (uintptr_t)device_get_ivars(child); switch (which) { case HDA_IVAR_CODEC_ID: *result = cad; break; case HDA_IVAR_VENDOR_ID: *result = sc->codecs[cad].vendor_id; break; case HDA_IVAR_DEVICE_ID: *result = sc->codecs[cad].device_id; break; case HDA_IVAR_REVISION_ID: *result = sc->codecs[cad].revision_id; break; case HDA_IVAR_STEPPING_ID: *result = sc->codecs[cad].stepping_id; break; case HDA_IVAR_SUBVENDOR_ID: *result = pci_get_subvendor(dev); break; case HDA_IVAR_SUBDEVICE_ID: *result = pci_get_subdevice(dev); break; case HDA_IVAR_DMA_NOCACHE: *result = (sc->flags & HDAC_F_DMA_NOCACHE) != 0; break; default: return (ENOENT); } return (0); } static struct mtx * hdac_get_mtx(device_t dev, device_t child) { struct hdac_softc *sc = device_get_softc(dev); return (sc->lock); } static uint32_t hdac_codec_command(device_t dev, device_t child, uint32_t verb) { return (hdac_send_command(device_get_softc(dev), (intptr_t)device_get_ivars(child), verb)); } static int hdac_find_stream(struct hdac_softc *sc, int dir, int stream) { int i, ss; ss = -1; /* Allocate ISS/BSS first. */ if (dir == 0) { for (i = 0; i < sc->num_iss; i++) { if (sc->streams[i].stream == stream) { ss = i; break; } } } else { for (i = 0; i < sc->num_oss; i++) { if (sc->streams[i + sc->num_iss].stream == stream) { ss = i + sc->num_iss; break; } } } /* Fallback to BSS. */ if (ss == -1) { for (i = 0; i < sc->num_bss; i++) { if (sc->streams[i + sc->num_iss + sc->num_oss].stream == stream) { ss = i + sc->num_iss + sc->num_oss; break; } } } return (ss); } static int hdac_stream_alloc(device_t dev, device_t child, int dir, int format, int stripe, uint32_t **dmapos) { struct hdac_softc *sc = device_get_softc(dev); nid_t cad = (uintptr_t)device_get_ivars(child); int stream, ss, bw, maxbw, prevbw; /* Look for empty stream. */ ss = hdac_find_stream(sc, dir, 0); /* Return if found nothing. */ if (ss < 0) return (0); /* Check bus bandwidth. */ bw = hdac_bdata_rate(format, dir); if (dir == 1) { bw *= 1 << (sc->num_sdo - stripe); prevbw = sc->sdo_bw_used; maxbw = 48000 * 960 * (1 << sc->num_sdo); } else { prevbw = sc->codecs[cad].sdi_bw_used; maxbw = 48000 * 464; } HDA_BOOTHVERBOSE( device_printf(dev, "%dKbps of %dKbps bandwidth used%s\n", (bw + prevbw) / 1000, maxbw / 1000, bw + prevbw > maxbw ? " -- OVERFLOW!" : ""); ); if (bw + prevbw > maxbw) return (0); if (dir == 1) sc->sdo_bw_used += bw; else sc->codecs[cad].sdi_bw_used += bw; /* Allocate stream number */ if (ss >= sc->num_iss + sc->num_oss) stream = 15 - (ss - sc->num_iss + sc->num_oss); else if (ss >= sc->num_iss) stream = ss - sc->num_iss + 1; else stream = ss + 1; sc->streams[ss].dev = child; sc->streams[ss].dir = dir; sc->streams[ss].stream = stream; sc->streams[ss].bw = bw; sc->streams[ss].format = format; sc->streams[ss].stripe = stripe; if (dmapos != NULL) { if (sc->pos_dma.dma_vaddr != NULL) *dmapos = (uint32_t *)(sc->pos_dma.dma_vaddr + ss * 8); else *dmapos = NULL; } return (stream); } static void hdac_stream_free(device_t dev, device_t child, int dir, int stream) { struct hdac_softc *sc = device_get_softc(dev); nid_t cad = (uintptr_t)device_get_ivars(child); int ss; ss = hdac_find_stream(sc, dir, stream); KASSERT(ss >= 0, ("Free for not allocated stream (%d/%d)\n", dir, stream)); if (dir == 1) sc->sdo_bw_used -= sc->streams[ss].bw; else sc->codecs[cad].sdi_bw_used -= sc->streams[ss].bw; sc->streams[ss].stream = 0; sc->streams[ss].dev = NULL; } static int hdac_stream_start(device_t dev, device_t child, int dir, int stream, bus_addr_t buf, int blksz, int blkcnt) { struct hdac_softc *sc = device_get_softc(dev); struct hdac_bdle *bdle; uint64_t addr; int i, ss, off; uint32_t ctl; ss = hdac_find_stream(sc, dir, stream); KASSERT(ss >= 0, ("Start for not allocated stream (%d/%d)\n", dir, stream)); addr = (uint64_t)buf; bdle = (struct hdac_bdle *)sc->streams[ss].bdl.dma_vaddr; for (i = 0; i < blkcnt; i++, bdle++) { bdle->addrl = (uint32_t)addr; bdle->addrh = (uint32_t)(addr >> 32); bdle->len = blksz; bdle->ioc = 1; addr += blksz; } off = ss << 5; HDAC_WRITE_4(&sc->mem, off + HDAC_SDCBL, blksz * blkcnt); HDAC_WRITE_2(&sc->mem, off + HDAC_SDLVI, blkcnt - 1); addr = sc->streams[ss].bdl.dma_paddr; HDAC_WRITE_4(&sc->mem, off + HDAC_SDBDPL, (uint32_t)addr); HDAC_WRITE_4(&sc->mem, off + HDAC_SDBDPU, (uint32_t)(addr >> 32)); ctl = HDAC_READ_1(&sc->mem, off + HDAC_SDCTL2); if (dir) ctl |= HDAC_SDCTL2_DIR; else ctl &= ~HDAC_SDCTL2_DIR; ctl &= ~HDAC_SDCTL2_STRM_MASK; ctl |= stream << HDAC_SDCTL2_STRM_SHIFT; ctl &= ~HDAC_SDCTL2_STRIPE_MASK; ctl |= sc->streams[ss].stripe << HDAC_SDCTL2_STRIPE_SHIFT; HDAC_WRITE_1(&sc->mem, off + HDAC_SDCTL2, ctl); HDAC_WRITE_2(&sc->mem, off + HDAC_SDFMT, sc->streams[ss].format); ctl = HDAC_READ_4(&sc->mem, HDAC_INTCTL); ctl |= 1 << ss; HDAC_WRITE_4(&sc->mem, HDAC_INTCTL, ctl); HDAC_WRITE_1(&sc->mem, off + HDAC_SDSTS, HDAC_SDSTS_DESE | HDAC_SDSTS_FIFOE | HDAC_SDSTS_BCIS); ctl = HDAC_READ_1(&sc->mem, off + HDAC_SDCTL0); ctl |= HDAC_SDCTL_IOCE | HDAC_SDCTL_FEIE | HDAC_SDCTL_DEIE | HDAC_SDCTL_RUN; HDAC_WRITE_1(&sc->mem, off + HDAC_SDCTL0, ctl); sc->streams[ss].blksz = blksz; sc->streams[ss].running = 1; hdac_poll_reinit(sc); return (0); } static void hdac_stream_stop(device_t dev, device_t child, int dir, int stream) { struct hdac_softc *sc = device_get_softc(dev); int ss, off; uint32_t ctl; ss = hdac_find_stream(sc, dir, stream); KASSERT(ss >= 0, ("Stop for not allocated stream (%d/%d)\n", dir, stream)); off = ss << 5; ctl = HDAC_READ_1(&sc->mem, off + HDAC_SDCTL0); ctl &= ~(HDAC_SDCTL_IOCE | HDAC_SDCTL_FEIE | HDAC_SDCTL_DEIE | HDAC_SDCTL_RUN); HDAC_WRITE_1(&sc->mem, off + HDAC_SDCTL0, ctl); ctl = HDAC_READ_4(&sc->mem, HDAC_INTCTL); ctl &= ~(1 << ss); HDAC_WRITE_4(&sc->mem, HDAC_INTCTL, ctl); sc->streams[ss].running = 0; hdac_poll_reinit(sc); } static void hdac_stream_reset(device_t dev, device_t child, int dir, int stream) { struct hdac_softc *sc = device_get_softc(dev); int timeout = 1000; int to = timeout; int ss, off; uint32_t ctl; ss = hdac_find_stream(sc, dir, stream); KASSERT(ss >= 0, ("Reset for not allocated stream (%d/%d)\n", dir, stream)); off = ss << 5; ctl = HDAC_READ_1(&sc->mem, off + HDAC_SDCTL0); ctl |= HDAC_SDCTL_SRST; HDAC_WRITE_1(&sc->mem, off + HDAC_SDCTL0, ctl); do { ctl = HDAC_READ_1(&sc->mem, off + HDAC_SDCTL0); if (ctl & HDAC_SDCTL_SRST) break; DELAY(10); } while (--to); if (!(ctl & HDAC_SDCTL_SRST)) device_printf(dev, "Reset setting timeout\n"); ctl &= ~HDAC_SDCTL_SRST; HDAC_WRITE_1(&sc->mem, off + HDAC_SDCTL0, ctl); to = timeout; do { ctl = HDAC_READ_1(&sc->mem, off + HDAC_SDCTL0); if (!(ctl & HDAC_SDCTL_SRST)) break; DELAY(10); } while (--to); if (ctl & HDAC_SDCTL_SRST) device_printf(dev, "Reset timeout!\n"); } static uint32_t hdac_stream_getptr(device_t dev, device_t child, int dir, int stream) { struct hdac_softc *sc = device_get_softc(dev); int ss, off; ss = hdac_find_stream(sc, dir, stream); KASSERT(ss >= 0, ("Reset for not allocated stream (%d/%d)\n", dir, stream)); off = ss << 5; return (HDAC_READ_4(&sc->mem, off + HDAC_SDLPIB)); } static int hdac_unsol_alloc(device_t dev, device_t child, int tag) { struct hdac_softc *sc = device_get_softc(dev); sc->unsol_registered++; hdac_poll_reinit(sc); return (tag); } static void hdac_unsol_free(device_t dev, device_t child, int tag) { struct hdac_softc *sc = device_get_softc(dev); sc->unsol_registered--; hdac_poll_reinit(sc); } static device_method_t hdac_methods[] = { /* device interface */ DEVMETHOD(device_probe, hdac_probe), DEVMETHOD(device_attach, hdac_attach), DEVMETHOD(device_detach, hdac_detach), DEVMETHOD(device_suspend, hdac_suspend), DEVMETHOD(device_resume, hdac_resume), /* Bus interface */ DEVMETHOD(bus_get_dma_tag, hdac_get_dma_tag), DEVMETHOD(bus_print_child, hdac_print_child), DEVMETHOD(bus_child_location_str, hdac_child_location_str), DEVMETHOD(bus_child_pnpinfo_str, hdac_child_pnpinfo_str_method), DEVMETHOD(bus_read_ivar, hdac_read_ivar), DEVMETHOD(hdac_get_mtx, hdac_get_mtx), DEVMETHOD(hdac_codec_command, hdac_codec_command), DEVMETHOD(hdac_stream_alloc, hdac_stream_alloc), DEVMETHOD(hdac_stream_free, hdac_stream_free), DEVMETHOD(hdac_stream_start, hdac_stream_start), DEVMETHOD(hdac_stream_stop, hdac_stream_stop), DEVMETHOD(hdac_stream_reset, hdac_stream_reset), DEVMETHOD(hdac_stream_getptr, hdac_stream_getptr), DEVMETHOD(hdac_unsol_alloc, hdac_unsol_alloc), DEVMETHOD(hdac_unsol_free, hdac_unsol_free), DEVMETHOD_END }; static driver_t hdac_driver = { "hdac", hdac_methods, sizeof(struct hdac_softc), }; static devclass_t hdac_devclass; DRIVER_MODULE(snd_hda, pci, hdac_driver, hdac_devclass, NULL, NULL); Index: head/sys/fs/nullfs/null_vnops.c =================================================================== --- head/sys/fs/nullfs/null_vnops.c (revision 308456) +++ head/sys/fs/nullfs/null_vnops.c (revision 308457) @@ -1,934 +1,934 @@ /*- * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * John Heidemann of the UCLA Ficus project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)null_vnops.c 8.6 (Berkeley) 5/27/95 * * Ancestors: * @(#)lofs_vnops.c 1.2 (Berkeley) 6/18/92 * ...and... * @(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project * * $FreeBSD$ */ /* * Null Layer * * (See mount_nullfs(8) for more information.) * * The null layer duplicates a portion of the filesystem * name space under a new name. In this respect, it is * similar to the loopback filesystem. It differs from * the loopback fs in two respects: it is implemented using * a stackable layers techniques, and its "null-node"s stack above * all lower-layer vnodes, not just over directory vnodes. * * The null layer has two purposes. First, it serves as a demonstration * of layering by proving a layer which does nothing. (It actually * does everything the loopback filesystem does, which is slightly * more than nothing.) Second, the null layer can serve as a prototype * layer. Since it provides all necessary layer framework, * new filesystem layers can be created very easily be starting * with a null layer. * * The remainder of this man page examines the null layer as a basis * for constructing new layers. * * * INSTANTIATING NEW NULL LAYERS * * New null layers are created with mount_nullfs(8). * Mount_nullfs(8) takes two arguments, the pathname * of the lower vfs (target-pn) and the pathname where the null * layer will appear in the namespace (alias-pn). After * the null layer is put into place, the contents * of target-pn subtree will be aliased under alias-pn. * * * OPERATION OF A NULL LAYER * * The null layer is the minimum filesystem layer, * simply bypassing all possible operations to the lower layer * for processing there. The majority of its activity centers * on the bypass routine, through which nearly all vnode operations * pass. * * The bypass routine accepts arbitrary vnode operations for * handling by the lower layer. It begins by examing vnode * operation arguments and replacing any null-nodes by their * lower-layer equivlants. It then invokes the operation * on the lower layer. Finally, it replaces the null-nodes * in the arguments and, if a vnode is return by the operation, * stacks a null-node on top of the returned vnode. * * Although bypass handles most operations, vop_getattr, vop_lock, * vop_unlock, vop_inactive, vop_reclaim, and vop_print are not * bypassed. Vop_getattr must change the fsid being returned. * Vop_lock and vop_unlock must handle any locking for the * current vnode as well as pass the lock request down. * Vop_inactive and vop_reclaim are not bypassed so that * they can handle freeing null-layer specific data. Vop_print * is not bypassed to avoid excessive debugging information. * Also, certain vnode operations change the locking state within * the operation (create, mknod, remove, link, rename, mkdir, rmdir, * and symlink). Ideally these operations should not change the * lock state, but should be changed to let the caller of the * function unlock them. Otherwise all intermediate vnode layers * (such as union, umapfs, etc) must catch these functions to do * the necessary locking at their layer. * * * INSTANTIATING VNODE STACKS * * Mounting associates the null layer with a lower layer, * effect stacking two VFSes. Vnode stacks are instead * created on demand as files are accessed. * * The initial mount creates a single vnode stack for the * root of the new null layer. All other vnode stacks * are created as a result of vnode operations on * this or other null vnode stacks. * * New vnode stacks come into existence as a result of * an operation which returns a vnode. * The bypass routine stacks a null-node above the new * vnode before returning it to the caller. * * For example, imagine mounting a null layer with * "mount_nullfs /usr/include /dev/layer/null". * Changing directory to /dev/layer/null will assign * the root null-node (which was created when the null layer was mounted). * Now consider opening "sys". A vop_lookup would be * done on the root null-node. This operation would bypass through * to the lower layer which would return a vnode representing * the UFS "sys". Null_bypass then builds a null-node * aliasing the UFS "sys" and returns this to the caller. * Later operations on the null-node "sys" will repeat this * process when constructing other vnode stacks. * * * CREATING OTHER FILE SYSTEM LAYERS * * One of the easiest ways to construct new filesystem layers is to make * a copy of the null layer, rename all files and variables, and * then begin modifing the copy. Sed can be used to easily rename * all variables. * * The umap layer is an example of a layer descended from the * null layer. * * * INVOKING OPERATIONS ON LOWER LAYERS * * There are two techniques to invoke operations on a lower layer * when the operation cannot be completely bypassed. Each method * is appropriate in different situations. In both cases, * it is the responsibility of the aliasing layer to make * the operation arguments "correct" for the lower layer * by mapping a vnode arguments to the lower layer. * * The first approach is to call the aliasing layer's bypass routine. * This method is most suitable when you wish to invoke the operation * currently being handled on the lower layer. It has the advantage * that the bypass routine already must do argument mapping. * An example of this is null_getattrs in the null layer. * * A second approach is to directly invoke vnode operations on * the lower layer with the VOP_OPERATIONNAME interface. * The advantage of this method is that it is easy to invoke * arbitrary operations on the lower layer. The disadvantage * is that vnode arguments must be manualy mapped. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int null_bug_bypass = 0; /* for debugging: enables bypass printf'ing */ SYSCTL_INT(_debug, OID_AUTO, nullfs_bug_bypass, CTLFLAG_RW, &null_bug_bypass, 0, ""); /* * This is the 10-Apr-92 bypass routine. * This version has been optimized for speed, throwing away some * safety checks. It should still always work, but it's not as * robust to programmer errors. * * In general, we map all vnodes going down and unmap them on the way back. * As an exception to this, vnodes can be marked "unmapped" by setting * the Nth bit in operation's vdesc_flags. * * Also, some BSD vnode operations have the side effect of vrele'ing * their arguments. With stacking, the reference counts are held * by the upper node, not the lower one, so we must handle these * side-effects here. This is not of concern in Sun-derived systems * since there are no such side-effects. * * This makes the following assumptions: * - only one returned vpp * - no INOUT vpp's (Sun's vop_open has one of these) * - the vnode operation vector of the first vnode should be used * to determine what implementation of the op should be invoked * - all mapped vnodes are of our vnode-type (NEEDSWORK: * problems on rmdir'ing mount points and renaming?) */ int null_bypass(struct vop_generic_args *ap) { struct vnode **this_vp_p; int error; struct vnode *old_vps[VDESC_MAX_VPS]; struct vnode **vps_p[VDESC_MAX_VPS]; struct vnode ***vppp; struct vnodeop_desc *descp = ap->a_desc; int reles, i; if (null_bug_bypass) printf ("null_bypass: %s\n", descp->vdesc_name); #ifdef DIAGNOSTIC /* * We require at least one vp. */ if (descp->vdesc_vp_offsets == NULL || descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET) panic ("null_bypass: no vp's in map"); #endif /* * Map the vnodes going in. * Later, we'll invoke the operation based on * the first mapped vnode's operation vector. */ reles = descp->vdesc_flags; for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) break; /* bail out at end of list */ vps_p[i] = this_vp_p = VOPARG_OFFSETTO(struct vnode**,descp->vdesc_vp_offsets[i],ap); /* * We're not guaranteed that any but the first vnode * are of our type. Check for and don't map any * that aren't. (We must always map first vp or vclean fails.) */ if (i && (*this_vp_p == NULLVP || (*this_vp_p)->v_op != &null_vnodeops)) { old_vps[i] = NULLVP; } else { old_vps[i] = *this_vp_p; *(vps_p[i]) = NULLVPTOLOWERVP(*this_vp_p); /* * XXX - Several operations have the side effect * of vrele'ing their vp's. We must account for * that. (This should go away in the future.) */ if (reles & VDESC_VP0_WILLRELE) VREF(*this_vp_p); } } /* * Call the operation on the lower layer * with the modified argument structure. */ if (vps_p[0] && *vps_p[0]) error = VCALL(ap); else { printf("null_bypass: no map for %s\n", descp->vdesc_name); error = EINVAL; } /* * Maintain the illusion of call-by-value * by restoring vnodes in the argument structure * to their original value. */ reles = descp->vdesc_flags; for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) { if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET) break; /* bail out at end of list */ if (old_vps[i]) { *(vps_p[i]) = old_vps[i]; #if 0 if (reles & VDESC_VP0_WILLUNLOCK) VOP_UNLOCK(*(vps_p[i]), 0); #endif if (reles & VDESC_VP0_WILLRELE) vrele(*(vps_p[i])); } } /* * Map the possible out-going vpp * (Assumes that the lower layer always returns * a VREF'ed vpp unless it gets an error.) */ if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && !(descp->vdesc_flags & VDESC_NOMAP_VPP) && !error) { /* * XXX - even though some ops have vpp returned vp's, * several ops actually vrele this before returning. * We must avoid these ops. * (This should go away when these ops are regularized.) */ if (descp->vdesc_flags & VDESC_VPP_WILLRELE) goto out; vppp = VOPARG_OFFSETTO(struct vnode***, descp->vdesc_vpp_offset,ap); if (*vppp) error = null_nodeget(old_vps[0]->v_mount, **vppp, *vppp); } out: return (error); } static int null_add_writecount(struct vop_add_writecount_args *ap) { struct vnode *lvp, *vp; int error; vp = ap->a_vp; lvp = NULLVPTOLOWERVP(vp); KASSERT(vp->v_writecount + ap->a_inc >= 0, ("wrong writecount inc")); if (vp->v_writecount > 0 && vp->v_writecount + ap->a_inc == 0) error = VOP_ADD_WRITECOUNT(lvp, -1); else if (vp->v_writecount == 0 && vp->v_writecount + ap->a_inc > 0) error = VOP_ADD_WRITECOUNT(lvp, 1); else error = 0; if (error == 0) vp->v_writecount += ap->a_inc; return (error); } /* * We have to carry on the locking protocol on the null layer vnodes * as we progress through the tree. We also have to enforce read-only * if this layer is mounted read-only. */ static int null_lookup(struct vop_lookup_args *ap) { struct componentname *cnp = ap->a_cnp; struct vnode *dvp = ap->a_dvp; int flags = cnp->cn_flags; struct vnode *vp, *ldvp, *lvp; struct mount *mp; int error; mp = dvp->v_mount; if ((flags & ISLASTCN) != 0 && (mp->mnt_flag & MNT_RDONLY) != 0 && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); /* * Although it is possible to call null_bypass(), we'll do * a direct call to reduce overhead */ ldvp = NULLVPTOLOWERVP(dvp); vp = lvp = NULL; KASSERT((ldvp->v_vflag & VV_ROOT) == 0 || ((dvp->v_vflag & VV_ROOT) != 0 && (flags & ISDOTDOT) == 0), ("ldvp %p fl %#x dvp %p fl %#x flags %#x", ldvp, ldvp->v_vflag, dvp, dvp->v_vflag, flags)); /* * Hold ldvp. The reference on it, owned by dvp, is lost in * case of dvp reclamation, and we need ldvp to move our lock * from ldvp to dvp. */ vhold(ldvp); error = VOP_LOOKUP(ldvp, &lvp, cnp); /* * VOP_LOOKUP() on lower vnode may unlock ldvp, which allows * dvp to be reclaimed due to shared v_vnlock. Check for the * doomed state and return error. */ if ((error == 0 || error == EJUSTRETURN) && (dvp->v_iflag & VI_DOOMED) != 0) { error = ENOENT; if (lvp != NULL) vput(lvp); /* * If vgone() did reclaimed dvp before curthread * relocked ldvp, the locks of dvp and ldpv are no * longer shared. In this case, relock of ldvp in * lower fs VOP_LOOKUP() does not restore the locking * state of dvp. Compensate for this by unlocking * ldvp and locking dvp, which is also correct if the * locks are still shared. */ VOP_UNLOCK(ldvp, 0); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); } vdrop(ldvp); if (error == EJUSTRETURN && (flags & ISLASTCN) != 0 && (mp->mnt_flag & MNT_RDONLY) != 0 && (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME)) error = EROFS; if ((error == 0 || error == EJUSTRETURN) && lvp != NULL) { if (ldvp == lvp) { *ap->a_vpp = dvp; VREF(dvp); vrele(lvp); } else { error = null_nodeget(mp, lvp, &vp); if (error == 0) *ap->a_vpp = vp; } } return (error); } static int null_open(struct vop_open_args *ap) { int retval; struct vnode *vp, *ldvp; vp = ap->a_vp; ldvp = NULLVPTOLOWERVP(vp); retval = null_bypass(&ap->a_gen); if (retval == 0) vp->v_object = ldvp->v_object; return (retval); } /* * Setattr call. Disallow write attempts if the layer is mounted read-only. */ static int null_setattr(struct vop_setattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) && (vp->v_mount->mnt_flag & MNT_RDONLY)) return (EROFS); if (vap->va_size != VNOVAL) { switch (vp->v_type) { case VDIR: return (EISDIR); case VCHR: case VBLK: case VSOCK: case VFIFO: if (vap->va_flags != VNOVAL) return (EOPNOTSUPP); return (0); case VREG: case VLNK: default: /* * Disallow write attempts if the filesystem is * mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); } } return (null_bypass((struct vop_generic_args *)ap)); } /* * We handle getattr only to change the fsid. */ static int null_getattr(struct vop_getattr_args *ap) { int error; if ((error = null_bypass((struct vop_generic_args *)ap)) != 0) return (error); ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0]; return (0); } /* * Handle to disallow write access if mounted read-only. */ static int null_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; accmode_t accmode = ap->a_accmode; /* * Disallow write attempts on read-only layers; * unless the file is a socket, fifo, or a block or * character device resident on the filesystem. */ if (accmode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } } return (null_bypass((struct vop_generic_args *)ap)); } static int null_accessx(struct vop_accessx_args *ap) { struct vnode *vp = ap->a_vp; accmode_t accmode = ap->a_accmode; /* * Disallow write attempts on read-only layers; * unless the file is a socket, fifo, or a block or * character device resident on the filesystem. */ if (accmode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; default: break; } } return (null_bypass((struct vop_generic_args *)ap)); } /* * Increasing refcount of lower vnode is needed at least for the case * when lower FS is NFS to do sillyrename if the file is in use. * Unfortunately v_usecount is incremented in many places in * the kernel and, as such, there may be races that result in * the NFS client doing an extraneous silly rename, but that seems * preferable to not doing a silly rename when it is needed. */ static int null_remove(struct vop_remove_args *ap) { int retval, vreleit; struct vnode *lvp, *vp; vp = ap->a_vp; if (vrefcnt(vp) > 1) { lvp = NULLVPTOLOWERVP(vp); VREF(lvp); vreleit = 1; } else vreleit = 0; VTONULL(vp)->null_flags |= NULLV_DROP; retval = null_bypass(&ap->a_gen); if (vreleit != 0) vrele(lvp); return (retval); } /* * We handle this to eliminate null FS to lower FS * file moving. Don't know why we don't allow this, * possibly we should. */ static int null_rename(struct vop_rename_args *ap) { struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *tvp = ap->a_tvp; struct null_node *tnn; /* Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fdvp); vrele(fvp); return (EXDEV); } if (tvp != NULL) { tnn = VTONULL(tvp); tnn->null_flags |= NULLV_DROP; } return (null_bypass((struct vop_generic_args *)ap)); } static int null_rmdir(struct vop_rmdir_args *ap) { VTONULL(ap->a_vp)->null_flags |= NULLV_DROP; return (null_bypass(&ap->a_gen)); } /* * We need to process our own vnode lock and then clear the * interlock flag as it applies only to our vnode, not the * vnodes below us on the stack. */ static int null_lock(struct vop_lock1_args *ap) { struct vnode *vp = ap->a_vp; int flags = ap->a_flags; struct null_node *nn; struct vnode *lvp; int error; if ((flags & LK_INTERLOCK) == 0) { VI_LOCK(vp); ap->a_flags = flags |= LK_INTERLOCK; } nn = VTONULL(vp); /* * If we're still active we must ask the lower layer to - * lock as ffs has special lock considerations in it's + * lock as ffs has special lock considerations in its * vop lock. */ if (nn != NULL && (lvp = NULLVPTOLOWERVP(vp)) != NULL) { VI_LOCK_FLAGS(lvp, MTX_DUPOK); VI_UNLOCK(vp); /* * We have to hold the vnode here to solve a potential * reclaim race. If we're forcibly vgone'd while we * still have refs, a thread could be sleeping inside * the lowervp's vop_lock routine. When we vgone we will * drop our last ref to the lowervp, which would allow it * to be reclaimed. The lowervp could then be recycled, - * in which case it is not legal to be sleeping in it's VOP. + * in which case it is not legal to be sleeping in its VOP. * We prevent it from being recycled by holding the vnode * here. */ vholdl(lvp); error = VOP_LOCK(lvp, flags); /* * We might have slept to get the lock and someone might have * clean our vnode already, switching vnode lock from one in * lowervp to v_lock in our own vnode structure. Handle this * case by reacquiring correct lock in requested mode. */ if (VTONULL(vp) == NULL && error == 0) { ap->a_flags &= ~(LK_TYPE_MASK | LK_INTERLOCK); switch (flags & LK_TYPE_MASK) { case LK_SHARED: ap->a_flags |= LK_SHARED; break; case LK_UPGRADE: case LK_EXCLUSIVE: ap->a_flags |= LK_EXCLUSIVE; break; default: panic("Unsupported lock request %d\n", ap->a_flags); } VOP_UNLOCK(lvp, 0); error = vop_stdlock(ap); } vdrop(lvp); } else error = vop_stdlock(ap); return (error); } /* * We need to process our own vnode unlock and then clear the * interlock flag as it applies only to our vnode, not the * vnodes below us on the stack. */ static int null_unlock(struct vop_unlock_args *ap) { struct vnode *vp = ap->a_vp; int flags = ap->a_flags; int mtxlkflag = 0; struct null_node *nn; struct vnode *lvp; int error; if ((flags & LK_INTERLOCK) != 0) mtxlkflag = 1; else if (mtx_owned(VI_MTX(vp)) == 0) { VI_LOCK(vp); mtxlkflag = 2; } nn = VTONULL(vp); if (nn != NULL && (lvp = NULLVPTOLOWERVP(vp)) != NULL) { VI_LOCK_FLAGS(lvp, MTX_DUPOK); flags |= LK_INTERLOCK; vholdl(lvp); VI_UNLOCK(vp); error = VOP_UNLOCK(lvp, flags); vdrop(lvp); if (mtxlkflag == 0) VI_LOCK(vp); } else { if (mtxlkflag == 2) VI_UNLOCK(vp); error = vop_stdunlock(ap); } return (error); } /* * Do not allow the VOP_INACTIVE to be passed to the lower layer, * since the reference count on the lower vnode is not related to * ours. */ static int null_inactive(struct vop_inactive_args *ap __unused) { struct vnode *vp, *lvp; struct null_node *xp; struct mount *mp; struct null_mount *xmp; vp = ap->a_vp; xp = VTONULL(vp); lvp = NULLVPTOLOWERVP(vp); mp = vp->v_mount; xmp = MOUNTTONULLMOUNT(mp); if ((xmp->nullm_flags & NULLM_CACHE) == 0 || (xp->null_flags & NULLV_DROP) != 0 || (lvp->v_vflag & VV_NOSYNC) != 0) { /* * If this is the last reference and caching of the * nullfs vnodes is not enabled, or the lower vnode is * deleted, then free up the vnode so as not to tie up * the lower vnodes. */ vp->v_object = NULL; vrecycle(vp); } return (0); } /* * Now, the nullfs vnode and, due to the sharing lock, the lower * vnode, are exclusively locked, and we shall destroy the null vnode. */ static int null_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp; struct null_node *xp; struct vnode *lowervp; vp = ap->a_vp; xp = VTONULL(vp); lowervp = xp->null_lowervp; KASSERT(lowervp != NULL && vp->v_vnlock != &vp->v_lock, ("Reclaiming incomplete null vnode %p", vp)); null_hashrem(xp); /* * Use the interlock to protect the clearing of v_data to * prevent faults in null_lock(). */ lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); VI_LOCK(vp); vp->v_data = NULL; vp->v_object = NULL; vp->v_vnlock = &vp->v_lock; VI_UNLOCK(vp); /* * If we were opened for write, we leased one write reference * to the lower vnode. If this is a reclamation due to the * forced unmount, undo the reference now. */ if (vp->v_writecount > 0) VOP_ADD_WRITECOUNT(lowervp, -1); if ((xp->null_flags & NULLV_NOUNLOCK) != 0) vunref(lowervp); else vput(lowervp); free(xp, M_NULLFSNODE); return (0); } static int null_print(struct vop_print_args *ap) { struct vnode *vp = ap->a_vp; printf("\tvp=%p, lowervp=%p\n", vp, VTONULL(vp)->null_lowervp); return (0); } /* ARGSUSED */ static int null_getwritemount(struct vop_getwritemount_args *ap) { struct null_node *xp; struct vnode *lowervp; struct vnode *vp; vp = ap->a_vp; VI_LOCK(vp); xp = VTONULL(vp); if (xp && (lowervp = xp->null_lowervp)) { VI_LOCK_FLAGS(lowervp, MTX_DUPOK); VI_UNLOCK(vp); vholdl(lowervp); VI_UNLOCK(lowervp); VOP_GETWRITEMOUNT(lowervp, ap->a_mpp); vdrop(lowervp); } else { VI_UNLOCK(vp); *(ap->a_mpp) = NULL; } return (0); } static int null_vptofh(struct vop_vptofh_args *ap) { struct vnode *lvp; lvp = NULLVPTOLOWERVP(ap->a_vp); return VOP_VPTOFH(lvp, ap->a_fhp); } static int null_vptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp = ap->a_vp; struct vnode **dvp = ap->a_vpp; struct vnode *lvp, *ldvp; struct ucred *cred = ap->a_cred; int error, locked; locked = VOP_ISLOCKED(vp); lvp = NULLVPTOLOWERVP(vp); vhold(lvp); VOP_UNLOCK(vp, 0); /* vp is held by vn_vptocnp_locked that called us */ ldvp = lvp; vref(lvp); error = vn_vptocnp(&ldvp, cred, ap->a_buf, ap->a_buflen); vdrop(lvp); if (error != 0) { vn_lock(vp, locked | LK_RETRY); return (ENOENT); } /* * Exclusive lock is required by insmntque1 call in * null_nodeget() */ error = vn_lock(ldvp, LK_EXCLUSIVE); if (error != 0) { vrele(ldvp); vn_lock(vp, locked | LK_RETRY); return (ENOENT); } error = null_nodeget(vp->v_mount, ldvp, dvp); if (error == 0) { #ifdef DIAGNOSTIC NULLVPTOLOWERVP(*dvp); #endif VOP_UNLOCK(*dvp, 0); /* keep reference on *dvp */ } vn_lock(vp, locked | LK_RETRY); return (error); } /* * Global vfs data structures */ struct vop_vector null_vnodeops = { .vop_bypass = null_bypass, .vop_access = null_access, .vop_accessx = null_accessx, .vop_advlockpurge = vop_stdadvlockpurge, .vop_bmap = VOP_EOPNOTSUPP, .vop_getattr = null_getattr, .vop_getwritemount = null_getwritemount, .vop_inactive = null_inactive, .vop_islocked = vop_stdislocked, .vop_lock1 = null_lock, .vop_lookup = null_lookup, .vop_open = null_open, .vop_print = null_print, .vop_reclaim = null_reclaim, .vop_remove = null_remove, .vop_rename = null_rename, .vop_rmdir = null_rmdir, .vop_setattr = null_setattr, .vop_strategy = VOP_EOPNOTSUPP, .vop_unlock = null_unlock, .vop_vptocnp = null_vptocnp, .vop_vptofh = null_vptofh, .vop_add_writecount = null_add_writecount, }; Index: head/sys/geom/raid/md_ddf.c =================================================================== --- head/sys/geom/raid/md_ddf.c (revision 308456) +++ head/sys/geom/raid/md_ddf.c (revision 308457) @@ -1,3085 +1,3085 @@ /*- * Copyright (c) 2012 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "geom/raid/md_ddf.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_DDF, "md_ddf_data", "GEOM_RAID DDF metadata"); #define DDF_MAX_DISKS_HARD 128 #define DDF_MAX_DISKS 16 #define DDF_MAX_VDISKS 7 #define DDF_MAX_PARTITIONS 1 #define DECADE (3600*24*(365*10+2)) /* 10 years in seconds. */ struct ddf_meta { u_int sectorsize; u_int bigendian; struct ddf_header *hdr; struct ddf_cd_record *cdr; struct ddf_pd_record *pdr; struct ddf_vd_record *vdr; void *cr; struct ddf_pdd_record *pdd; struct ddf_bbm_log *bbm; }; struct ddf_vol_meta { u_int sectorsize; u_int bigendian; struct ddf_header *hdr; struct ddf_cd_record *cdr; struct ddf_vd_entry *vde; struct ddf_vdc_record *vdc; struct ddf_vdc_record *bvdc[DDF_MAX_DISKS_HARD]; }; struct g_raid_md_ddf_perdisk { struct ddf_meta pd_meta; }; struct g_raid_md_ddf_pervolume { struct ddf_vol_meta pv_meta; int pv_started; struct callout pv_start_co; /* STARTING state timer. */ }; struct g_raid_md_ddf_object { struct g_raid_md_object mdio_base; u_int mdio_bigendian; struct ddf_meta mdio_meta; int mdio_starting; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_started; struct root_hold_token *mdio_rootmount; /* Root mount delay token. */ }; static g_raid_md_create_req_t g_raid_md_create_req_ddf; static g_raid_md_taste_t g_raid_md_taste_ddf; static g_raid_md_event_t g_raid_md_event_ddf; static g_raid_md_volume_event_t g_raid_md_volume_event_ddf; static g_raid_md_ctl_t g_raid_md_ctl_ddf; static g_raid_md_write_t g_raid_md_write_ddf; static g_raid_md_fail_disk_t g_raid_md_fail_disk_ddf; static g_raid_md_free_disk_t g_raid_md_free_disk_ddf; static g_raid_md_free_volume_t g_raid_md_free_volume_ddf; static g_raid_md_free_t g_raid_md_free_ddf; static kobj_method_t g_raid_md_ddf_methods[] = { KOBJMETHOD(g_raid_md_create_req, g_raid_md_create_req_ddf), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_ddf), KOBJMETHOD(g_raid_md_event, g_raid_md_event_ddf), KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_ddf), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_ddf), KOBJMETHOD(g_raid_md_write, g_raid_md_write_ddf), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_ddf), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_ddf), KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_ddf), KOBJMETHOD(g_raid_md_free, g_raid_md_free_ddf), { 0, 0 } }; static struct g_raid_md_class g_raid_md_ddf_class = { "DDF", g_raid_md_ddf_methods, sizeof(struct g_raid_md_ddf_object), .mdc_enable = 1, .mdc_priority = 100 }; #define GET8(m, f) ((m)->f) #define GET16(m, f) ((m)->bigendian ? be16dec(&(m)->f) : le16dec(&(m)->f)) #define GET32(m, f) ((m)->bigendian ? be32dec(&(m)->f) : le32dec(&(m)->f)) #define GET64(m, f) ((m)->bigendian ? be64dec(&(m)->f) : le64dec(&(m)->f)) #define GET8D(m, f) (f) #define GET16D(m, f) ((m)->bigendian ? be16dec(&f) : le16dec(&f)) #define GET32D(m, f) ((m)->bigendian ? be32dec(&f) : le32dec(&f)) #define GET64D(m, f) ((m)->bigendian ? be64dec(&f) : le64dec(&f)) #define GET8P(m, f) (*(f)) #define GET16P(m, f) ((m)->bigendian ? be16dec(f) : le16dec(f)) #define GET32P(m, f) ((m)->bigendian ? be32dec(f) : le32dec(f)) #define GET64P(m, f) ((m)->bigendian ? be64dec(f) : le64dec(f)) #define SET8P(m, f, v) \ (*(f) = (v)) #define SET16P(m, f, v) \ do { \ if ((m)->bigendian) \ be16enc((f), (v)); \ else \ le16enc((f), (v)); \ } while (0) #define SET32P(m, f, v) \ do { \ if ((m)->bigendian) \ be32enc((f), (v)); \ else \ le32enc((f), (v)); \ } while (0) #define SET64P(m, f, v) \ do { \ if ((m)->bigendian) \ be64enc((f), (v)); \ else \ le64enc((f), (v)); \ } while (0) #define SET8(m, f, v) SET8P((m), &((m)->f), (v)) #define SET16(m, f, v) SET16P((m), &((m)->f), (v)) #define SET32(m, f, v) SET32P((m), &((m)->f), (v)) #define SET64(m, f, v) SET64P((m), &((m)->f), (v)) #define SET8D(m, f, v) SET8P((m), &(f), (v)) #define SET16D(m, f, v) SET16P((m), &(f), (v)) #define SET32D(m, f, v) SET32P((m), &(f), (v)) #define SET64D(m, f, v) SET64P((m), &(f), (v)) #define GETCRNUM(m) (GET32((m), hdr->cr_length) / \ GET16((m), hdr->Configuration_Record_Length)) #define GETVDCPTR(m, n) ((struct ddf_vdc_record *)((uint8_t *)(m)->cr + \ (n) * GET16((m), hdr->Configuration_Record_Length) * \ (m)->sectorsize)) #define GETSAPTR(m, n) ((struct ddf_sa_record *)((uint8_t *)(m)->cr + \ (n) * GET16((m), hdr->Configuration_Record_Length) * \ (m)->sectorsize)) static int isff(uint8_t *buf, int size) { int i; for (i = 0; i < size; i++) if (buf[i] != 0xff) return (0); return (1); } static void print_guid(uint8_t *buf) { int i, ascii; ascii = 1; for (i = 0; i < 24; i++) { if (buf[i] != 0 && (buf[i] < ' ' || buf[i] > 127)) { ascii = 0; break; } } if (ascii) { printf("'%.24s'", buf); } else { for (i = 0; i < 24; i++) printf("%02x", buf[i]); } } static void g_raid_md_ddf_print(struct ddf_meta *meta) { struct ddf_vdc_record *vdc; struct ddf_vuc_record *vuc; struct ddf_sa_record *sa; uint64_t *val2; uint32_t val; int i, j, k, num, num2; if (g_raid_debug < 1) return; printf("********* DDF Metadata *********\n"); printf("**** Header ****\n"); printf("DDF_Header_GUID "); print_guid(meta->hdr->DDF_Header_GUID); printf("\n"); printf("DDF_rev %8.8s\n", (char *)&meta->hdr->DDF_rev[0]); printf("Sequence_Number 0x%08x\n", GET32(meta, hdr->Sequence_Number)); printf("TimeStamp 0x%08x\n", GET32(meta, hdr->TimeStamp)); printf("Open_Flag 0x%02x\n", GET16(meta, hdr->Open_Flag)); printf("Foreign_Flag 0x%02x\n", GET16(meta, hdr->Foreign_Flag)); printf("Diskgrouping 0x%02x\n", GET16(meta, hdr->Diskgrouping)); printf("Primary_Header_LBA %ju\n", GET64(meta, hdr->Primary_Header_LBA)); printf("Secondary_Header_LBA %ju\n", GET64(meta, hdr->Secondary_Header_LBA)); printf("WorkSpace_Length %u\n", GET32(meta, hdr->WorkSpace_Length)); printf("WorkSpace_LBA %ju\n", GET64(meta, hdr->WorkSpace_LBA)); printf("Max_PD_Entries %u\n", GET16(meta, hdr->Max_PD_Entries)); printf("Max_VD_Entries %u\n", GET16(meta, hdr->Max_VD_Entries)); printf("Max_Partitions %u\n", GET16(meta, hdr->Max_Partitions)); printf("Configuration_Record_Length %u\n", GET16(meta, hdr->Configuration_Record_Length)); printf("Max_Primary_Element_Entries %u\n", GET16(meta, hdr->Max_Primary_Element_Entries)); printf("Controller Data %u:%u\n", GET32(meta, hdr->cd_section), GET32(meta, hdr->cd_length)); printf("Physical Disk %u:%u\n", GET32(meta, hdr->pdr_section), GET32(meta, hdr->pdr_length)); printf("Virtual Disk %u:%u\n", GET32(meta, hdr->vdr_section), GET32(meta, hdr->vdr_length)); printf("Configuration Recs %u:%u\n", GET32(meta, hdr->cr_section), GET32(meta, hdr->cr_length)); printf("Physical Disk Recs %u:%u\n", GET32(meta, hdr->pdd_section), GET32(meta, hdr->pdd_length)); printf("BBM Log %u:%u\n", GET32(meta, hdr->bbmlog_section), GET32(meta, hdr->bbmlog_length)); printf("Diagnostic Space %u:%u\n", GET32(meta, hdr->Diagnostic_Space), GET32(meta, hdr->Diagnostic_Space_Length)); printf("Vendor_Specific_Logs %u:%u\n", GET32(meta, hdr->Vendor_Specific_Logs), GET32(meta, hdr->Vendor_Specific_Logs_Length)); printf("**** Controller Data ****\n"); printf("Controller_GUID "); print_guid(meta->cdr->Controller_GUID); printf("\n"); printf("Controller_Type 0x%04x%04x 0x%04x%04x\n", GET16(meta, cdr->Controller_Type.Vendor_ID), GET16(meta, cdr->Controller_Type.Device_ID), GET16(meta, cdr->Controller_Type.SubVendor_ID), GET16(meta, cdr->Controller_Type.SubDevice_ID)); printf("Product_ID '%.16s'\n", (char *)&meta->cdr->Product_ID[0]); printf("**** Physical Disk Records ****\n"); printf("Populated_PDEs %u\n", GET16(meta, pdr->Populated_PDEs)); printf("Max_PDE_Supported %u\n", GET16(meta, pdr->Max_PDE_Supported)); for (j = 0; j < GET16(meta, pdr->Populated_PDEs); j++) { if (isff(meta->pdr->entry[j].PD_GUID, 24)) continue; if (GET32(meta, pdr->entry[j].PD_Reference) == 0xffffffff) continue; printf("PD_GUID "); print_guid(meta->pdr->entry[j].PD_GUID); printf("\n"); printf("PD_Reference 0x%08x\n", GET32(meta, pdr->entry[j].PD_Reference)); printf("PD_Type 0x%04x\n", GET16(meta, pdr->entry[j].PD_Type)); printf("PD_State 0x%04x\n", GET16(meta, pdr->entry[j].PD_State)); printf("Configured_Size %ju\n", GET64(meta, pdr->entry[j].Configured_Size)); printf("Block_Size %u\n", GET16(meta, pdr->entry[j].Block_Size)); } printf("**** Virtual Disk Records ****\n"); printf("Populated_VDEs %u\n", GET16(meta, vdr->Populated_VDEs)); printf("Max_VDE_Supported %u\n", GET16(meta, vdr->Max_VDE_Supported)); for (j = 0; j < GET16(meta, vdr->Populated_VDEs); j++) { if (isff(meta->vdr->entry[j].VD_GUID, 24)) continue; printf("VD_GUID "); print_guid(meta->vdr->entry[j].VD_GUID); printf("\n"); printf("VD_Number 0x%04x\n", GET16(meta, vdr->entry[j].VD_Number)); printf("VD_Type 0x%04x\n", GET16(meta, vdr->entry[j].VD_Type)); printf("VD_State 0x%02x\n", GET8(meta, vdr->entry[j].VD_State)); printf("Init_State 0x%02x\n", GET8(meta, vdr->entry[j].Init_State)); printf("Drive_Failures_Remaining %u\n", GET8(meta, vdr->entry[j].Drive_Failures_Remaining)); printf("VD_Name '%.16s'\n", (char *)&meta->vdr->entry[j].VD_Name); } printf("**** Configuration Records ****\n"); num = GETCRNUM(meta); for (j = 0; j < num; j++) { vdc = GETVDCPTR(meta, j); val = GET32D(meta, vdc->Signature); switch (val) { case DDF_VDCR_SIGNATURE: printf("** Virtual Disk Configuration **\n"); printf("VD_GUID "); print_guid(vdc->VD_GUID); printf("\n"); printf("Timestamp 0x%08x\n", GET32D(meta, vdc->Timestamp)); printf("Sequence_Number 0x%08x\n", GET32D(meta, vdc->Sequence_Number)); printf("Primary_Element_Count %u\n", GET16D(meta, vdc->Primary_Element_Count)); printf("Stripe_Size %u\n", GET8D(meta, vdc->Stripe_Size)); printf("Primary_RAID_Level 0x%02x\n", GET8D(meta, vdc->Primary_RAID_Level)); printf("RLQ 0x%02x\n", GET8D(meta, vdc->RLQ)); printf("Secondary_Element_Count %u\n", GET8D(meta, vdc->Secondary_Element_Count)); printf("Secondary_Element_Seq %u\n", GET8D(meta, vdc->Secondary_Element_Seq)); printf("Secondary_RAID_Level 0x%02x\n", GET8D(meta, vdc->Secondary_RAID_Level)); printf("Block_Count %ju\n", GET64D(meta, vdc->Block_Count)); printf("VD_Size %ju\n", GET64D(meta, vdc->VD_Size)); printf("Block_Size %u\n", GET16D(meta, vdc->Block_Size)); printf("Rotate_Parity_count %u\n", GET8D(meta, vdc->Rotate_Parity_count)); printf("Associated_Spare_Disks"); for (i = 0; i < 8; i++) { if (GET32D(meta, vdc->Associated_Spares[i]) != 0xffffffff) printf(" 0x%08x", GET32D(meta, vdc->Associated_Spares[i])); } printf("\n"); printf("Cache_Flags %016jx\n", GET64D(meta, vdc->Cache_Flags)); printf("BG_Rate %u\n", GET8D(meta, vdc->BG_Rate)); printf("MDF_Parity_Disks %u\n", GET8D(meta, vdc->MDF_Parity_Disks)); printf("MDF_Parity_Generator_Polynomial 0x%04x\n", GET16D(meta, vdc->MDF_Parity_Generator_Polynomial)); printf("MDF_Constant_Generation_Method 0x%02x\n", GET8D(meta, vdc->MDF_Constant_Generation_Method)); printf("Physical_Disks "); num2 = GET16D(meta, vdc->Primary_Element_Count); val2 = (uint64_t *)&(vdc->Physical_Disk_Sequence[GET16(meta, hdr->Max_Primary_Element_Entries)]); for (i = 0; i < num2; i++) printf(" 0x%08x @ %ju", GET32D(meta, vdc->Physical_Disk_Sequence[i]), GET64P(meta, val2 + i)); printf("\n"); break; case DDF_VUCR_SIGNATURE: printf("** Vendor Unique Configuration **\n"); vuc = (struct ddf_vuc_record *)vdc; printf("VD_GUID "); print_guid(vuc->VD_GUID); printf("\n"); break; case DDF_SA_SIGNATURE: printf("** Spare Assignment Configuration **\n"); sa = (struct ddf_sa_record *)vdc; printf("Timestamp 0x%08x\n", GET32D(meta, sa->Timestamp)); printf("Spare_Type 0x%02x\n", GET8D(meta, sa->Spare_Type)); printf("Populated_SAEs %u\n", GET16D(meta, sa->Populated_SAEs)); printf("MAX_SAE_Supported %u\n", GET16D(meta, sa->MAX_SAE_Supported)); for (i = 0; i < GET16D(meta, sa->Populated_SAEs); i++) { if (isff(sa->entry[i].VD_GUID, 24)) continue; printf("VD_GUID "); for (k = 0; k < 24; k++) printf("%02x", sa->entry[i].VD_GUID[k]); printf("\n"); printf("Secondary_Element %u\n", GET16D(meta, sa->entry[i].Secondary_Element)); } break; case 0x00000000: case 0xFFFFFFFF: break; default: printf("Unknown configuration signature %08x\n", val); break; } } printf("**** Physical Disk Data ****\n"); printf("PD_GUID "); print_guid(meta->pdd->PD_GUID); printf("\n"); printf("PD_Reference 0x%08x\n", GET32(meta, pdd->PD_Reference)); printf("Forced_Ref_Flag 0x%02x\n", GET8(meta, pdd->Forced_Ref_Flag)); printf("Forced_PD_GUID_Flag 0x%02x\n", GET8(meta, pdd->Forced_PD_GUID_Flag)); } static int ddf_meta_find_pd(struct ddf_meta *meta, uint8_t *GUID, uint32_t PD_Reference) { int i; for (i = 0; i < GET16(meta, pdr->Populated_PDEs); i++) { if (GUID != NULL) { if (memcmp(meta->pdr->entry[i].PD_GUID, GUID, 24) == 0) return (i); } else if (PD_Reference != 0xffffffff) { if (GET32(meta, pdr->entry[i].PD_Reference) == PD_Reference) return (i); } else if (isff(meta->pdr->entry[i].PD_GUID, 24)) return (i); } if (GUID == NULL && PD_Reference == 0xffffffff) { if (i >= GET16(meta, pdr->Max_PDE_Supported)) return (-1); SET16(meta, pdr->Populated_PDEs, i + 1); return (i); } return (-1); } static int ddf_meta_find_vd(struct ddf_meta *meta, uint8_t *GUID) { int i; for (i = 0; i < GET16(meta, vdr->Populated_VDEs); i++) { if (GUID != NULL) { if (memcmp(meta->vdr->entry[i].VD_GUID, GUID, 24) == 0) return (i); } else if (isff(meta->vdr->entry[i].VD_GUID, 24)) return (i); } if (GUID == NULL) { if (i >= GET16(meta, vdr->Max_VDE_Supported)) return (-1); SET16(meta, vdr->Populated_VDEs, i + 1); return (i); } return (-1); } static struct ddf_vdc_record * ddf_meta_find_vdc(struct ddf_meta *meta, uint8_t *GUID) { struct ddf_vdc_record *vdc; int i, num; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); if (GUID != NULL) { if (GET32D(meta, vdc->Signature) == DDF_VDCR_SIGNATURE && memcmp(vdc->VD_GUID, GUID, 24) == 0) return (vdc); } else if (GET32D(meta, vdc->Signature) == 0xffffffff || GET32D(meta, vdc->Signature) == 0) return (vdc); } return (NULL); } static int ddf_meta_count_vdc(struct ddf_meta *meta, uint8_t *GUID) { struct ddf_vdc_record *vdc; int i, num, cnt; cnt = 0; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); if (GET32D(meta, vdc->Signature) != DDF_VDCR_SIGNATURE) continue; if (GUID == NULL || memcmp(vdc->VD_GUID, GUID, 24) == 0) cnt++; } return (cnt); } static int ddf_meta_find_disk(struct ddf_vol_meta *vmeta, uint32_t PD_Reference, int *bvdp, int *posp) { int i, bvd, pos; i = 0; for (bvd = 0; bvd < GET8(vmeta, vdc->Secondary_Element_Count); bvd++) { if (vmeta->bvdc[bvd] == NULL) { i += GET16(vmeta, vdc->Primary_Element_Count); // XXX continue; } for (pos = 0; pos < GET16(vmeta, bvdc[bvd]->Primary_Element_Count); pos++, i++) { if (GET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos]) == PD_Reference) { if (bvdp != NULL) *bvdp = bvd; if (posp != NULL) *posp = pos; return (i); } } } return (-1); } static struct ddf_sa_record * ddf_meta_find_sa(struct ddf_meta *meta, int create) { struct ddf_sa_record *sa; int i, num; num = GETCRNUM(meta); for (i = 0; i < num; i++) { sa = GETSAPTR(meta, i); if (GET32D(meta, sa->Signature) == DDF_SA_SIGNATURE) return (sa); } if (create) { for (i = 0; i < num; i++) { sa = GETSAPTR(meta, i); if (GET32D(meta, sa->Signature) == 0xffffffff || GET32D(meta, sa->Signature) == 0) return (sa); } } return (NULL); } static void ddf_meta_create(struct g_raid_disk *disk, struct ddf_meta *sample) { struct timespec ts; struct clocktime ct; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_object *mdi; struct ddf_meta *meta; struct ddf_pd_entry *pde; off_t anchorlba; u_int ss, pos, size; int len, error; char serial_buffer[24]; if (sample->hdr == NULL) sample = NULL; mdi = (struct g_raid_md_ddf_object *)disk->d_softc->sc_md; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; meta = &pd->pd_meta; ss = disk->d_consumer->provider->sectorsize; anchorlba = disk->d_consumer->provider->mediasize / ss - 1; meta->sectorsize = ss; meta->bigendian = sample ? sample->bigendian : mdi->mdio_bigendian; getnanotime(&ts); clock_ts_to_ct(&ts, &ct); /* Header */ meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memset(meta->hdr, 0xff, ss); if (sample) { memcpy(meta->hdr, sample->hdr, sizeof(struct ddf_header)); if (ss != sample->sectorsize) { SET32(meta, hdr->WorkSpace_Length, howmany(GET32(sample, hdr->WorkSpace_Length) * sample->sectorsize, ss)); SET16(meta, hdr->Configuration_Record_Length, howmany(GET16(sample, hdr->Configuration_Record_Length) * sample->sectorsize, ss)); SET32(meta, hdr->cd_length, howmany(GET32(sample, hdr->cd_length) * sample->sectorsize, ss)); SET32(meta, hdr->pdr_length, howmany(GET32(sample, hdr->pdr_length) * sample->sectorsize, ss)); SET32(meta, hdr->vdr_length, howmany(GET32(sample, hdr->vdr_length) * sample->sectorsize, ss)); SET32(meta, hdr->cr_length, howmany(GET32(sample, hdr->cr_length) * sample->sectorsize, ss)); SET32(meta, hdr->pdd_length, howmany(GET32(sample, hdr->pdd_length) * sample->sectorsize, ss)); SET32(meta, hdr->bbmlog_length, howmany(GET32(sample, hdr->bbmlog_length) * sample->sectorsize, ss)); SET32(meta, hdr->Diagnostic_Space, howmany(GET32(sample, hdr->bbmlog_length) * sample->sectorsize, ss)); SET32(meta, hdr->Vendor_Specific_Logs, howmany(GET32(sample, hdr->bbmlog_length) * sample->sectorsize, ss)); } } else { SET32(meta, hdr->Signature, DDF_HEADER_SIGNATURE); snprintf(meta->hdr->DDF_Header_GUID, 25, "FreeBSD %08x%08x", (u_int)(ts.tv_sec - DECADE), arc4random()); memcpy(meta->hdr->DDF_rev, "02.00.00", 8); SET32(meta, hdr->TimeStamp, (ts.tv_sec - DECADE)); SET32(meta, hdr->WorkSpace_Length, 16 * 1024 * 1024 / ss); SET16(meta, hdr->Max_PD_Entries, DDF_MAX_DISKS - 1); SET16(meta, hdr->Max_VD_Entries, DDF_MAX_VDISKS); SET16(meta, hdr->Max_Partitions, DDF_MAX_PARTITIONS); SET16(meta, hdr->Max_Primary_Element_Entries, DDF_MAX_DISKS); SET16(meta, hdr->Configuration_Record_Length, howmany(sizeof(struct ddf_vdc_record) + (4 + 8) * GET16(meta, hdr->Max_Primary_Element_Entries), ss)); SET32(meta, hdr->cd_length, howmany(sizeof(struct ddf_cd_record), ss)); SET32(meta, hdr->pdr_length, howmany(sizeof(struct ddf_pd_record) + sizeof(struct ddf_pd_entry) * GET16(meta, hdr->Max_PD_Entries), ss)); SET32(meta, hdr->vdr_length, howmany(sizeof(struct ddf_vd_record) + sizeof(struct ddf_vd_entry) * GET16(meta, hdr->Max_VD_Entries), ss)); SET32(meta, hdr->cr_length, GET16(meta, hdr->Configuration_Record_Length) * (GET16(meta, hdr->Max_Partitions) + 1)); SET32(meta, hdr->pdd_length, howmany(sizeof(struct ddf_pdd_record), ss)); SET32(meta, hdr->bbmlog_length, 0); SET32(meta, hdr->Diagnostic_Space_Length, 0); SET32(meta, hdr->Vendor_Specific_Logs_Length, 0); } pos = 1; SET32(meta, hdr->cd_section, pos); pos += GET32(meta, hdr->cd_length); SET32(meta, hdr->pdr_section, pos); pos += GET32(meta, hdr->pdr_length); SET32(meta, hdr->vdr_section, pos); pos += GET32(meta, hdr->vdr_length); SET32(meta, hdr->cr_section, pos); pos += GET32(meta, hdr->cr_length); SET32(meta, hdr->pdd_section, pos); pos += GET32(meta, hdr->pdd_length); SET32(meta, hdr->bbmlog_section, GET32(meta, hdr->bbmlog_length) != 0 ? pos : 0xffffffff); pos += GET32(meta, hdr->bbmlog_length); SET32(meta, hdr->Diagnostic_Space, GET32(meta, hdr->Diagnostic_Space_Length) != 0 ? pos : 0xffffffff); pos += GET32(meta, hdr->Diagnostic_Space_Length); SET32(meta, hdr->Vendor_Specific_Logs, GET32(meta, hdr->Vendor_Specific_Logs_Length) != 0 ? pos : 0xffffffff); pos += min(GET32(meta, hdr->Vendor_Specific_Logs_Length), 1); SET64(meta, hdr->Primary_Header_LBA, anchorlba - pos); SET64(meta, hdr->Secondary_Header_LBA, 0xffffffffffffffffULL); SET64(meta, hdr->WorkSpace_LBA, anchorlba + 1 - 32 * 1024 * 1024 / ss); /* Controller Data */ size = GET32(meta, hdr->cd_length) * ss; meta->cdr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->cdr, 0xff, size); SET32(meta, cdr->Signature, DDF_CONTROLLER_DATA_SIGNATURE); memcpy(meta->cdr->Controller_GUID, "FreeBSD GEOM RAID SERIAL", 24); memcpy(meta->cdr->Product_ID, "FreeBSD GEOMRAID", 16); /* Physical Drive Records. */ size = GET32(meta, hdr->pdr_length) * ss; meta->pdr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->pdr, 0xff, size); SET32(meta, pdr->Signature, DDF_PDR_SIGNATURE); SET16(meta, pdr->Populated_PDEs, 1); SET16(meta, pdr->Max_PDE_Supported, GET16(meta, hdr->Max_PD_Entries)); pde = &meta->pdr->entry[0]; len = sizeof(serial_buffer); error = g_io_getattr("GEOM::ident", disk->d_consumer, &len, serial_buffer); if (error == 0 && (len = strlen (serial_buffer)) >= 6 && len <= 20) snprintf(pde->PD_GUID, 25, "DISK%20s", serial_buffer); else snprintf(pde->PD_GUID, 25, "DISK%04d%02d%02d%08x%04x", ct.year, ct.mon, ct.day, arc4random(), arc4random() & 0xffff); SET32D(meta, pde->PD_Reference, arc4random()); SET16D(meta, pde->PD_Type, DDF_PDE_GUID_FORCE); SET16D(meta, pde->PD_State, 0); SET64D(meta, pde->Configured_Size, anchorlba + 1 - 32 * 1024 * 1024 / ss); SET16D(meta, pde->Block_Size, ss); /* Virtual Drive Records. */ size = GET32(meta, hdr->vdr_length) * ss; meta->vdr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->vdr, 0xff, size); SET32(meta, vdr->Signature, DDF_VD_RECORD_SIGNATURE); SET32(meta, vdr->Populated_VDEs, 0); SET16(meta, vdr->Max_VDE_Supported, GET16(meta, hdr->Max_VD_Entries)); /* Configuration Records. */ size = GET32(meta, hdr->cr_length) * ss; meta->cr = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->cr, 0xff, size); /* Physical Disk Data. */ size = GET32(meta, hdr->pdd_length) * ss; meta->pdd = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->pdd, 0xff, size); SET32(meta, pdd->Signature, DDF_PDD_SIGNATURE); memcpy(meta->pdd->PD_GUID, pde->PD_GUID, 24); SET32(meta, pdd->PD_Reference, GET32D(meta, pde->PD_Reference)); SET8(meta, pdd->Forced_Ref_Flag, DDF_PDD_FORCED_REF); SET8(meta, pdd->Forced_PD_GUID_Flag, DDF_PDD_FORCED_GUID); /* Bad Block Management Log. */ if (GET32(meta, hdr->bbmlog_length) != 0) { size = GET32(meta, hdr->bbmlog_length) * ss; meta->bbm = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->bbm, 0xff, size); SET32(meta, bbm->Signature, DDF_BBML_SIGNATURE); SET32(meta, bbm->Entry_Count, 0); SET32(meta, bbm->Spare_Block_Count, 0); } } static void ddf_meta_copy(struct ddf_meta *dst, struct ddf_meta *src) { struct ddf_header *hdr; u_int ss; hdr = src->hdr; dst->bigendian = src->bigendian; ss = dst->sectorsize = src->sectorsize; dst->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(dst->hdr, src->hdr, ss); dst->cdr = malloc(GET32(src, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->cdr, src->cdr, GET32(src, hdr->cd_length) * ss); dst->pdr = malloc(GET32(src, hdr->pdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->pdr, src->pdr, GET32(src, hdr->pdr_length) * ss); dst->vdr = malloc(GET32(src, hdr->vdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->vdr, src->vdr, GET32(src, hdr->vdr_length) * ss); dst->cr = malloc(GET32(src, hdr->cr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->cr, src->cr, GET32(src, hdr->cr_length) * ss); dst->pdd = malloc(GET32(src, hdr->pdd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->pdd, src->pdd, GET32(src, hdr->pdd_length) * ss); if (src->bbm != NULL) { dst->bbm = malloc(GET32(src, hdr->bbmlog_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->bbm, src->bbm, GET32(src, hdr->bbmlog_length) * ss); } } static void ddf_meta_update(struct ddf_meta *meta, struct ddf_meta *src) { struct ddf_pd_entry *pde, *spde; int i, j; for (i = 0; i < GET16(src, pdr->Populated_PDEs); i++) { spde = &src->pdr->entry[i]; if (isff(spde->PD_GUID, 24)) continue; j = ddf_meta_find_pd(meta, NULL, GET32(src, pdr->entry[i].PD_Reference)); if (j < 0) { j = ddf_meta_find_pd(meta, NULL, 0xffffffff); pde = &meta->pdr->entry[j]; memcpy(pde, spde, sizeof(*pde)); } else { pde = &meta->pdr->entry[j]; SET16D(meta, pde->PD_State, GET16D(meta, pde->PD_State) | GET16D(src, pde->PD_State)); } } } static void ddf_meta_free(struct ddf_meta *meta) { if (meta->hdr != NULL) { free(meta->hdr, M_MD_DDF); meta->hdr = NULL; } if (meta->cdr != NULL) { free(meta->cdr, M_MD_DDF); meta->cdr = NULL; } if (meta->pdr != NULL) { free(meta->pdr, M_MD_DDF); meta->pdr = NULL; } if (meta->vdr != NULL) { free(meta->vdr, M_MD_DDF); meta->vdr = NULL; } if (meta->cr != NULL) { free(meta->cr, M_MD_DDF); meta->cr = NULL; } if (meta->pdd != NULL) { free(meta->pdd, M_MD_DDF); meta->pdd = NULL; } if (meta->bbm != NULL) { free(meta->bbm, M_MD_DDF); meta->bbm = NULL; } } static void ddf_vol_meta_create(struct ddf_vol_meta *meta, struct ddf_meta *sample) { struct timespec ts; struct clocktime ct; struct ddf_header *hdr; u_int ss, size; hdr = sample->hdr; meta->bigendian = sample->bigendian; ss = meta->sectorsize = sample->sectorsize; meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(meta->hdr, sample->hdr, ss); meta->cdr = malloc(GET32(sample, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->cdr, sample->cdr, GET32(sample, hdr->cd_length) * ss); meta->vde = malloc(sizeof(struct ddf_vd_entry), M_MD_DDF, M_WAITOK); memset(meta->vde, 0xff, sizeof(struct ddf_vd_entry)); getnanotime(&ts); clock_ts_to_ct(&ts, &ct); snprintf(meta->vde->VD_GUID, 25, "FreeBSD%04d%02d%02d%08x%01x", ct.year, ct.mon, ct.day, arc4random(), arc4random() & 0xf); size = GET16(sample, hdr->Configuration_Record_Length) * ss; meta->vdc = malloc(size, M_MD_DDF, M_WAITOK); memset(meta->vdc, 0xff, size); SET32(meta, vdc->Signature, DDF_VDCR_SIGNATURE); memcpy(meta->vdc->VD_GUID, meta->vde->VD_GUID, 24); SET32(meta, vdc->Sequence_Number, 0); } static void ddf_vol_meta_update(struct ddf_vol_meta *dst, struct ddf_meta *src, uint8_t *GUID, int started) { struct ddf_header *hdr; struct ddf_vd_entry *vde; struct ddf_vdc_record *vdc; int vnew, bvnew, bvd, size; u_int ss; hdr = src->hdr; vde = &src->vdr->entry[ddf_meta_find_vd(src, GUID)]; vdc = ddf_meta_find_vdc(src, GUID); if (GET8D(src, vdc->Secondary_Element_Count) == 1) bvd = 0; else bvd = GET8D(src, vdc->Secondary_Element_Seq); size = GET16(src, hdr->Configuration_Record_Length) * src->sectorsize; if (dst->vdc == NULL || (!started && ((int32_t)(GET32D(src, vdc->Sequence_Number) - GET32(dst, vdc->Sequence_Number))) > 0)) vnew = 1; else vnew = 0; if (dst->bvdc[bvd] == NULL || (!started && ((int32_t)(GET32D(src, vdc->Sequence_Number) - GET32(dst, bvdc[bvd]->Sequence_Number))) > 0)) bvnew = 1; else bvnew = 0; if (vnew) { dst->bigendian = src->bigendian; ss = dst->sectorsize = src->sectorsize; if (dst->hdr != NULL) free(dst->hdr, M_MD_DDF); dst->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(dst->hdr, src->hdr, ss); if (dst->cdr != NULL) free(dst->cdr, M_MD_DDF); dst->cdr = malloc(GET32(src, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(dst->cdr, src->cdr, GET32(src, hdr->cd_length) * ss); if (dst->vde != NULL) free(dst->vde, M_MD_DDF); dst->vde = malloc(sizeof(struct ddf_vd_entry), M_MD_DDF, M_WAITOK); memcpy(dst->vde, vde, sizeof(struct ddf_vd_entry)); if (dst->vdc != NULL) free(dst->vdc, M_MD_DDF); dst->vdc = malloc(size, M_MD_DDF, M_WAITOK); memcpy(dst->vdc, vdc, size); } if (bvnew) { if (dst->bvdc[bvd] != NULL) free(dst->bvdc[bvd], M_MD_DDF); dst->bvdc[bvd] = malloc(size, M_MD_DDF, M_WAITOK); memcpy(dst->bvdc[bvd], vdc, size); } } static void ddf_vol_meta_free(struct ddf_vol_meta *meta) { int i; if (meta->hdr != NULL) { free(meta->hdr, M_MD_DDF); meta->hdr = NULL; } if (meta->cdr != NULL) { free(meta->cdr, M_MD_DDF); meta->cdr = NULL; } if (meta->vde != NULL) { free(meta->vde, M_MD_DDF); meta->vde = NULL; } if (meta->vdc != NULL) { free(meta->vdc, M_MD_DDF); meta->vdc = NULL; } for (i = 0; i < DDF_MAX_DISKS_HARD; i++) { if (meta->bvdc[i] != NULL) { free(meta->bvdc[i], M_MD_DDF); meta->bvdc[i] = NULL; } } } static int ddf_meta_unused_range(struct ddf_meta *meta, off_t *off, off_t *size) { struct ddf_vdc_record *vdc; off_t beg[32], end[32], beg1, end1; uint64_t *offp; int i, j, n, num, pos; uint32_t ref; *off = 0; *size = 0; ref = GET32(meta, pdd->PD_Reference); pos = ddf_meta_find_pd(meta, NULL, ref); beg[0] = 0; end[0] = GET64(meta, pdr->entry[pos].Configured_Size); n = 1; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); if (GET32D(meta, vdc->Signature) != DDF_VDCR_SIGNATURE) continue; for (pos = 0; pos < GET16D(meta, vdc->Primary_Element_Count); pos++) if (GET32D(meta, vdc->Physical_Disk_Sequence[pos]) == ref) break; if (pos == GET16D(meta, vdc->Primary_Element_Count)) continue; offp = (uint64_t *)&(vdc->Physical_Disk_Sequence[ GET16(meta, hdr->Max_Primary_Element_Entries)]); beg1 = GET64P(meta, offp + pos); end1 = beg1 + GET64D(meta, vdc->Block_Count); for (j = 0; j < n; j++) { if (beg[j] >= end1 || end[j] <= beg1 ) continue; if (beg[j] < beg1 && end[j] > end1) { beg[n] = end1; end[n] = end[j]; end[j] = beg1; n++; } else if (beg[j] < beg1) end[j] = beg1; else beg[j] = end1; } } for (j = 0; j < n; j++) { if (end[j] - beg[j] > *size) { *off = beg[j]; *size = end[j] - beg[j]; } } return ((*size > 0) ? 1 : 0); } static void ddf_meta_get_name(struct ddf_meta *meta, int num, char *buf) { const char *b; int i; b = meta->vdr->entry[num].VD_Name; for (i = 15; i >= 0; i--) if (b[i] != 0x20) break; memcpy(buf, b, i + 1); buf[i + 1] = 0; } static void ddf_meta_put_name(struct ddf_vol_meta *meta, char *buf) { int len; len = min(strlen(buf), 16); memset(meta->vde->VD_Name, 0x20, 16); memcpy(meta->vde->VD_Name, buf, len); } static int ddf_meta_read(struct g_consumer *cp, struct ddf_meta *meta) { struct g_provider *pp; struct ddf_header *ahdr, *hdr; char *abuf, *buf; off_t plba, slba, lba; int error, len, i; u_int ss; uint32_t val; ddf_meta_free(meta); pp = cp->provider; ss = meta->sectorsize = pp->sectorsize; /* Read anchor block. */ abuf = g_read_data(cp, pp->mediasize - ss, ss, &error); if (abuf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (error); } ahdr = (struct ddf_header *)abuf; /* Check if this is an DDF RAID struct */ if (be32dec(&ahdr->Signature) == DDF_HEADER_SIGNATURE) meta->bigendian = 1; else if (le32dec(&ahdr->Signature) == DDF_HEADER_SIGNATURE) meta->bigendian = 0; else { G_RAID_DEBUG(1, "DDF signature check failed on %s", pp->name); error = EINVAL; goto done; } if (ahdr->Header_Type != DDF_HEADER_ANCHOR) { G_RAID_DEBUG(1, "DDF header type check failed on %s", pp->name); error = EINVAL; goto done; } meta->hdr = ahdr; plba = GET64(meta, hdr->Primary_Header_LBA); slba = GET64(meta, hdr->Secondary_Header_LBA); val = GET32(meta, hdr->CRC); SET32(meta, hdr->CRC, 0xffffffff); meta->hdr = NULL; if (crc32(ahdr, ss) != val) { G_RAID_DEBUG(1, "DDF CRC mismatch on %s", pp->name); error = EINVAL; goto done; } if ((plba + 6) * ss >= pp->mediasize) { G_RAID_DEBUG(1, "DDF primary header LBA is wrong on %s", pp->name); error = EINVAL; goto done; } if (slba != -1 && (slba + 6) * ss >= pp->mediasize) { G_RAID_DEBUG(1, "DDF secondary header LBA is wrong on %s", pp->name); error = EINVAL; goto done; } lba = plba; doread: error = 0; ddf_meta_free(meta); /* Read header block. */ buf = g_read_data(cp, lba * ss, ss, &error); if (buf == NULL) { readerror: G_RAID_DEBUG(1, "DDF %s metadata read error on %s (error=%d).", (lba == plba) ? "primary" : "secondary", pp->name, error); if (lba == plba && slba != -1) { lba = slba; goto doread; } G_RAID_DEBUG(1, "DDF metadata read error on %s.", pp->name); goto done; } meta->hdr = malloc(ss, M_MD_DDF, M_WAITOK); memcpy(meta->hdr, buf, ss); g_free(buf); hdr = meta->hdr; val = GET32(meta, hdr->CRC); SET32(meta, hdr->CRC, 0xffffffff); if (hdr->Signature != ahdr->Signature || crc32(meta->hdr, ss) != val || memcmp(hdr->DDF_Header_GUID, ahdr->DDF_Header_GUID, 24) || GET64(meta, hdr->Primary_Header_LBA) != plba || GET64(meta, hdr->Secondary_Header_LBA) != slba) { hdrerror: G_RAID_DEBUG(1, "DDF %s metadata check failed on %s", (lba == plba) ? "primary" : "secondary", pp->name); if (lba == plba && slba != -1) { lba = slba; goto doread; } G_RAID_DEBUG(1, "DDF metadata check failed on %s", pp->name); error = EINVAL; goto done; } if ((lba == plba && hdr->Header_Type != DDF_HEADER_PRIMARY) || (lba == slba && hdr->Header_Type != DDF_HEADER_SECONDARY)) goto hdrerror; len = 1; len = max(len, GET32(meta, hdr->cd_section) + GET32(meta, hdr->cd_length)); len = max(len, GET32(meta, hdr->pdr_section) + GET32(meta, hdr->pdr_length)); len = max(len, GET32(meta, hdr->vdr_section) + GET32(meta, hdr->vdr_length)); len = max(len, GET32(meta, hdr->cr_section) + GET32(meta, hdr->cr_length)); len = max(len, GET32(meta, hdr->pdd_section) + GET32(meta, hdr->pdd_length)); if ((val = GET32(meta, hdr->bbmlog_section)) != 0xffffffff) len = max(len, val + GET32(meta, hdr->bbmlog_length)); if ((val = GET32(meta, hdr->Diagnostic_Space)) != 0xffffffff) len = max(len, val + GET32(meta, hdr->Diagnostic_Space_Length)); if ((val = GET32(meta, hdr->Vendor_Specific_Logs)) != 0xffffffff) len = max(len, val + GET32(meta, hdr->Vendor_Specific_Logs_Length)); if ((plba + len) * ss >= pp->mediasize) goto hdrerror; if (slba != -1 && (slba + len) * ss >= pp->mediasize) goto hdrerror; /* Workaround for Adaptec implementation. */ if (GET16(meta, hdr->Max_Primary_Element_Entries) == 0xffff) { SET16(meta, hdr->Max_Primary_Element_Entries, min(GET16(meta, hdr->Max_PD_Entries), (GET16(meta, hdr->Configuration_Record_Length) * ss - 512) / 12)); } /* Read controller data. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->cd_section)) * ss, GET32(meta, hdr->cd_length) * ss, &error); if (buf == NULL) goto readerror; meta->cdr = malloc(GET32(meta, hdr->cd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->cdr, buf, GET32(meta, hdr->cd_length) * ss); g_free(buf); if (GET32(meta, cdr->Signature) != DDF_CONTROLLER_DATA_SIGNATURE) goto hdrerror; /* Read physical disk records. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->pdr_section)) * ss, GET32(meta, hdr->pdr_length) * ss, &error); if (buf == NULL) goto readerror; meta->pdr = malloc(GET32(meta, hdr->pdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->pdr, buf, GET32(meta, hdr->pdr_length) * ss); g_free(buf); if (GET32(meta, pdr->Signature) != DDF_PDR_SIGNATURE) goto hdrerror; /* * Workaround for reading metadata corrupted due to graid bug. * XXX: Remove this before we have disks above 128PB. :) */ if (meta->bigendian) { for (i = 0; i < GET16(meta, pdr->Populated_PDEs); i++) { if (isff(meta->pdr->entry[i].PD_GUID, 24)) continue; if (GET32(meta, pdr->entry[i].PD_Reference) == 0xffffffff) continue; if (GET64(meta, pdr->entry[i].Configured_Size) >= (1ULL << 48)) { SET16(meta, pdr->entry[i].PD_State, GET16(meta, pdr->entry[i].PD_State) & ~DDF_PDE_FAILED); SET64(meta, pdr->entry[i].Configured_Size, GET64(meta, pdr->entry[i].Configured_Size) & ((1ULL << 48) - 1)); } } } /* Read virtual disk records. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->vdr_section)) * ss, GET32(meta, hdr->vdr_length) * ss, &error); if (buf == NULL) goto readerror; meta->vdr = malloc(GET32(meta, hdr->vdr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->vdr, buf, GET32(meta, hdr->vdr_length) * ss); g_free(buf); if (GET32(meta, vdr->Signature) != DDF_VD_RECORD_SIGNATURE) goto hdrerror; /* Read configuration records. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->cr_section)) * ss, GET32(meta, hdr->cr_length) * ss, &error); if (buf == NULL) goto readerror; meta->cr = malloc(GET32(meta, hdr->cr_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->cr, buf, GET32(meta, hdr->cr_length) * ss); g_free(buf); /* Read physical disk data. */ buf = g_read_data(cp, (lba + GET32(meta, hdr->pdd_section)) * ss, GET32(meta, hdr->pdd_length) * ss, &error); if (buf == NULL) goto readerror; meta->pdd = malloc(GET32(meta, hdr->pdd_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->pdd, buf, GET32(meta, hdr->pdd_length) * ss); g_free(buf); if (GET32(meta, pdd->Signature) != DDF_PDD_SIGNATURE) goto hdrerror; i = ddf_meta_find_pd(meta, NULL, GET32(meta, pdd->PD_Reference)); if (i < 0) goto hdrerror; /* Read BBM Log. */ if (GET32(meta, hdr->bbmlog_section) != 0xffffffff && GET32(meta, hdr->bbmlog_length) != 0) { buf = g_read_data(cp, (lba + GET32(meta, hdr->bbmlog_section)) * ss, GET32(meta, hdr->bbmlog_length) * ss, &error); if (buf == NULL) goto readerror; meta->bbm = malloc(GET32(meta, hdr->bbmlog_length) * ss, M_MD_DDF, M_WAITOK); memcpy(meta->bbm, buf, GET32(meta, hdr->bbmlog_length) * ss); g_free(buf); if (GET32(meta, bbm->Signature) != DDF_BBML_SIGNATURE) goto hdrerror; } done: g_free(abuf); if (error != 0) ddf_meta_free(meta); return (error); } static int ddf_meta_write(struct g_consumer *cp, struct ddf_meta *meta) { struct g_provider *pp; struct ddf_vdc_record *vdc; off_t alba, plba, slba, lba; u_int ss, size; int error, i, num; pp = cp->provider; ss = pp->sectorsize; lba = alba = pp->mediasize / ss - 1; plba = GET64(meta, hdr->Primary_Header_LBA); slba = GET64(meta, hdr->Secondary_Header_LBA); next: SET8(meta, hdr->Header_Type, (lba == alba) ? DDF_HEADER_ANCHOR : (lba == plba) ? DDF_HEADER_PRIMARY : DDF_HEADER_SECONDARY); SET32(meta, hdr->CRC, 0xffffffff); SET32(meta, hdr->CRC, crc32(meta->hdr, ss)); error = g_write_data(cp, lba * ss, meta->hdr, ss); if (error != 0) { err: G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); if (lba != alba) goto done; } if (lba == alba) { lba = plba; goto next; } size = GET32(meta, hdr->cd_length) * ss; SET32(meta, cdr->CRC, 0xffffffff); SET32(meta, cdr->CRC, crc32(meta->cdr, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->cd_section)) * ss, meta->cdr, size); if (error != 0) goto err; size = GET32(meta, hdr->pdr_length) * ss; SET32(meta, pdr->CRC, 0xffffffff); SET32(meta, pdr->CRC, crc32(meta->pdr, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->pdr_section)) * ss, meta->pdr, size); if (error != 0) goto err; size = GET32(meta, hdr->vdr_length) * ss; SET32(meta, vdr->CRC, 0xffffffff); SET32(meta, vdr->CRC, crc32(meta->vdr, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->vdr_section)) * ss, meta->vdr, size); if (error != 0) goto err; size = GET16(meta, hdr->Configuration_Record_Length) * ss; num = GETCRNUM(meta); for (i = 0; i < num; i++) { vdc = GETVDCPTR(meta, i); SET32D(meta, vdc->CRC, 0xffffffff); SET32D(meta, vdc->CRC, crc32(vdc, size)); } error = g_write_data(cp, (lba + GET32(meta, hdr->cr_section)) * ss, meta->cr, size * num); if (error != 0) goto err; size = GET32(meta, hdr->pdd_length) * ss; SET32(meta, pdd->CRC, 0xffffffff); SET32(meta, pdd->CRC, crc32(meta->pdd, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->pdd_section)) * ss, meta->pdd, size); if (error != 0) goto err; if (GET32(meta, hdr->bbmlog_length) != 0) { size = GET32(meta, hdr->bbmlog_length) * ss; SET32(meta, bbm->CRC, 0xffffffff); SET32(meta, bbm->CRC, crc32(meta->bbm, size)); error = g_write_data(cp, (lba + GET32(meta, hdr->bbmlog_section)) * ss, meta->bbm, size); if (error != 0) goto err; } done: if (lba == plba && slba != -1) { lba = slba; goto next; } return (error); } static int ddf_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_DDF, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_DDF); return (error); } static struct g_raid_volume * g_raid_md_ddf_get_volume(struct g_raid_softc *sc, uint8_t *GUID) { struct g_raid_volume *vol; struct g_raid_md_ddf_pervolume *pv; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (memcmp(pv->pv_meta.vde->VD_GUID, GUID, 24) == 0) break; } return (vol); } static struct g_raid_disk * g_raid_md_ddf_get_disk(struct g_raid_softc *sc, uint8_t *GUID, uint32_t id) { struct g_raid_disk *disk; struct g_raid_md_ddf_perdisk *pd; struct ddf_meta *meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; meta = &pd->pd_meta; if (GUID != NULL) { if (memcmp(meta->pdd->PD_GUID, GUID, 24) == 0) break; } else { if (GET32(meta, pdd->PD_Reference) == id) break; } } return (disk); } static int g_raid_md_ddf_purge_volumes(struct g_raid_softc *sc) { struct g_raid_volume *vol, *tvol; struct g_raid_md_ddf_pervolume *pv; int i, res; res = 0; TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) { pv = vol->v_md_data; if (vol->v_stopping) continue; for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE) break; } if (i >= vol->v_disks_count) { g_raid_destroy_volume(vol); res = 1; } } return (res); } static int g_raid_md_ddf_purge_disks(struct g_raid_softc *sc) { #if 0 struct g_raid_disk *disk, *tdisk; struct g_raid_volume *vol; struct g_raid_md_ddf_perdisk *pd; int i, j, res; res = 0; TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { if (disk->d_state == G_RAID_DISK_S_SPARE) continue; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; /* Scan for deleted volumes. */ for (i = 0; i < pd->pd_subdisks; ) { vol = g_raid_md_ddf_get_volume(sc, pd->pd_meta[i]->volume_id); if (vol != NULL && !vol->v_stopping) { i++; continue; } free(pd->pd_meta[i], M_MD_DDF); for (j = i; j < pd->pd_subdisks - 1; j++) pd->pd_meta[j] = pd->pd_meta[j + 1]; pd->pd_meta[DDF_MAX_SUBDISKS - 1] = NULL; pd->pd_subdisks--; pd->pd_updated = 1; } /* If there is no metadata left - erase and delete disk. */ if (pd->pd_subdisks == 0) { ddf_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); res = 1; } } return (res); #endif return (0); } static int g_raid_md_ddf_supported(int level, int qual, int disks, int force) { if (disks > DDF_MAX_DISKS_HARD) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (qual != G_RAID_VOLUME_RLQ_NONE) return (0); if (disks < 1) return (0); if (!force && disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (qual == G_RAID_VOLUME_RLQ_R1SM) { if (!force && disks != 2) return (0); } else if (qual == G_RAID_VOLUME_RLQ_R1MM) { if (!force && disks != 3) return (0); } else return (0); break; case G_RAID_VOLUME_RL_RAID3: if (qual != G_RAID_VOLUME_RLQ_R3P0 && qual != G_RAID_VOLUME_RLQ_R3PN) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_RAID4: if (qual != G_RAID_VOLUME_RLQ_R4P0 && qual != G_RAID_VOLUME_RLQ_R4PN) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (qual != G_RAID_VOLUME_RLQ_R5RA && qual != G_RAID_VOLUME_RLQ_R5RS && qual != G_RAID_VOLUME_RLQ_R5LA && qual != G_RAID_VOLUME_RLQ_R5LS) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_RAID6: if (qual != G_RAID_VOLUME_RLQ_R6RA && qual != G_RAID_VOLUME_RLQ_R6RS && qual != G_RAID_VOLUME_RLQ_R6LA && qual != G_RAID_VOLUME_RLQ_R6LS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAIDMDF: if (qual != G_RAID_VOLUME_RLQ_RMDFRA && qual != G_RAID_VOLUME_RLQ_RMDFRS && qual != G_RAID_VOLUME_RLQ_RMDFLA && qual != G_RAID_VOLUME_RLQ_RMDFLS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (qual != G_RAID_VOLUME_RLQ_R1EA && qual != G_RAID_VOLUME_RLQ_R1EO) return (0); if (disks < 3) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (qual != G_RAID_VOLUME_RLQ_NONE) return (0); if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (qual != G_RAID_VOLUME_RLQ_NONE) return (0); if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5E: if (qual != G_RAID_VOLUME_RLQ_R5ERA && qual != G_RAID_VOLUME_RLQ_R5ERS && qual != G_RAID_VOLUME_RLQ_R5ELA && qual != G_RAID_VOLUME_RLQ_R5ELS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAID5EE: if (qual != G_RAID_VOLUME_RLQ_R5EERA && qual != G_RAID_VOLUME_RLQ_R5EERS && qual != G_RAID_VOLUME_RLQ_R5EELA && qual != G_RAID_VOLUME_RLQ_R5EELS) return (0); if (disks < 4) return (0); break; case G_RAID_VOLUME_RL_RAID5R: if (qual != G_RAID_VOLUME_RLQ_R5RRA && qual != G_RAID_VOLUME_RLQ_R5RRS && qual != G_RAID_VOLUME_RLQ_R5RLA && qual != G_RAID_VOLUME_RLQ_R5RLS) return (0); if (disks < 3) return (0); break; default: return (0); } return (1); } static int g_raid_md_ddf_start_disk(struct g_raid_disk *disk, struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_vol_meta *vmeta; struct ddf_meta *pdmeta, *gmeta; struct ddf_vdc_record *vdc1; struct ddf_sa_record *sa; off_t size, eoff = 0, esize = 0; uint64_t *val2; int disk_pos, md_disk_bvd = -1, md_disk_pos = -1, md_pde_pos; int i, resurrection = 0; uint32_t reference; sc = disk->d_softc; mdi = (struct g_raid_md_ddf_object *)sc->sc_md; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; pdmeta = &pd->pd_meta; reference = GET32(&pd->pd_meta, pdd->PD_Reference); pv = vol->v_md_data; vmeta = &pv->pv_meta; gmeta = &mdi->mdio_meta; - /* Find disk position in metadata by it's reference. */ + /* Find disk position in metadata by its reference. */ disk_pos = ddf_meta_find_disk(vmeta, reference, &md_disk_bvd, &md_disk_pos); md_pde_pos = ddf_meta_find_pd(gmeta, NULL, reference); if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Disk %s is not a present part of the volume %s", g_raid_get_diskname(disk), vol->v_name); /* Failed stale disk is useless for us. */ if ((GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) != 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); return (0); } /* If disk has some metadata for this volume - erase. */ if ((vdc1 = ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID)) != NULL) SET32D(pdmeta, vdc1->Signature, 0xffffffff); /* If we are in the start process, that's all for now. */ if (!pv->pv_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ if (ddf_meta_count_vdc(&pd->pd_meta, NULL) >= GET16(&pd->pd_meta, hdr->Max_Partitions)) { G_RAID_DEBUG1(1, sc, "No free partitions on disk %s", g_raid_get_diskname(disk)); goto nofit; } ddf_meta_unused_range(&pd->pd_meta, &eoff, &esize); if (esize == 0) { G_RAID_DEBUG1(1, sc, "No free space on disk %s", g_raid_get_diskname(disk)); goto nofit; } eoff *= pd->pd_meta.sectorsize; esize *= pd->pd_meta.sectorsize; size = INT64_MAX; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state != G_RAID_SUBDISK_S_NONE) size = sd->sd_size; if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED && (disk_pos < 0 || vol->v_subdisks[i].sd_state < sd->sd_state)) disk_pos = i; } if (disk_pos >= 0 && vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT && esize < size) { G_RAID_DEBUG1(1, sc, "Disk %s free space " "is too small (%ju < %ju)", g_raid_get_diskname(disk), esize, size); disk_pos = -1; } if (disk_pos >= 0) { if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT) esize = size; md_disk_bvd = disk_pos / GET16(vmeta, vdc->Primary_Element_Count); // XXX md_disk_pos = disk_pos % GET16(vmeta, vdc->Primary_Element_Count); // XXX } else { nofit: if (disk->d_state == G_RAID_DISK_S_NONE) g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } /* * If spare is committable, delete spare record. * Othersize, mark it active and leave there. */ sa = ddf_meta_find_sa(&pd->pd_meta, 0); if (sa != NULL) { if ((GET8D(&pd->pd_meta, sa->Spare_Type) & DDF_SAR_TYPE_REVERTIBLE) == 0) { SET32D(&pd->pd_meta, sa->Signature, 0xffffffff); } else { SET8D(&pd->pd_meta, sa->Spare_Type, GET8D(&pd->pd_meta, sa->Spare_Type) | DDF_SAR_TYPE_ACTIVE); } } G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s", g_raid_get_diskname(disk), disk_pos, vol->v_name); resurrection = 1; } sd = &vol->v_subdisks[disk_pos]; if (resurrection && sd->sd_disk != NULL) { g_raid_change_disk_state(sd->sd_disk, G_RAID_DISK_S_STALE_FAILED); TAILQ_REMOVE(&sd->sd_disk->d_subdisks, sd, sd_next); } vol->v_subdisks[disk_pos].sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); else g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); if (resurrection) { sd->sd_offset = eoff; sd->sd_size = esize; } else if (pdmeta->cr != NULL && (vdc1 = ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID)) != NULL) { val2 = (uint64_t *)&(vdc1->Physical_Disk_Sequence[GET16(vmeta, hdr->Max_Primary_Element_Entries)]); sd->sd_offset = (off_t)GET64P(pdmeta, val2 + md_disk_pos) * 512; sd->sd_size = (off_t)GET64D(pdmeta, vdc1->Block_Count) * 512; } if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & DDF_PDE_PFA) { /* Failed disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } else if ((GET16(gmeta, pdr->entry[md_pde_pos].PD_State) & (DDF_PDE_FAILED | DDF_PDE_REBUILD)) != 0) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); sd->sd_rebuild_pos = 0; } else if ((GET8(vmeta, vde->VD_State) & DDF_VDE_DIRTY) != 0 || (GET8(vmeta, vde->Init_State) & DDF_VDE_INIT_MASK) != DDF_VDE_INIT_FULL) { /* Stale disk or dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); return (resurrection); } static void g_raid_md_ddf_refill(struct g_raid_softc *sc) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; int update, updated, i, bad; md = sc->sc_md; restart: updated = 0; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (!pv->pv_started || vol->v_stopping) continue; /* Search for subdisk that needs replacement. */ bad = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED) bad = 1; } if (!bad) continue; G_RAID_DEBUG1(1, sc, "Volume %s is not complete, " "trying to refill.", vol->v_name); TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { /* Skip failed. */ if (disk->d_state < G_RAID_DISK_S_SPARE) continue; /* Skip already used by this volume. */ for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk == disk) break; } if (i < vol->v_disks_count) continue; /* Try to use disk if it has empty extents. */ pd = disk->d_md_data; if (ddf_meta_count_vdc(&pd->pd_meta, NULL) < GET16(&pd->pd_meta, hdr->Max_Partitions)) { update = g_raid_md_ddf_start_disk(disk, vol); } else update = 0; if (update) { updated = 1; g_raid_md_write_ddf(md, vol, NULL, disk); break; } } } if (updated) goto restart; } static void g_raid_md_ddf_start(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_vol_meta *vmeta; struct ddf_vdc_record *vdc; uint64_t *val2; int i, j, bvd; sc = vol->v_softc; md = sc->sc_md; mdi = (struct g_raid_md_ddf_object *)md; pv = vol->v_md_data; vmeta = &pv->pv_meta; vdc = vmeta->vdc; vol->v_raid_level = GET8(vmeta, vdc->Primary_RAID_Level); vol->v_raid_level_qualifier = GET8(vmeta, vdc->RLQ); if (GET8(vmeta, vdc->Secondary_Element_Count) > 1 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 && GET8(vmeta, vdc->Secondary_RAID_Level) == 0) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; vol->v_sectorsize = GET16(vmeta, vdc->Block_Size); if (vol->v_sectorsize == 0xffff) vol->v_sectorsize = vmeta->sectorsize; vol->v_strip_size = vol->v_sectorsize << GET8(vmeta, vdc->Stripe_Size); vol->v_disks_count = GET16(vmeta, vdc->Primary_Element_Count) * GET8(vmeta, vdc->Secondary_Element_Count); vol->v_mdf_pdisks = GET8(vmeta, vdc->MDF_Parity_Disks); vol->v_mdf_polynomial = GET16(vmeta, vdc->MDF_Parity_Generator_Polynomial); vol->v_mdf_method = GET8(vmeta, vdc->MDF_Constant_Generation_Method); if (GET8(vmeta, vdc->Rotate_Parity_count) > 31) vol->v_rotate_parity = 1; else vol->v_rotate_parity = 1 << GET8(vmeta, vdc->Rotate_Parity_count); vol->v_mediasize = GET64(vmeta, vdc->VD_Size) * vol->v_sectorsize; for (i = 0, j = 0, bvd = 0; i < vol->v_disks_count; i++, j++) { if (j == GET16(vmeta, vdc->Primary_Element_Count)) { j = 0; bvd++; } sd = &vol->v_subdisks[i]; if (vmeta->bvdc[bvd] == NULL) { sd->sd_offset = 0; sd->sd_size = GET64(vmeta, vdc->Block_Count) * vol->v_sectorsize; continue; } val2 = (uint64_t *)&(vmeta->bvdc[bvd]->Physical_Disk_Sequence[ GET16(vmeta, hdr->Max_Primary_Element_Entries)]); sd->sd_offset = GET64P(vmeta, val2 + j) * vol->v_sectorsize; sd->sd_size = GET64(vmeta, bvdc[bvd]->Block_Count) * vol->v_sectorsize; } g_raid_start_volume(vol); /* Make all disks found till the moment take their places. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; if (ddf_meta_find_vdc(&pd->pd_meta, vmeta->vdc->VD_GUID) != NULL) g_raid_md_ddf_start_disk(disk, vol); } pv->pv_started = 1; mdi->mdio_starting--; callout_stop(&pv->pv_start_co); G_RAID_DEBUG1(0, sc, "Volume started."); g_raid_md_write_ddf(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_ddf_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); } static void g_raid_ddf_go(void *arg) { struct g_raid_volume *vol; struct g_raid_softc *sc; struct g_raid_md_ddf_pervolume *pv; vol = arg; pv = vol->v_md_data; sc = vol->v_softc; if (!pv->pv_started) { G_RAID_DEBUG1(0, sc, "Force volume start due to timeout."); g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD, G_RAID_EVENT_VOLUME); } } static void g_raid_md_ddf_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct g_raid_volume *vol; struct ddf_meta *pdmeta; struct ddf_vol_meta *vmeta; struct ddf_vdc_record *vdc; struct ddf_vd_entry *vde; int i, j, k, num, have, need, cnt, spare; uint32_t val; char buf[17]; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_ddf_object *)md; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; pdmeta = &pd->pd_meta; spare = -1; if (mdi->mdio_meta.hdr == NULL) ddf_meta_copy(&mdi->mdio_meta, pdmeta); else ddf_meta_update(&mdi->mdio_meta, pdmeta); num = GETCRNUM(pdmeta); for (j = 0; j < num; j++) { vdc = GETVDCPTR(pdmeta, j); val = GET32D(pdmeta, vdc->Signature); if (val == DDF_SA_SIGNATURE && spare == -1) spare = 1; if (val != DDF_VDCR_SIGNATURE) continue; spare = 0; k = ddf_meta_find_vd(pdmeta, vdc->VD_GUID); if (k < 0) continue; vde = &pdmeta->vdr->entry[k]; /* Look for volume with matching ID. */ vol = g_raid_md_ddf_get_volume(sc, vdc->VD_GUID); if (vol == NULL) { ddf_meta_get_name(pdmeta, k, buf); vol = g_raid_create_volume(sc, buf, GET16D(pdmeta, vde->VD_Number)); pv = malloc(sizeof(*pv), M_MD_DDF, M_WAITOK | M_ZERO); vol->v_md_data = pv; callout_init(&pv->pv_start_co, 1); callout_reset(&pv->pv_start_co, g_raid_start_timeout * hz, g_raid_ddf_go, vol); mdi->mdio_starting++; } else pv = vol->v_md_data; /* If we haven't started yet - check metadata freshness. */ vmeta = &pv->pv_meta; ddf_vol_meta_update(vmeta, pdmeta, vdc->VD_GUID, pv->pv_started); } if (spare == 1) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); g_raid_md_ddf_refill(sc); } TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; vmeta = &pv->pv_meta; if (ddf_meta_find_vdc(pdmeta, vmeta->vdc->VD_GUID) == NULL) continue; if (pv->pv_started) { if (g_raid_md_ddf_start_disk(disk, vol)) g_raid_md_write_ddf(md, vol, NULL, NULL); continue; } /* If we collected all needed disks - start array. */ need = 0; have = 0; for (k = 0; k < GET8(vmeta, vdc->Secondary_Element_Count); k++) { if (vmeta->bvdc[k] == NULL) { need += GET16(vmeta, vdc->Primary_Element_Count); continue; } cnt = GET16(vmeta, bvdc[k]->Primary_Element_Count); need += cnt; for (i = 0; i < cnt; i++) { val = GET32(vmeta, bvdc[k]->Physical_Disk_Sequence[i]); if (g_raid_md_ddf_get_disk(sc, NULL, val) != NULL) have++; } } G_RAID_DEBUG1(1, sc, "Volume %s now has %d of %d disks", vol->v_name, have, need); if (have == need) g_raid_md_ddf_start(vol); } } static int g_raid_md_create_req_ddf(struct g_raid_md_object *md, struct g_class *mp, struct gctl_req *req, struct g_geom **gp) { struct g_geom *geom; struct g_raid_softc *sc; struct g_raid_md_ddf_object *mdi, *mdi1; char name[16]; const char *fmtopt; int be = 1; mdi = (struct g_raid_md_ddf_object *)md; fmtopt = gctl_get_asciiparam(req, "fmtopt"); if (fmtopt == NULL || strcasecmp(fmtopt, "BE") == 0) be = 1; else if (strcasecmp(fmtopt, "LE") == 0) be = 0; else { gctl_error(req, "Incorrect fmtopt argument."); return (G_RAID_MD_TASTE_FAIL); } /* Search for existing node. */ LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_ddf_object *)sc->sc_md; if (mdi1->mdio_bigendian != be) continue; break; } if (geom != NULL) { *gp = geom; return (G_RAID_MD_TASTE_EXISTING); } /* Create new one if not found. */ mdi->mdio_bigendian = be; snprintf(name, sizeof(name), "DDF%s", be ? "" : "-LE"); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_ddf(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_softc *sc; struct g_raid_disk *disk; struct ddf_meta meta; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_object *mdi; struct g_geom *geom; int error, result, be; char name[16]; G_RAID_DEBUG(1, "Tasting DDF on %s", cp->provider->name); mdi = (struct g_raid_md_ddf_object *)md; pp = cp->provider; /* Read metadata from device. */ g_topology_unlock(); bzero(&meta, sizeof(meta)); error = ddf_meta_read(cp, &meta); g_topology_lock(); if (error != 0) return (G_RAID_MD_TASTE_FAIL); be = meta.bigendian; /* Metadata valid. Print it. */ g_raid_md_ddf_print(&meta); /* Search for matching node. */ sc = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi = (struct g_raid_md_ddf_object *)sc->sc_md; if (mdi->mdio_bigendian != be) continue; break; } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; mdi->mdio_bigendian = be; snprintf(name, sizeof(name), "DDF%s", be ? "" : "-LE"); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; } /* There is no return after this point, so we close passed consumer. */ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO); pd->pd_meta = meta; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_ddf_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); } static int g_raid_md_event_ddf(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; sc = md->mdo_softc; if (disk == NULL) return (-1); switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* Delete disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); g_raid_md_ddf_purge_volumes(sc); /* Write updated metadata to all disks. */ g_raid_md_write_ddf(md, NULL, NULL, NULL); /* Check if anything left. */ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_ddf_refill(sc); return (0); } return (-2); } static int g_raid_md_volume_event_ddf(struct g_raid_md_object *md, struct g_raid_volume *vol, u_int event) { struct g_raid_md_ddf_pervolume *pv; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; switch (event) { case G_RAID_VOLUME_E_STARTMD: if (!pv->pv_started) g_raid_md_ddf_start(vol); return (0); } return (-2); } static int g_raid_md_ctl_ddf(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol, *vol1; struct g_raid_subdisk *sd; struct g_raid_disk *disk, *disks[DDF_MAX_DISKS_HARD]; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_sa_record *sa; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *nodename, *verb, *volname, *levelname, *diskname; char *tmp; int *nargs, *force; off_t size, sectorsize, strip, offs[DDF_MAX_DISKS_HARD], esize; intmax_t *sizearg, *striparg; int i, numdisks, len, level, qual; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_ddf_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_ddf_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = INT64_MAX; sectorsize = 0; bzero(disks, sizeof(disks)); bzero(offs, sizeof(offs)); for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) continue; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk != NULL) { if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' is in a " "wrong state (%s).", diskname, g_raid_disk_state2str(disk->d_state)); error = -7; break; } pd = disk->d_md_data; if (ddf_meta_count_vdc(&pd->pd_meta, NULL) >= GET16(&pd->pd_meta, hdr->Max_Partitions)) { gctl_error(req, "No free partitions " "on disk '%s'.", diskname); error = -7; break; } pp = disk->d_consumer->provider; disks[i] = disk; ddf_meta_unused_range(&pd->pd_meta, &offs[i], &esize); offs[i] *= pp->sectorsize; size = MIN(size, (off_t)esize * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); continue; } g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -8; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; disks[i] = disk; cp->private = disk; ddf_meta_create(disk, &mdi->mdio_meta); if (mdi->mdio_meta.hdr == NULL) ddf_meta_copy(&mdi->mdio_meta, &pd->pd_meta); else ddf_meta_update(&mdi->mdio_meta, &pd->pd_meta); g_topology_unlock(); g_raid_get_disk_info(disk); /* Reserve some space for metadata. */ size = MIN(size, GET64(&pd->pd_meta, pdr->entry[0].Configured_Size) * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); } if (error != 0) { for (i = 0; i < numdisks; i++) { if (disks[i] != NULL && disks[i]->d_state == G_RAID_DISK_S_NONE) g_raid_destroy_disk(disks[i]); } return (error); } if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1 || level == G_RAID_VOLUME_RL_RAID3 || level == G_RAID_VOLUME_RL_SINGLE || level == G_RAID_VOLUME_RL_CONCAT) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } /* We have all we need, create things: volume, ... */ pv = malloc(sizeof(*pv), M_MD_DDF, M_WAITOK | M_ZERO); ddf_vol_meta_create(&pv->pv_meta, &mdi->mdio_meta); pv->pv_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID3 || level == G_RAID_VOLUME_RL_RAID4 || level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else if (level == G_RAID_VOLUME_RL_RAID5R) { vol->v_mediasize = size * (numdisks - 1); vol->v_rotate_parity = 1024; } else if (level == G_RAID_VOLUME_RL_RAID6 || level == G_RAID_VOLUME_RL_RAID5E || level == G_RAID_VOLUME_RL_RAID5EE) vol->v_mediasize = size * (numdisks - 2); else if (level == G_RAID_VOLUME_RL_RAIDMDF) { if (numdisks < 5) vol->v_mdf_pdisks = 2; else vol->v_mdf_pdisks = 3; vol->v_mdf_polynomial = 0x11d; vol->v_mdf_method = 0x00; vol->v_mediasize = size * (numdisks - vol->v_mdf_pdisks); } else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ for (i = 0; i < numdisks; i++) { disk = disks[i]; sd = &vol->v_subdisks[i]; sd->sd_disk = disk; sd->sd_offset = offs[i]; sd->sd_size = size; if (disk == NULL) continue; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_ddf(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_ddf_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "add") == 0) { gctl_error(req, "`add` command is not applicable, " "use `label` instead."); return (-99); } if (strcmp(verb, "delete") == 0) { nodename = gctl_get_asciiparam(req, "arg0"); if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0) nodename = NULL; /* Full node destruction. */ if (*nargs == 1 && nodename != NULL) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) ddf_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } /* Destroy specified volume. If it was last - all node. */ if (*nargs > 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, nodename != NULL ? "arg1" : "arg0"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } /* Search for volume. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, volname) == 0) break; pp = vol->v_provider; if (pp == NULL) continue; if (strcmp(pp->name, volname) == 0) break; if (strncmp(pp->name, "raid/", 5) == 0 && strcmp(pp->name + 5, volname) == 0) break; } if (vol == NULL) { i = strtol(volname, &tmp, 10); if (verb != volname && tmp[0] == 0) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_global_id == i) break; } } } if (vol == NULL) { gctl_error(req, "Volume '%s' not found.", volname); return (-3); } /* Check if volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && vol->v_provider_open != 0) { gctl_error(req, "Volume is still open."); return (-4); } /* Destroy volume and potentially node. */ i = 0; TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) i++; if (i >= 2) { g_raid_destroy_volume(vol); g_raid_md_ddf_purge_disks(sc); g_raid_md_write_ddf(md, NULL, NULL, NULL); } else { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) ddf_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); } return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_ddf(md, NULL, disk); continue; } /* Erase metadata on deleting disk and destroy it. */ ddf_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); } g_raid_md_ddf_purge_volumes(sc); /* Write updated metadata to remaining disks. */ g_raid_md_write_ddf(md, NULL, NULL, NULL); /* Check if anything left. */ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_ddf_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; g_topology_unlock(); pd = malloc(sizeof(*pd), M_MD_DDF, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); ddf_meta_create(disk, &mdi->mdio_meta); sa = ddf_meta_find_sa(&pd->pd_meta, 1); if (sa != NULL) { SET32D(&pd->pd_meta, sa->Signature, DDF_SA_SIGNATURE); SET8D(&pd->pd_meta, sa->Spare_Type, 0); SET16D(&pd->pd_meta, sa->Populated_SAEs, 0); SET16D(&pd->pd_meta, sa->MAX_SAE_Supported, (GET16(&pd->pd_meta, hdr->Configuration_Record_Length) * pd->pd_meta.sectorsize - sizeof(struct ddf_sa_record)) / sizeof(struct ddf_sa_entry)); } if (mdi->mdio_meta.hdr == NULL) ddf_meta_copy(&mdi->mdio_meta, &pd->pd_meta); else ddf_meta_update(&mdi->mdio_meta, &pd->pd_meta); g_raid_md_write_ddf(md, NULL, NULL, NULL); g_raid_md_ddf_refill(sc); } return (error); } return (-100); } static int g_raid_md_write_ddf(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_ddf_perdisk *pd; struct g_raid_md_ddf_pervolume *pv; struct g_raid_md_ddf_object *mdi; struct ddf_meta *gmeta; struct ddf_vol_meta *vmeta; struct ddf_vdc_record *vdc; struct ddf_sa_record *sa; uint64_t *val2; int i, j, pos, bvd, size; sc = md->mdo_softc; mdi = (struct g_raid_md_ddf_object *)md; gmeta = &mdi->mdio_meta; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* * Clear disk flags to let only really needed ones to be reset. * Do it only if there are no volumes in starting state now, * as they can update disk statuses yet and we may kill innocent. */ if (mdi->mdio_starting == 0) { for (i = 0; i < GET16(gmeta, pdr->Populated_PDEs); i++) { if (isff(gmeta->pdr->entry[i].PD_GUID, 24)) continue; SET16(gmeta, pdr->entry[i].PD_Type, GET16(gmeta, pdr->entry[i].PD_Type) & ~(DDF_PDE_PARTICIPATING | DDF_PDE_GLOBAL_SPARE | DDF_PDE_CONFIG_SPARE)); if ((GET16(gmeta, pdr->entry[i].PD_State) & DDF_PDE_PFA) == 0) SET16(gmeta, pdr->entry[i].PD_State, 0); } } /* Generate/update new per-volume metadata. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; if (vol->v_stopping || !pv->pv_started) continue; vmeta = &pv->pv_meta; SET32(vmeta, vdc->Sequence_Number, GET32(vmeta, vdc->Sequence_Number) + 1); if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E && vol->v_disks_count % 2 == 0) SET16(vmeta, vdc->Primary_Element_Count, 2); else SET16(vmeta, vdc->Primary_Element_Count, vol->v_disks_count); SET8(vmeta, vdc->Stripe_Size, ffs(vol->v_strip_size / vol->v_sectorsize) - 1); if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E && vol->v_disks_count % 2 == 0) { SET8(vmeta, vdc->Primary_RAID_Level, DDF_VDCR_RAID1); SET8(vmeta, vdc->RLQ, 0); SET8(vmeta, vdc->Secondary_Element_Count, vol->v_disks_count / 2); SET8(vmeta, vdc->Secondary_RAID_Level, 0); } else { SET8(vmeta, vdc->Primary_RAID_Level, vol->v_raid_level); SET8(vmeta, vdc->RLQ, vol->v_raid_level_qualifier); SET8(vmeta, vdc->Secondary_Element_Count, 1); SET8(vmeta, vdc->Secondary_RAID_Level, 0); } SET8(vmeta, vdc->Secondary_Element_Seq, 0); SET64(vmeta, vdc->Block_Count, 0); SET64(vmeta, vdc->VD_Size, vol->v_mediasize / vol->v_sectorsize); SET16(vmeta, vdc->Block_Size, vol->v_sectorsize); SET8(vmeta, vdc->Rotate_Parity_count, fls(vol->v_rotate_parity) - 1); SET8(vmeta, vdc->MDF_Parity_Disks, vol->v_mdf_pdisks); SET16(vmeta, vdc->MDF_Parity_Generator_Polynomial, vol->v_mdf_polynomial); SET8(vmeta, vdc->MDF_Constant_Generation_Method, vol->v_mdf_method); SET16(vmeta, vde->VD_Number, vol->v_global_id); if (vol->v_state <= G_RAID_VOLUME_S_BROKEN) SET8(vmeta, vde->VD_State, DDF_VDE_FAILED); else if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED) SET8(vmeta, vde->VD_State, DDF_VDE_DEGRADED); else if (vol->v_state <= G_RAID_VOLUME_S_SUBOPTIMAL) SET8(vmeta, vde->VD_State, DDF_VDE_PARTIAL); else SET8(vmeta, vde->VD_State, DDF_VDE_OPTIMAL); if (vol->v_dirty || g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) > 0 || g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) > 0) SET8(vmeta, vde->VD_State, GET8(vmeta, vde->VD_State) | DDF_VDE_DIRTY); SET8(vmeta, vde->Init_State, DDF_VDE_INIT_FULL); // XXX ddf_meta_put_name(vmeta, vol->v_name); for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; bvd = i / GET16(vmeta, vdc->Primary_Element_Count); pos = i % GET16(vmeta, vdc->Primary_Element_Count); disk = sd->sd_disk; if (disk != NULL) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; if (vmeta->bvdc[bvd] == NULL) { size = GET16(vmeta, hdr->Configuration_Record_Length) * vmeta->sectorsize; vmeta->bvdc[bvd] = malloc(size, M_MD_DDF, M_WAITOK); memset(vmeta->bvdc[bvd], 0xff, size); } memcpy(vmeta->bvdc[bvd], vmeta->vdc, sizeof(struct ddf_vdc_record)); SET8(vmeta, bvdc[bvd]->Secondary_Element_Seq, bvd); SET64(vmeta, bvdc[bvd]->Block_Count, sd->sd_size / vol->v_sectorsize); SET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos], GET32(&pd->pd_meta, pdd->PD_Reference)); val2 = (uint64_t *)&(vmeta->bvdc[bvd]->Physical_Disk_Sequence[ GET16(vmeta, hdr->Max_Primary_Element_Entries)]); SET64P(vmeta, val2 + pos, sd->sd_offset / vol->v_sectorsize); } if (vmeta->bvdc[bvd] == NULL) continue; j = ddf_meta_find_pd(gmeta, NULL, GET32(vmeta, bvdc[bvd]->Physical_Disk_Sequence[pos])); if (j < 0) continue; SET16(gmeta, pdr->entry[j].PD_Type, GET16(gmeta, pdr->entry[j].PD_Type) | DDF_PDE_PARTICIPATING); if (sd->sd_state == G_RAID_SUBDISK_S_NONE) SET16(gmeta, pdr->entry[j].PD_State, GET16(gmeta, pdr->entry[j].PD_State) | (DDF_PDE_FAILED | DDF_PDE_MISSING)); else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) SET16(gmeta, pdr->entry[j].PD_State, GET16(gmeta, pdr->entry[j].PD_State) | (DDF_PDE_FAILED | DDF_PDE_PFA)); else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) SET16(gmeta, pdr->entry[j].PD_State, GET16(gmeta, pdr->entry[j].PD_State) | DDF_PDE_REBUILD); else SET16(gmeta, pdr->entry[j].PD_State, GET16(gmeta, pdr->entry[j].PD_State) | DDF_PDE_ONLINE); } } /* Mark spare and failed disks as such. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; i = ddf_meta_find_pd(gmeta, NULL, GET32(&pd->pd_meta, pdd->PD_Reference)); if (i < 0) continue; if (disk->d_state == G_RAID_DISK_S_FAILED) { SET16(gmeta, pdr->entry[i].PD_State, GET16(gmeta, pdr->entry[i].PD_State) | (DDF_PDE_FAILED | DDF_PDE_PFA)); } if (disk->d_state != G_RAID_DISK_S_SPARE) continue; sa = ddf_meta_find_sa(&pd->pd_meta, 0); if (sa == NULL || (GET8D(&pd->pd_meta, sa->Spare_Type) & DDF_SAR_TYPE_DEDICATED) == 0) { SET16(gmeta, pdr->entry[i].PD_Type, GET16(gmeta, pdr->entry[i].PD_Type) | DDF_PDE_GLOBAL_SPARE); } else { SET16(gmeta, pdr->entry[i].PD_Type, GET16(gmeta, pdr->entry[i].PD_Type) | DDF_PDE_CONFIG_SPARE); } SET16(gmeta, pdr->entry[i].PD_State, GET16(gmeta, pdr->entry[i].PD_State) | DDF_PDE_ONLINE); } /* Remove disks without "participating" flag (unused). */ for (i = 0, j = -1; i < GET16(gmeta, pdr->Populated_PDEs); i++) { if (isff(gmeta->pdr->entry[i].PD_GUID, 24)) continue; if ((GET16(gmeta, pdr->entry[i].PD_Type) & (DDF_PDE_PARTICIPATING | DDF_PDE_GLOBAL_SPARE | DDF_PDE_CONFIG_SPARE)) != 0 || g_raid_md_ddf_get_disk(sc, NULL, GET32(gmeta, pdr->entry[i].PD_Reference)) != NULL) j = i; else memset(&gmeta->pdr->entry[i], 0xff, sizeof(struct ddf_pd_entry)); } SET16(gmeta, pdr->Populated_PDEs, j + 1); /* Update per-disk metadata and write them. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) continue; /* Update PDR. */ memcpy(pd->pd_meta.pdr, gmeta->pdr, GET32(&pd->pd_meta, hdr->pdr_length) * pd->pd_meta.sectorsize); /* Update VDR. */ SET16(&pd->pd_meta, vdr->Populated_VDEs, 0); TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_stopping) continue; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; i = ddf_meta_find_vd(&pd->pd_meta, pv->pv_meta.vde->VD_GUID); if (i < 0) i = ddf_meta_find_vd(&pd->pd_meta, NULL); if (i >= 0) memcpy(&pd->pd_meta.vdr->entry[i], pv->pv_meta.vde, sizeof(struct ddf_vd_entry)); } /* Update VDC. */ if (mdi->mdio_starting == 0) { /* Remove all VDCs to restore needed later. */ j = GETCRNUM(&pd->pd_meta); for (i = 0; i < j; i++) { vdc = GETVDCPTR(&pd->pd_meta, i); if (GET32D(&pd->pd_meta, vdc->Signature) != DDF_VDCR_SIGNATURE) continue; SET32D(&pd->pd_meta, vdc->Signature, 0xffffffff); } } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { vol = sd->sd_volume; if (vol->v_stopping) continue; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; vmeta = &pv->pv_meta; vdc = ddf_meta_find_vdc(&pd->pd_meta, vmeta->vde->VD_GUID); if (vdc == NULL) vdc = ddf_meta_find_vdc(&pd->pd_meta, NULL); if (vdc != NULL) { bvd = sd->sd_pos / GET16(vmeta, vdc->Primary_Element_Count); memcpy(vdc, vmeta->bvdc[bvd], GET16(&pd->pd_meta, hdr->Configuration_Record_Length) * pd->pd_meta.sectorsize); } } G_RAID_DEBUG(1, "Writing DDF metadata to %s", g_raid_get_diskname(disk)); g_raid_md_ddf_print(&pd->pd_meta); ddf_meta_write(disk->d_consumer, &pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_ddf(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_ddf_perdisk *pd; struct g_raid_subdisk *sd; int i; sc = md->mdo_softc; pd = (struct g_raid_md_ddf_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (tdisk->d_state != G_RAID_DISK_S_ACTIVE) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent it's later resurrection as STALE. */ G_RAID_DEBUG(1, "Writing DDF metadata to %s", g_raid_get_diskname(tdisk)); i = ddf_meta_find_pd(&pd->pd_meta, NULL, GET32(&pd->pd_meta, pdd->PD_Reference)); SET16(&pd->pd_meta, pdr->entry[i].PD_State, DDF_PDE_FAILED | DDF_PDE_PFA); if (tdisk->d_consumer != NULL) ddf_meta_write(tdisk->d_consumer, &pd->pd_meta); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_ddf(md, NULL, NULL, tdisk); g_raid_md_ddf_refill(sc); return (0); } static int g_raid_md_free_disk_ddf(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_ddf_perdisk *pd; pd = (struct g_raid_md_ddf_perdisk *)disk->d_md_data; ddf_meta_free(&pd->pd_meta); free(pd, M_MD_DDF); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_volume_ddf(struct g_raid_md_object *md, struct g_raid_volume *vol) { struct g_raid_md_ddf_object *mdi; struct g_raid_md_ddf_pervolume *pv; mdi = (struct g_raid_md_ddf_object *)md; pv = (struct g_raid_md_ddf_pervolume *)vol->v_md_data; ddf_vol_meta_free(&pv->pv_meta); if (!pv->pv_started) { pv->pv_started = 1; mdi->mdio_starting--; callout_stop(&pv->pv_start_co); } free(pv, M_MD_DDF); vol->v_md_data = NULL; return (0); } static int g_raid_md_free_ddf(struct g_raid_md_object *md) { struct g_raid_md_ddf_object *mdi; mdi = (struct g_raid_md_ddf_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } ddf_meta_free(&mdi->mdio_meta); return (0); } G_RAID_MD_DECLARE(ddf, "DDF"); Index: head/sys/geom/raid/md_intel.c =================================================================== --- head/sys/geom/raid/md_intel.c (revision 308456) +++ head/sys/geom/raid/md_intel.c (revision 308457) @@ -1,2714 +1,2714 @@ /*- * Copyright (c) 2010 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_INTEL, "md_intel_data", "GEOM_RAID Intel metadata"); struct intel_raid_map { uint32_t offset; uint32_t disk_sectors; uint32_t stripe_count; uint16_t strip_sectors; uint8_t status; #define INTEL_S_READY 0x00 #define INTEL_S_UNINITIALIZED 0x01 #define INTEL_S_DEGRADED 0x02 #define INTEL_S_FAILURE 0x03 uint8_t type; #define INTEL_T_RAID0 0x00 #define INTEL_T_RAID1 0x01 #define INTEL_T_RAID5 0x05 uint8_t total_disks; uint8_t total_domains; uint8_t failed_disk_num; uint8_t ddf; uint32_t offset_hi; uint32_t disk_sectors_hi; uint32_t stripe_count_hi; uint32_t filler_2[4]; uint32_t disk_idx[1]; /* total_disks entries. */ #define INTEL_DI_IDX 0x00ffffff #define INTEL_DI_RBLD 0x01000000 } __packed; struct intel_raid_vol { uint8_t name[16]; u_int64_t total_sectors __packed; uint32_t state; #define INTEL_ST_BOOTABLE 0x00000001 #define INTEL_ST_BOOT_DEVICE 0x00000002 #define INTEL_ST_READ_COALESCING 0x00000004 #define INTEL_ST_WRITE_COALESCING 0x00000008 #define INTEL_ST_LAST_SHUTDOWN_DIRTY 0x00000010 #define INTEL_ST_HIDDEN_AT_BOOT 0x00000020 #define INTEL_ST_CURRENTLY_HIDDEN 0x00000040 #define INTEL_ST_VERIFY_AND_FIX 0x00000080 #define INTEL_ST_MAP_STATE_UNINIT 0x00000100 #define INTEL_ST_NO_AUTO_RECOVERY 0x00000200 #define INTEL_ST_CLONE_N_GO 0x00000400 #define INTEL_ST_CLONE_MAN_SYNC 0x00000800 #define INTEL_ST_CNG_MASTER_DISK_NUM 0x00001000 uint32_t reserved; uint8_t migr_priority; uint8_t num_sub_vols; uint8_t tid; uint8_t cng_master_disk; uint16_t cache_policy; uint8_t cng_state; #define INTEL_CNGST_UPDATED 0 #define INTEL_CNGST_NEEDS_UPDATE 1 #define INTEL_CNGST_MASTER_MISSING 2 uint8_t cng_sub_state; uint32_t filler_0[10]; uint32_t curr_migr_unit; uint32_t checkpoint_id; uint8_t migr_state; uint8_t migr_type; #define INTEL_MT_INIT 0 #define INTEL_MT_REBUILD 1 #define INTEL_MT_VERIFY 2 #define INTEL_MT_GEN_MIGR 3 #define INTEL_MT_STATE_CHANGE 4 #define INTEL_MT_REPAIR 5 uint8_t dirty; uint8_t fs_state; uint16_t verify_errors; uint16_t bad_blocks; uint32_t curr_migr_unit_hi; uint32_t filler_1[3]; struct intel_raid_map map[1]; /* 2 entries if migr_state != 0. */ } __packed; struct intel_raid_disk { #define INTEL_SERIAL_LEN 16 uint8_t serial[INTEL_SERIAL_LEN]; uint32_t sectors; uint32_t id; uint32_t flags; #define INTEL_F_SPARE 0x01 #define INTEL_F_ASSIGNED 0x02 #define INTEL_F_FAILED 0x04 #define INTEL_F_ONLINE 0x08 #define INTEL_F_DISABLED 0x80 uint32_t owner_cfg_num; uint32_t sectors_hi; uint32_t filler[3]; } __packed; struct intel_raid_conf { uint8_t intel_id[24]; #define INTEL_MAGIC "Intel Raid ISM Cfg Sig. " uint8_t version[6]; #define INTEL_VERSION_1000 "1.0.00" /* RAID0 */ #define INTEL_VERSION_1100 "1.1.00" /* RAID1 */ #define INTEL_VERSION_1200 "1.2.00" /* Many volumes */ #define INTEL_VERSION_1201 "1.2.01" /* 3 or 4 disks */ #define INTEL_VERSION_1202 "1.2.02" /* RAID5 */ #define INTEL_VERSION_1204 "1.2.04" /* 5 or 6 disks */ #define INTEL_VERSION_1206 "1.2.06" /* CNG */ #define INTEL_VERSION_1300 "1.3.00" /* Attributes */ uint8_t dummy_0[2]; uint32_t checksum; uint32_t config_size; uint32_t config_id; uint32_t generation; uint32_t error_log_size; uint32_t attributes; #define INTEL_ATTR_RAID0 0x00000001 #define INTEL_ATTR_RAID1 0x00000002 #define INTEL_ATTR_RAID10 0x00000004 #define INTEL_ATTR_RAID1E 0x00000008 #define INTEL_ATTR_RAID5 0x00000010 #define INTEL_ATTR_RAIDCNG 0x00000020 #define INTEL_ATTR_EXT_STRIP 0x00000040 #define INTEL_ATTR_NVM_CACHE 0x02000000 #define INTEL_ATTR_2TB_DISK 0x04000000 #define INTEL_ATTR_BBM 0x08000000 #define INTEL_ATTR_NVM_CACHE2 0x10000000 #define INTEL_ATTR_2TB 0x20000000 #define INTEL_ATTR_PM 0x40000000 #define INTEL_ATTR_CHECKSUM 0x80000000 uint8_t total_disks; uint8_t total_volumes; uint8_t error_log_pos; uint8_t dummy_2[1]; uint32_t cache_size; uint32_t orig_config_id; uint32_t pwr_cycle_count; uint32_t bbm_log_size; uint32_t filler_0[35]; struct intel_raid_disk disk[1]; /* total_disks entries. */ /* Here goes total_volumes of struct intel_raid_vol. */ } __packed; #define INTEL_ATTR_SUPPORTED ( INTEL_ATTR_RAID0 | INTEL_ATTR_RAID1 | \ INTEL_ATTR_RAID10 | INTEL_ATTR_RAID1E | INTEL_ATTR_RAID5 | \ INTEL_ATTR_RAIDCNG | INTEL_ATTR_EXT_STRIP | INTEL_ATTR_2TB_DISK | \ INTEL_ATTR_2TB | INTEL_ATTR_PM | INTEL_ATTR_CHECKSUM ) #define INTEL_MAX_MD_SIZE(ndisks) \ (sizeof(struct intel_raid_conf) + \ sizeof(struct intel_raid_disk) * (ndisks - 1) + \ sizeof(struct intel_raid_vol) * 2 + \ sizeof(struct intel_raid_map) * 2 + \ sizeof(uint32_t) * (ndisks - 1) * 4) struct g_raid_md_intel_perdisk { struct intel_raid_conf *pd_meta; int pd_disk_pos; struct intel_raid_disk pd_disk_meta; }; struct g_raid_md_intel_pervolume { int pv_volume_pos; int pv_cng; int pv_cng_man_sync; int pv_cng_master_disk; }; struct g_raid_md_intel_object { struct g_raid_md_object mdio_base; uint32_t mdio_config_id; uint32_t mdio_orig_config_id; uint32_t mdio_generation; struct intel_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. */ }; static g_raid_md_create_t g_raid_md_create_intel; static g_raid_md_taste_t g_raid_md_taste_intel; static g_raid_md_event_t g_raid_md_event_intel; static g_raid_md_ctl_t g_raid_md_ctl_intel; static g_raid_md_write_t g_raid_md_write_intel; static g_raid_md_fail_disk_t g_raid_md_fail_disk_intel; static g_raid_md_free_disk_t g_raid_md_free_disk_intel; static g_raid_md_free_volume_t g_raid_md_free_volume_intel; static g_raid_md_free_t g_raid_md_free_intel; static kobj_method_t g_raid_md_intel_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_intel), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_intel), KOBJMETHOD(g_raid_md_event, g_raid_md_event_intel), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_intel), KOBJMETHOD(g_raid_md_write, g_raid_md_write_intel), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_intel), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_intel), KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_intel), KOBJMETHOD(g_raid_md_free, g_raid_md_free_intel), { 0, 0 } }; static struct g_raid_md_class g_raid_md_intel_class = { "Intel", g_raid_md_intel_methods, sizeof(struct g_raid_md_intel_object), .mdc_enable = 1, .mdc_priority = 100 }; static struct intel_raid_map * intel_get_map(struct intel_raid_vol *mvol, int i) { struct intel_raid_map *mmap; if (i > (mvol->migr_state ? 1 : 0)) return (NULL); mmap = &mvol->map[0]; for (; i > 0; i--) { mmap = (struct intel_raid_map *) &mmap->disk_idx[mmap->total_disks]; } return ((struct intel_raid_map *)mmap); } static struct intel_raid_vol * intel_get_volume(struct intel_raid_conf *meta, int i) { struct intel_raid_vol *mvol; struct intel_raid_map *mmap; if (i > 1) return (NULL); mvol = (struct intel_raid_vol *)&meta->disk[meta->total_disks]; for (; i > 0; i--) { mmap = intel_get_map(mvol, mvol->migr_state ? 1 : 0); mvol = (struct intel_raid_vol *) &mmap->disk_idx[mmap->total_disks]; } return (mvol); } static off_t intel_get_map_offset(struct intel_raid_map *mmap) { off_t offset = (off_t)mmap->offset_hi << 32; offset += mmap->offset; return (offset); } static void intel_set_map_offset(struct intel_raid_map *mmap, off_t offset) { mmap->offset = offset & 0xffffffff; mmap->offset_hi = offset >> 32; } static off_t intel_get_map_disk_sectors(struct intel_raid_map *mmap) { off_t disk_sectors = (off_t)mmap->disk_sectors_hi << 32; disk_sectors += mmap->disk_sectors; return (disk_sectors); } static void intel_set_map_disk_sectors(struct intel_raid_map *mmap, off_t disk_sectors) { mmap->disk_sectors = disk_sectors & 0xffffffff; mmap->disk_sectors_hi = disk_sectors >> 32; } static void intel_set_map_stripe_count(struct intel_raid_map *mmap, off_t stripe_count) { mmap->stripe_count = stripe_count & 0xffffffff; mmap->stripe_count_hi = stripe_count >> 32; } static off_t intel_get_disk_sectors(struct intel_raid_disk *disk) { off_t sectors = (off_t)disk->sectors_hi << 32; sectors += disk->sectors; return (sectors); } static void intel_set_disk_sectors(struct intel_raid_disk *disk, off_t sectors) { disk->sectors = sectors & 0xffffffff; disk->sectors_hi = sectors >> 32; } static off_t intel_get_vol_curr_migr_unit(struct intel_raid_vol *vol) { off_t curr_migr_unit = (off_t)vol->curr_migr_unit_hi << 32; curr_migr_unit += vol->curr_migr_unit; return (curr_migr_unit); } static void intel_set_vol_curr_migr_unit(struct intel_raid_vol *vol, off_t curr_migr_unit) { vol->curr_migr_unit = curr_migr_unit & 0xffffffff; vol->curr_migr_unit_hi = curr_migr_unit >> 32; } static char * intel_status2str(int status) { switch (status) { case INTEL_S_READY: return ("READY"); case INTEL_S_UNINITIALIZED: return ("UNINITIALIZED"); case INTEL_S_DEGRADED: return ("DEGRADED"); case INTEL_S_FAILURE: return ("FAILURE"); default: return ("UNKNOWN"); } } static char * intel_type2str(int type) { switch (type) { case INTEL_T_RAID0: return ("RAID0"); case INTEL_T_RAID1: return ("RAID1"); case INTEL_T_RAID5: return ("RAID5"); default: return ("UNKNOWN"); } } static char * intel_cngst2str(int cng_state) { switch (cng_state) { case INTEL_CNGST_UPDATED: return ("UPDATED"); case INTEL_CNGST_NEEDS_UPDATE: return ("NEEDS_UPDATE"); case INTEL_CNGST_MASTER_MISSING: return ("MASTER_MISSING"); default: return ("UNKNOWN"); } } static char * intel_mt2str(int type) { switch (type) { case INTEL_MT_INIT: return ("INIT"); case INTEL_MT_REBUILD: return ("REBUILD"); case INTEL_MT_VERIFY: return ("VERIFY"); case INTEL_MT_GEN_MIGR: return ("GEN_MIGR"); case INTEL_MT_STATE_CHANGE: return ("STATE_CHANGE"); case INTEL_MT_REPAIR: return ("REPAIR"); default: return ("UNKNOWN"); } } static void g_raid_md_intel_print(struct intel_raid_conf *meta) { struct intel_raid_vol *mvol; struct intel_raid_map *mmap; int i, j, k; if (g_raid_debug < 1) return; printf("********* ATA Intel MatrixRAID Metadata *********\n"); printf("intel_id <%.24s>\n", meta->intel_id); printf("version <%.6s>\n", meta->version); printf("checksum 0x%08x\n", meta->checksum); printf("config_size 0x%08x\n", meta->config_size); printf("config_id 0x%08x\n", meta->config_id); printf("generation 0x%08x\n", meta->generation); printf("error_log_size %d\n", meta->error_log_size); printf("attributes 0x%b\n", meta->attributes, "\020" "\001RAID0" "\002RAID1" "\003RAID10" "\004RAID1E" "\005RAID15" "\006RAIDCNG" "\007EXT_STRIP" "\032NVM_CACHE" "\0332TB_DISK" "\034BBM" "\035NVM_CACHE" "\0362TB" "\037PM" "\040CHECKSUM"); printf("total_disks %u\n", meta->total_disks); printf("total_volumes %u\n", meta->total_volumes); printf("error_log_pos %u\n", meta->error_log_pos); printf("cache_size %u\n", meta->cache_size); printf("orig_config_id 0x%08x\n", meta->orig_config_id); printf("pwr_cycle_count %u\n", meta->pwr_cycle_count); printf("bbm_log_size %u\n", meta->bbm_log_size); printf("Flags: S - Spare, A - Assigned, F - Failed, O - Online, D - Disabled\n"); printf("DISK# serial disk_sectors disk_sectors_hi disk_id flags owner\n"); for (i = 0; i < meta->total_disks; i++ ) { printf(" %d <%.16s> %u %u 0x%08x 0x%b %08x\n", i, meta->disk[i].serial, meta->disk[i].sectors, meta->disk[i].sectors_hi, meta->disk[i].id, meta->disk[i].flags, "\20\01S\02A\03F\04O\05D", meta->disk[i].owner_cfg_num); } for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); printf(" ****** Volume %d ******\n", i); printf(" name %.16s\n", mvol->name); printf(" total_sectors %ju\n", mvol->total_sectors); printf(" state 0x%b\n", mvol->state, "\020" "\001BOOTABLE" "\002BOOT_DEVICE" "\003READ_COALESCING" "\004WRITE_COALESCING" "\005LAST_SHUTDOWN_DIRTY" "\006HIDDEN_AT_BOOT" "\007CURRENTLY_HIDDEN" "\010VERIFY_AND_FIX" "\011MAP_STATE_UNINIT" "\012NO_AUTO_RECOVERY" "\013CLONE_N_GO" "\014CLONE_MAN_SYNC" "\015CNG_MASTER_DISK_NUM"); printf(" reserved %u\n", mvol->reserved); printf(" migr_priority %u\n", mvol->migr_priority); printf(" num_sub_vols %u\n", mvol->num_sub_vols); printf(" tid %u\n", mvol->tid); printf(" cng_master_disk %u\n", mvol->cng_master_disk); printf(" cache_policy %u\n", mvol->cache_policy); printf(" cng_state %u (%s)\n", mvol->cng_state, intel_cngst2str(mvol->cng_state)); printf(" cng_sub_state %u\n", mvol->cng_sub_state); printf(" curr_migr_unit %u\n", mvol->curr_migr_unit); printf(" curr_migr_unit_hi %u\n", mvol->curr_migr_unit_hi); printf(" checkpoint_id %u\n", mvol->checkpoint_id); printf(" migr_state %u\n", mvol->migr_state); printf(" migr_type %u (%s)\n", mvol->migr_type, intel_mt2str(mvol->migr_type)); printf(" dirty %u\n", mvol->dirty); printf(" fs_state %u\n", mvol->fs_state); printf(" verify_errors %u\n", mvol->verify_errors); printf(" bad_blocks %u\n", mvol->bad_blocks); for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) { printf(" *** Map %d ***\n", j); mmap = intel_get_map(mvol, j); printf(" offset %u\n", mmap->offset); printf(" offset_hi %u\n", mmap->offset_hi); printf(" disk_sectors %u\n", mmap->disk_sectors); printf(" disk_sectors_hi %u\n", mmap->disk_sectors_hi); printf(" stripe_count %u\n", mmap->stripe_count); printf(" stripe_count_hi %u\n", mmap->stripe_count_hi); printf(" strip_sectors %u\n", mmap->strip_sectors); printf(" status %u (%s)\n", mmap->status, intel_status2str(mmap->status)); printf(" type %u (%s)\n", mmap->type, intel_type2str(mmap->type)); printf(" total_disks %u\n", mmap->total_disks); printf(" total_domains %u\n", mmap->total_domains); printf(" failed_disk_num %u\n", mmap->failed_disk_num); printf(" ddf %u\n", mmap->ddf); printf(" disk_idx "); for (k = 0; k < mmap->total_disks; k++) printf(" 0x%08x", mmap->disk_idx[k]); printf("\n"); } } printf("=================================================\n"); } static struct intel_raid_conf * intel_meta_copy(struct intel_raid_conf *meta) { struct intel_raid_conf *nmeta; nmeta = malloc(meta->config_size, M_MD_INTEL, M_WAITOK); memcpy(nmeta, meta, meta->config_size); return (nmeta); } static int intel_meta_find_disk(struct intel_raid_conf *meta, char *serial) { int pos; for (pos = 0; pos < meta->total_disks; pos++) { if (strncmp(meta->disk[pos].serial, serial, INTEL_SERIAL_LEN) == 0) return (pos); } return (-1); } static struct intel_raid_conf * intel_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap, *mmap1; char *buf; int error, i, j, k, left, size; uint32_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize * 2, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct intel_raid_conf *)buf; /* Check if this is an Intel RAID struct */ if (strncmp(meta->intel_id, INTEL_MAGIC, strlen(INTEL_MAGIC))) { G_RAID_DEBUG(1, "Intel signature check failed on %s", pp->name); g_free(buf); return (NULL); } if (meta->config_size > 65536 || meta->config_size < sizeof(struct intel_raid_conf)) { G_RAID_DEBUG(1, "Intel metadata size looks wrong: %d", meta->config_size); g_free(buf); return (NULL); } size = meta->config_size; meta = malloc(size, M_MD_INTEL, M_WAITOK); memcpy(meta, buf, min(size, pp->sectorsize)); g_free(buf); /* Read all the rest, if needed. */ if (meta->config_size > pp->sectorsize) { left = (meta->config_size - 1) / pp->sectorsize; buf = g_read_data(cp, pp->mediasize - pp->sectorsize * (2 + left), pp->sectorsize * left, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read remaining metadata" " part from %s (error=%d).", pp->name, error); free(meta, M_MD_INTEL); return (NULL); } memcpy(((char *)meta) + pp->sectorsize, buf, pp->sectorsize * left); g_free(buf); } /* Check metadata checksum. */ for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < (meta->config_size / sizeof(uint32_t)); i++) { checksum += *ptr++; } checksum -= meta->checksum; if (checksum != meta->checksum) { G_RAID_DEBUG(1, "Intel checksum check failed on %s", pp->name); free(meta, M_MD_INTEL); return (NULL); } /* Validate metadata size. */ size = sizeof(struct intel_raid_conf) + sizeof(struct intel_raid_disk) * (meta->total_disks - 1) + sizeof(struct intel_raid_vol) * meta->total_volumes; if (size > meta->config_size) { badsize: G_RAID_DEBUG(1, "Intel metadata size incorrect %d < %d", meta->config_size, size); free(meta, M_MD_INTEL); return (NULL); } for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); mmap = intel_get_map(mvol, 0); size += 4 * (mmap->total_disks - 1); if (size > meta->config_size) goto badsize; if (mvol->migr_state) { size += sizeof(struct intel_raid_map); if (size > meta->config_size) goto badsize; mmap = intel_get_map(mvol, 1); size += 4 * (mmap->total_disks - 1); if (size > meta->config_size) goto badsize; } } g_raid_md_intel_print(meta); if (strncmp(meta->version, INTEL_VERSION_1300, 6) > 0) { G_RAID_DEBUG(1, "Intel unsupported version: '%.6s'", meta->version); free(meta, M_MD_INTEL); return (NULL); } if (strncmp(meta->version, INTEL_VERSION_1300, 6) >= 0 && (meta->attributes & ~INTEL_ATTR_SUPPORTED) != 0) { G_RAID_DEBUG(1, "Intel unsupported attributes: 0x%08x", meta->attributes & ~INTEL_ATTR_SUPPORTED); free(meta, M_MD_INTEL); return (NULL); } /* Validate disk indexes. */ for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); for (j = 0; j < (mvol->migr_state ? 2 : 1); j++) { mmap = intel_get_map(mvol, j); for (k = 0; k < mmap->total_disks; k++) { if ((mmap->disk_idx[k] & INTEL_DI_IDX) > meta->total_disks) { G_RAID_DEBUG(1, "Intel metadata disk" " index %d too big (>%d)", mmap->disk_idx[k] & INTEL_DI_IDX, meta->total_disks); free(meta, M_MD_INTEL); return (NULL); } } } } /* Validate migration types. */ for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); /* Deny unknown migration types. */ if (mvol->migr_state && mvol->migr_type != INTEL_MT_INIT && mvol->migr_type != INTEL_MT_REBUILD && mvol->migr_type != INTEL_MT_VERIFY && mvol->migr_type != INTEL_MT_GEN_MIGR && mvol->migr_type != INTEL_MT_REPAIR) { G_RAID_DEBUG(1, "Intel metadata has unsupported" " migration type %d", mvol->migr_type); free(meta, M_MD_INTEL); return (NULL); } /* Deny general migrations except SINGLE->RAID1. */ if (mvol->migr_state && mvol->migr_type == INTEL_MT_GEN_MIGR) { mmap = intel_get_map(mvol, 0); mmap1 = intel_get_map(mvol, 1); if (mmap1->total_disks != 1 || mmap->type != INTEL_T_RAID1 || mmap->total_disks != 2 || mmap->offset != mmap1->offset || mmap->disk_sectors != mmap1->disk_sectors || mmap->total_domains != mmap->total_disks || mmap->offset_hi != mmap1->offset_hi || mmap->disk_sectors_hi != mmap1->disk_sectors_hi || (mmap->disk_idx[0] != mmap1->disk_idx[0] && mmap->disk_idx[0] != mmap1->disk_idx[1])) { G_RAID_DEBUG(1, "Intel metadata has unsupported" " variant of general migration"); free(meta, M_MD_INTEL); return (NULL); } } } return (meta); } static int intel_meta_write(struct g_consumer *cp, struct intel_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i, sectors; uint32_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < (meta->config_size / sizeof(uint32_t)); i++) { checksum += *ptr++; } meta->checksum = checksum; /* Create and fill buffer. */ sectors = howmany(meta->config_size, pp->sectorsize); buf = malloc(sectors * pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO); if (sectors > 1) { memcpy(buf, ((char *)meta) + pp->sectorsize, (sectors - 1) * pp->sectorsize); } memcpy(buf + (sectors - 1) * pp->sectorsize, meta, pp->sectorsize); error = g_write_data(cp, pp->mediasize - pp->sectorsize * (1 + sectors), buf, pp->sectorsize * sectors); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_INTEL); return (error); } static int intel_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_INTEL, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_INTEL); return (error); } static int intel_meta_write_spare(struct g_consumer *cp, struct intel_raid_disk *d) { struct intel_raid_conf *meta; int error; /* Fill anchor and single disk. */ meta = malloc(INTEL_MAX_MD_SIZE(1), M_MD_INTEL, M_WAITOK | M_ZERO); memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1); memcpy(&meta->version[0], INTEL_VERSION_1000, sizeof(INTEL_VERSION_1000) - 1); meta->config_size = INTEL_MAX_MD_SIZE(1); meta->config_id = meta->orig_config_id = arc4random(); meta->generation = 1; meta->total_disks = 1; meta->disk[0] = *d; error = intel_meta_write(cp, meta); free(meta, M_MD_INTEL); return (error); } static struct g_raid_disk * g_raid_md_intel_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_intel_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_intel_supported(int level, int qual, int disks, int force) { switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (!force && (disks != 4)) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (!force && disks > 6) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static struct g_raid_volume * g_raid_md_intel_get_volume(struct g_raid_softc *sc, int id) { struct g_raid_volume *mvol; struct g_raid_md_intel_pervolume *pv; TAILQ_FOREACH(mvol, &sc->sc_volumes, v_next) { pv = mvol->v_md_data; if (pv->pv_volume_pos == id) break; } return (mvol); } static int g_raid_md_intel_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_pervolume *pv; struct g_raid_md_intel_perdisk *pd, *oldpd; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap0, *mmap1; int disk_pos, resurrection = 0, migr_global, i; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; olddisk = NULL; - /* Find disk position in metadata by it's serial. */ + /* Find disk position in metadata by its serial. */ disk_pos = intel_meta_find_disk(meta, pd->pd_disk_meta.serial); if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* Failed stale disk is useless for us. */ if ((pd->pd_disk_meta.flags & INTEL_F_FAILED) && !(pd->pd_disk_meta.flags & INTEL_F_DISABLED)) { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); return (0); } /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. */ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { off_t disk_sectors = intel_get_disk_sectors(&pd->pd_disk_meta); if (sd->sd_offset + sd->sd_size + 4096 > disk_sectors * 512) { G_RAID_DEBUG1(1, sc, "Disk too small (%llu < %llu)", (unsigned long long) disk_sectors * 512, (unsigned long long) sd->sd_offset + sd->sd_size + 4096); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: if (pd->pd_disk_meta.flags & INTEL_F_SPARE) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } } oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_intel_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More than one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_intel_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); /* Update global metadata just in case. */ memcpy(&meta->disk[disk_pos], &pd->pd_disk_meta, sizeof(struct intel_raid_disk)); } /* Welcome the new disk. */ if ((meta->disk[disk_pos].flags & INTEL_F_DISABLED) && !(pd->pd_disk_meta.flags & INTEL_F_SPARE)) g_raid_change_disk_state(disk, G_RAID_DISK_S_DISABLED); else if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); else if (meta->disk[disk_pos].flags & INTEL_F_SPARE) g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); else g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { pv = sd->sd_volume->v_md_data; mvol = intel_get_volume(meta, pv->pv_volume_pos); mmap0 = intel_get_map(mvol, 0); if (mvol->migr_state) mmap1 = intel_get_map(mvol, 1); else mmap1 = mmap0; migr_global = 1; for (i = 0; i < mmap0->total_disks; i++) { if ((mmap0->disk_idx[i] & INTEL_DI_RBLD) == 0 && (mmap1->disk_idx[i] & INTEL_DI_RBLD) != 0) migr_global = 0; } if ((meta->disk[disk_pos].flags & INTEL_F_DISABLED) && !(pd->pd_disk_meta.flags & INTEL_F_SPARE)) { /* Disabled disk, useless. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); } else if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (meta->disk[disk_pos].flags & INTEL_F_FAILED) { /* Failed disk, almost useless. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } else if (mvol->migr_state == 0) { if (mmap0->status == INTEL_S_UNINITIALIZED && (!pv->pv_cng || pv->pv_cng_master_disk != disk_pos)) { /* Freshly created uninitialized volume. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); } else if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (mvol->dirty && (!pv->pv_cng || pv->pv_cng_master_disk != disk_pos)) { /* Dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else if (mvol->migr_type == INTEL_MT_INIT || mvol->migr_type == INTEL_MT_REBUILD) { if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); if (mvol->dirty) { sd->sd_rebuild_pos = 0; } else { sd->sd_rebuild_pos = intel_get_vol_curr_migr_unit(mvol) * sd->sd_volume->v_strip_size * mmap0->total_domains; } } else if (mvol->migr_type == INTEL_MT_INIT && migr_global) { /* Freshly created uninitialized volume. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); } else if (mvol->dirty && (!pv->pv_cng || pv->pv_cng_master_disk != disk_pos)) { /* Dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else if (mvol->migr_type == INTEL_MT_VERIFY || mvol->migr_type == INTEL_MT_REPAIR) { if (mmap0->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if ((mmap1->disk_idx[sd->sd_pos] & INTEL_DI_RBLD) || migr_global) { /* Resyncing disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_RESYNC); if (mvol->dirty) { sd->sd_rebuild_pos = 0; } else { sd->sd_rebuild_pos = intel_get_vol_curr_migr_unit(mvol) * sd->sd_volume->v_strip_size * mmap0->total_domains; } } else if (mvol->dirty) { /* Dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else if (mvol->migr_type == INTEL_MT_GEN_MIGR) { if ((mmap1->disk_idx[0] & INTEL_DI_IDX) != disk_pos) { /* Freshly inserted disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. */ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) + g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED) < meta->total_disks); } return (resurrection); } static void g_disk_md_intel_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_INTEL); } static void g_raid_md_intel_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct intel_raid_conf *meta; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; meta = mdi->mdio_meta; update = 0; do { /* Make sure we miss anything. */ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) + g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED); if (na == meta->total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, meta->total_disks); /* Try to get use some of STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_intel_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE || disk->d_state == G_RAID_DISK_S_DISABLED) break; } } if (disk != NULL) continue; /* Try to get use some of SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_intel_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) { g_raid_md_write_intel(md, NULL, NULL, NULL); meta = mdi->mdio_meta; } /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) + g_raid_ndisks(sc, G_RAID_DISK_S_DISABLED) < meta->total_disks); /* Request retaste hoping to find spare. */ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_INTEL, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_intel_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_intel_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_pervolume *pv; struct g_raid_md_intel_perdisk *pd; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; int i, j, disk_pos; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. */ for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); mmap = intel_get_map(mvol, 0); vol = g_raid_create_volume(sc, mvol->name, mvol->tid - 1); pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO); pv->pv_volume_pos = i; pv->pv_cng = (mvol->state & INTEL_ST_CLONE_N_GO) != 0; pv->pv_cng_man_sync = (mvol->state & INTEL_ST_CLONE_MAN_SYNC) != 0; if (mvol->cng_master_disk < mmap->total_disks) pv->pv_cng_master_disk = mvol->cng_master_disk; vol->v_md_data = pv; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (mmap->type == INTEL_T_RAID0) vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; else if (mmap->type == INTEL_T_RAID1 && mmap->total_domains >= 2 && mmap->total_domains <= mmap->total_disks) { /* Assume total_domains is correct. */ if (mmap->total_domains == mmap->total_disks) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; else vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; } else if (mmap->type == INTEL_T_RAID1) { /* total_domains looks wrong. */ if (mmap->total_disks <= 2) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; else vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; } else if (mmap->type == INTEL_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; } else vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_strip_size = (u_int)mmap->strip_sectors * 512; //ZZZ vol->v_disks_count = mmap->total_disks; vol->v_mediasize = (off_t)mvol->total_sectors * 512; //ZZZ vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = intel_get_map_offset(mmap) * 512; //ZZZ sd->sd_size = intel_get_map_disk_sectors(mmap) * 512; //ZZZ } g_raid_start_volume(vol); } /* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < meta->total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; pd->pd_disk_meta = meta->disk[disk_pos]; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; for (i = 0; i < meta->total_volumes; i++) { mvol = intel_get_volume(meta, i); mmap = intel_get_map(mvol, 0); for (j = 0; j < mmap->total_disks; j++) { if ((mmap->disk_idx[j] & INTEL_DI_IDX) == disk_pos) break; } if (j == mmap->total_disks) continue; vol = g_raid_md_intel_get_volume(sc, i); sd = &vol->v_subdisks[j]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } } /* Make all disks found till the moment take their places. */ do { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_NONE) { g_raid_md_intel_start_disk(disk); break; } } } while (disk != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_intel(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_intel_refill(sc); TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); } callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_intel_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; struct intel_raid_conf *pdmeta; struct g_raid_md_intel_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_intel_start_disk(disk)) g_raid_md_write_intel(md, NULL, NULL, NULL); } else { /* If we haven't started yet - check metadata freshness. */ if (mdi->mdio_meta == NULL || ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_INTEL); mdi->mdio_meta = intel_meta_copy(pdmeta); mdi->mdio_generation = mdi->mdio_meta->generation; mdi->mdio_disks_present = 1; } else if (pdmeta->generation == mdi->mdio_generation) { mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", mdi->mdio_disks_present, mdi->mdio_meta->total_disks); } else { G_RAID_DEBUG1(1, sc, "Older disk"); } /* If we collected all needed disks - start array. */ if (mdi->mdio_disks_present == mdi->mdio_meta->total_disks) g_raid_md_intel_start(sc); } } static void g_raid_intel_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_intel_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_intel_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_intel(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_intel_object *mdi; char name[16]; mdi = (struct g_raid_md_intel_object *)md; mdi->mdio_config_id = mdi->mdio_orig_config_id = arc4random(); mdi->mdio_generation = 0; snprintf(name, sizeof(name), "Intel-%08x", mdi->mdio_config_id); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } /* * Return the last N characters of the serial label. The Linux and * ataraid(7) code always uses the last 16 characters of the label to * store into the Intel meta format. Generalize this to N characters * since that's easy. Labels can be up to 20 characters for SATA drives * and up 251 characters for SAS drives. Since intel controllers don't * support SAS drives, just stick with the SATA limits for stack friendliness. */ static int g_raid_md_get_label(struct g_consumer *cp, char *serial, int serlen) { char serial_buffer[24]; int len, error; len = sizeof(serial_buffer); error = g_io_getattr("GEOM::ident", cp, &len, serial_buffer); if (error != 0) return (error); len = strlen(serial_buffer); if (len > serlen) len -= serlen; else len = 0; strncpy(serial, serial_buffer + len, serlen); return (0); } static int g_raid_md_taste_intel(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_intel_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct intel_raid_conf *meta; struct g_raid_md_intel_perdisk *pd; struct g_geom *geom; int error, disk_pos, result, spare, len; char serial[INTEL_SERIAL_LEN]; char name[16]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting Intel on %s", cp->provider->name); mdi = (struct g_raid_md_intel_object *)md; pp = cp->provider; /* Read metadata from device. */ meta = NULL; disk_pos = 0; g_topology_unlock(); error = g_raid_md_get_label(cp, serial, sizeof(serial)); if (error != 0) { G_RAID_DEBUG(1, "Cannot get serial number from %s (error=%d).", pp->name, error); goto fail2; } vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = intel_meta_read(cp); g_topology_lock(); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor != 0x8086) { G_RAID_DEBUG(1, "Intel vendor mismatch 0x%04x != 0x8086", vendor); } else { G_RAID_DEBUG(1, "No Intel metadata, forcing spare."); spare = 2; goto search; } } return (G_RAID_MD_TASTE_FAIL); } /* Check this disk position in obtained metadata. */ disk_pos = intel_meta_find_disk(meta, serial); if (disk_pos < 0) { G_RAID_DEBUG(1, "Intel serial '%s' not found", serial); goto fail1; } if (intel_get_disk_sectors(&meta->disk[disk_pos]) != (pp->mediasize / pp->sectorsize)) { G_RAID_DEBUG(1, "Intel size mismatch %ju != %ju", intel_get_disk_sectors(&meta->disk[disk_pos]), (off_t)(pp->mediasize / pp->sectorsize)); goto fail1; } G_RAID_DEBUG(1, "Intel disk position %d", disk_pos); spare = meta->disk[disk_pos].flags & INTEL_F_SPARE; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_intel_object *)sc->sc_md; if (spare) { if (mdi1->mdio_incomplete) break; } else { if (mdi1->mdio_config_id == meta->config_id) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; mdi->mdio_config_id = meta->config_id; mdi->mdio_orig_config_id = meta->orig_config_id; snprintf(name, sizeof(name), "Intel-%08x", meta->config_id); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_intel_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-Intel"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } /* There is no return after this point, so we close passed consumer. */ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_meta = meta; pd->pd_disk_pos = -1; if (spare == 2) { memcpy(&pd->pd_disk_meta.serial[0], serial, INTEL_SERIAL_LEN); intel_set_disk_sectors(&pd->pd_disk_meta, pp->mediasize / pp->sectorsize); pd->pd_disk_meta.id = 0; pd->pd_disk_meta.flags = INTEL_F_SPARE; } else { pd->pd_disk_meta = meta->disk[disk_pos]; } disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_intel_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail2: g_topology_lock(); fail1: free(meta, M_MD_INTEL); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_intel(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) g_raid_md_intel_start(sc); return (0); } return (-1); } pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } /* Write updated metadata to all disks. */ g_raid_md_write_intel(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_intel_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_intel(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol, *vol1; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_pervolume *pv; struct g_raid_md_intel_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16], serial[INTEL_SERIAL_LEN]; const char *nodename, *verb, *volname, *levelname, *diskname; char *tmp; int *nargs, *force; off_t off, size, sectorsize, strip, disk_sectors; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_intel_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) { strcpy(&pd->pd_disk_meta.serial[0], "NONE"); pd->pd_disk_meta.id = 0xffffffff; pd->pd_disk_meta.flags = INTEL_F_ASSIGNED; continue; } cp->private = disk; g_topology_unlock(); error = g_raid_md_get_label(cp, &pd->pd_disk_meta.serial[0], INTEL_SERIAL_LEN); if (error != 0) { gctl_error(req, "Can't get serial for provider '%s'.", diskname); error = -8; break; } g_raid_get_disk_info(disk); intel_set_disk_sectors(&pd->pd_disk_meta, pp->mediasize / pp->sectorsize); if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; pd->pd_disk_meta.id = 0; pd->pd_disk_meta.flags = INTEL_F_ASSIGNED | INTEL_F_ONLINE; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve some space for metadata. */ size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize; /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } /* We have all we need, create things: volume, ... */ mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO); pv->pv_volume_pos = 0; vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); if (level == G_RAID_VOLUME_RL_RAID5) g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); else g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_intel(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_intel_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "add") == 0) { if (*nargs != 3) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } /* Look for existing volumes. */ i = 0; vol1 = NULL; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { vol1 = vol; i++; } if (i > 1) { gctl_error(req, "Maximum two volumes supported."); return (-6); } if (vol1 == NULL) { gctl_error(req, "At least one volume must exist."); return (-7); } numdisks = vol1->v_disks_count; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_intel_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Collect info about present disks. */ size = 0x7fffffffffffffffllu; sectorsize = 512; for (i = 0; i < numdisks; i++) { disk = vol1->v_subdisks[i].sd_disk; pd = (struct g_raid_md_intel_perdisk *) disk->d_md_data; disk_sectors = intel_get_disk_sectors(&pd->pd_disk_meta); if (disk_sectors * 512 < size) size = disk_sectors * 512; if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && disk->d_consumer->provider->sectorsize > sectorsize) { sectorsize = disk->d_consumer->provider->sectorsize; } } /* Reserve some space for metadata. */ size -= ((4096 + sectorsize - 1) / sectorsize) * sectorsize; /* Decide insert before or after. */ sd = &vol1->v_subdisks[0]; if (sd->sd_offset > size - (sd->sd_offset + sd->sd_size)) { off = 0; size = sd->sd_offset; } else { off = sd->sd_offset + sd->sd_size; size = size - (sd->sd_offset + sd->sd_size); } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round offset up to strip. */ if (off % strip != 0) { size -= strip - off % strip; off += strip - off % strip; } /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ vol = g_raid_create_volume(sc, volname, -1); pv = malloc(sizeof(*pv), M_MD_INTEL, M_WAITOK | M_ZERO); pv->pv_volume_pos = i; vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ for (i = 0; i < numdisks; i++) { disk = vol1->v_subdisks[i].sd_disk; sd = &vol->v_subdisks[i]; sd->sd_disk = disk; sd->sd_offset = off; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (disk->d_state == G_RAID_DISK_S_ACTIVE) { if (level == G_RAID_VOLUME_RL_RAID5) g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_UNINITIALIZED); else g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } } /* Write metadata based on created entities. */ g_raid_md_write_intel(md, NULL, NULL, NULL); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { nodename = gctl_get_asciiparam(req, "arg0"); if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0) nodename = NULL; /* Full node destruction. */ if (*nargs == 1 && nodename != NULL) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) intel_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } /* Destroy specified volume. If it was last - all node. */ if (*nargs > 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, nodename != NULL ? "arg1" : "arg0"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } /* Search for volume. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, volname) == 0) break; pp = vol->v_provider; if (pp == NULL) continue; if (strcmp(pp->name, volname) == 0) break; if (strncmp(pp->name, "raid/", 5) == 0 && strcmp(pp->name + 5, volname) == 0) break; } if (vol == NULL) { i = strtol(volname, &tmp, 10); if (verb != volname && tmp[0] == 0) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_global_id == i) break; } } } if (vol == NULL) { gctl_error(req, "Volume '%s' not found.", volname); return (-3); } /* Check if volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && vol->v_provider_open != 0) { gctl_error(req, "Volume is still open."); return (-4); } /* Destroy volume and potentially node. */ i = 0; TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) i++; if (i >= 2) { g_raid_destroy_volume(vol); g_raid_md_write_intel(md, NULL, NULL, NULL); } else { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) intel_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); } return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_intel(md, NULL, disk); continue; } pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ intel_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_intel(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_intel_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; g_topology_unlock(); /* Read disk serial. */ error = g_raid_md_get_label(cp, &serial[0], INTEL_SERIAL_LEN); if (error != 0) { gctl_error(req, "Can't get serial for provider '%s'.", diskname); g_raid_kill_consumer(sc, cp); error = -7; break; } pd = malloc(sizeof(*pd), M_MD_INTEL, M_WAITOK | M_ZERO); pd->pd_disk_pos = -1; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_raid_get_disk_info(disk); memcpy(&pd->pd_disk_meta.serial[0], &serial[0], INTEL_SERIAL_LEN); intel_set_disk_sectors(&pd->pd_disk_meta, pp->mediasize / pp->sectorsize); pd->pd_disk_meta.id = 0; pd->pd_disk_meta.flags = INTEL_F_SPARE; /* Welcome the "new" disk. */ update += g_raid_md_intel_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_SPARE) { intel_meta_write_spare(cp, &pd->pd_disk_meta); g_raid_destroy_disk(disk); } else if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. */ if (update) g_raid_md_write_intel(md, NULL, NULL, NULL); return (error); } return (-100); } static int g_raid_md_write_intel(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_pervolume *pv; struct g_raid_md_intel_perdisk *pd; struct intel_raid_conf *meta; struct intel_raid_vol *mvol; struct intel_raid_map *mmap0, *mmap1; off_t sectorsize = 512, pos; const char *version, *cv; int vi, sdi, numdisks, len, state, stale; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* Bump generation. Newly written metadata may differ from previous. */ mdi->mdio_generation++; /* Count number of disks. */ numdisks = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_disk_pos < 0) continue; numdisks++; if (disk->d_state == G_RAID_DISK_S_ACTIVE) { pd->pd_disk_meta.flags = INTEL_F_ONLINE | INTEL_F_ASSIGNED; } else if (disk->d_state == G_RAID_DISK_S_FAILED) { pd->pd_disk_meta.flags = INTEL_F_FAILED | INTEL_F_ASSIGNED; } else if (disk->d_state == G_RAID_DISK_S_DISABLED) { pd->pd_disk_meta.flags = INTEL_F_FAILED | INTEL_F_ASSIGNED | INTEL_F_DISABLED; } else { if (!(pd->pd_disk_meta.flags & INTEL_F_DISABLED)) pd->pd_disk_meta.flags = INTEL_F_ASSIGNED; if (pd->pd_disk_meta.id != 0xffffffff) { pd->pd_disk_meta.id = 0xffffffff; len = strlen(pd->pd_disk_meta.serial); len = min(len, INTEL_SERIAL_LEN - 3); strcpy(pd->pd_disk_meta.serial + len, ":0"); } } } /* Fill anchor and disks. */ meta = malloc(INTEL_MAX_MD_SIZE(numdisks), M_MD_INTEL, M_WAITOK | M_ZERO); memcpy(&meta->intel_id[0], INTEL_MAGIC, sizeof(INTEL_MAGIC) - 1); meta->config_size = INTEL_MAX_MD_SIZE(numdisks); meta->config_id = mdi->mdio_config_id; meta->orig_config_id = mdi->mdio_orig_config_id; meta->generation = mdi->mdio_generation; meta->attributes = INTEL_ATTR_CHECKSUM; meta->total_disks = numdisks; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_disk_pos < 0) continue; meta->disk[pd->pd_disk_pos] = pd->pd_disk_meta; if (pd->pd_disk_meta.sectors_hi != 0) meta->attributes |= INTEL_ATTR_2TB_DISK; } /* Fill volumes and maps. */ vi = 0; version = INTEL_VERSION_1000; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (vol->v_stopping) continue; mvol = intel_get_volume(meta, vi); /* New metadata may have different volumes order. */ pv->pv_volume_pos = vi; for (sdi = 0; sdi < vol->v_disks_count; sdi++) { sd = &vol->v_subdisks[sdi]; if (sd->sd_disk != NULL) break; } if (sdi >= vol->v_disks_count) panic("No any filled subdisk in volume"); if (vol->v_mediasize >= 0x20000000000llu) meta->attributes |= INTEL_ATTR_2TB; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) meta->attributes |= INTEL_ATTR_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->attributes |= INTEL_ATTR_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) meta->attributes |= INTEL_ATTR_RAID5; else if ((vol->v_disks_count & 1) == 0) meta->attributes |= INTEL_ATTR_RAID10; else meta->attributes |= INTEL_ATTR_RAID1E; if (pv->pv_cng) meta->attributes |= INTEL_ATTR_RAIDCNG; if (vol->v_strip_size > 131072) meta->attributes |= INTEL_ATTR_EXT_STRIP; if (pv->pv_cng) cv = INTEL_VERSION_1206; else if (vol->v_disks_count > 4) cv = INTEL_VERSION_1204; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) cv = INTEL_VERSION_1202; else if (vol->v_disks_count > 2) cv = INTEL_VERSION_1201; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) cv = INTEL_VERSION_1100; else cv = INTEL_VERSION_1000; if (strcmp(cv, version) > 0) version = cv; strlcpy(&mvol->name[0], vol->v_name, sizeof(mvol->name)); mvol->total_sectors = vol->v_mediasize / sectorsize; mvol->state = (INTEL_ST_READ_COALESCING | INTEL_ST_WRITE_COALESCING); mvol->tid = vol->v_global_id + 1; if (pv->pv_cng) { mvol->state |= INTEL_ST_CLONE_N_GO; if (pv->pv_cng_man_sync) mvol->state |= INTEL_ST_CLONE_MAN_SYNC; mvol->cng_master_disk = pv->pv_cng_master_disk; if (vol->v_subdisks[pv->pv_cng_master_disk].sd_state == G_RAID_SUBDISK_S_NONE) mvol->cng_state = INTEL_CNGST_MASTER_MISSING; else if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL) mvol->cng_state = INTEL_CNGST_NEEDS_UPDATE; else mvol->cng_state = INTEL_CNGST_UPDATED; } /* Check for any recovery in progress. */ state = G_RAID_SUBDISK_S_ACTIVE; pos = 0x7fffffffffffffffllu; stale = 0; for (sdi = 0; sdi < vol->v_disks_count; sdi++) { sd = &vol->v_subdisks[sdi]; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) state = G_RAID_SUBDISK_S_REBUILD; else if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC && state != G_RAID_SUBDISK_S_REBUILD) state = G_RAID_SUBDISK_S_RESYNC; else if (sd->sd_state == G_RAID_SUBDISK_S_STALE) stale = 1; if ((sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) && sd->sd_rebuild_pos < pos) pos = sd->sd_rebuild_pos; } if (state == G_RAID_SUBDISK_S_REBUILD) { mvol->migr_state = 1; mvol->migr_type = INTEL_MT_REBUILD; } else if (state == G_RAID_SUBDISK_S_RESYNC) { mvol->migr_state = 1; /* mvol->migr_type = INTEL_MT_REPAIR; */ mvol->migr_type = INTEL_MT_VERIFY; mvol->state |= INTEL_ST_VERIFY_AND_FIX; } else mvol->migr_state = 0; mvol->dirty = (vol->v_dirty || stale); mmap0 = intel_get_map(mvol, 0); /* Write map / common part of two maps. */ intel_set_map_offset(mmap0, sd->sd_offset / sectorsize); intel_set_map_disk_sectors(mmap0, sd->sd_size / sectorsize); mmap0->strip_sectors = vol->v_strip_size / sectorsize; if (vol->v_state == G_RAID_VOLUME_S_BROKEN) mmap0->status = INTEL_S_FAILURE; else if (vol->v_state == G_RAID_VOLUME_S_DEGRADED) mmap0->status = INTEL_S_DEGRADED; else if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) == g_raid_nsubdisks(vol, -1)) mmap0->status = INTEL_S_UNINITIALIZED; else mmap0->status = INTEL_S_READY; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) mmap0->type = INTEL_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) mmap0->type = INTEL_T_RAID1; else mmap0->type = INTEL_T_RAID5; mmap0->total_disks = vol->v_disks_count; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) mmap0->total_domains = vol->v_disks_count; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) mmap0->total_domains = 2; else mmap0->total_domains = 1; intel_set_map_stripe_count(mmap0, sd->sd_size / vol->v_strip_size / mmap0->total_domains); mmap0->failed_disk_num = 0xff; mmap0->ddf = 1; /* If there are two maps - copy common and update. */ if (mvol->migr_state) { intel_set_vol_curr_migr_unit(mvol, pos / vol->v_strip_size / mmap0->total_domains); mmap1 = intel_get_map(mvol, 1); memcpy(mmap1, mmap0, sizeof(struct intel_raid_map)); mmap0->status = INTEL_S_READY; } else mmap1 = NULL; /* Write disk indexes and put rebuild flags. */ for (sdi = 0; sdi < vol->v_disks_count; sdi++) { sd = &vol->v_subdisks[sdi]; pd = (struct g_raid_md_intel_perdisk *) sd->sd_disk->d_md_data; mmap0->disk_idx[sdi] = pd->pd_disk_pos; if (mvol->migr_state) mmap1->disk_idx[sdi] = pd->pd_disk_pos; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { mmap1->disk_idx[sdi] |= INTEL_DI_RBLD; } else if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE && sd->sd_state != G_RAID_SUBDISK_S_STALE && sd->sd_state != G_RAID_SUBDISK_S_UNINITIALIZED) { mmap0->disk_idx[sdi] |= INTEL_DI_RBLD; if (mvol->migr_state) mmap1->disk_idx[sdi] |= INTEL_DI_RBLD; } if ((sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED || sd->sd_state == G_RAID_SUBDISK_S_REBUILD) && mmap0->failed_disk_num == 0xff) { mmap0->failed_disk_num = sdi; if (mvol->migr_state) mmap1->failed_disk_num = sdi; } } vi++; } meta->total_volumes = vi; if (vi > 1 || meta->attributes & (INTEL_ATTR_EXT_STRIP | INTEL_ATTR_2TB_DISK | INTEL_ATTR_2TB)) version = INTEL_VERSION_1300; if (strcmp(version, INTEL_VERSION_1300) < 0) meta->attributes &= INTEL_ATTR_CHECKSUM; memcpy(&meta->version[0], version, sizeof(INTEL_VERSION_1000) - 1); /* We are done. Print meta data and store them to disks. */ g_raid_md_intel_print(meta); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_INTEL); mdi->mdio_meta = meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_INTEL); pd->pd_meta = NULL; } pd->pd_meta = intel_meta_copy(meta); intel_meta_write(disk->d_consumer, meta); } return (0); } static int g_raid_md_fail_disk_intel(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_intel_object *mdi; struct g_raid_md_intel_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; mdi = (struct g_raid_md_intel_object *)md; pd = (struct g_raid_md_intel_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (pd->pd_disk_pos < 0) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent it's later resurrection as STALE. */ mdi->mdio_meta->disk[pd->pd_disk_pos].flags = INTEL_F_FAILED; pd->pd_disk_meta.flags = INTEL_F_FAILED; g_raid_md_intel_print(mdi->mdio_meta); if (tdisk->d_consumer) intel_meta_write(tdisk->d_consumer, mdi->mdio_meta); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_intel(md, NULL, NULL, tdisk); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_intel_refill(sc); return (0); } static int g_raid_md_free_disk_intel(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_intel_perdisk *pd; pd = (struct g_raid_md_intel_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_INTEL); pd->pd_meta = NULL; } free(pd, M_MD_INTEL); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_volume_intel(struct g_raid_md_object *md, struct g_raid_volume *vol) { struct g_raid_md_intel_pervolume *pv; pv = (struct g_raid_md_intel_pervolume *)vol->v_md_data; free(pv, M_MD_INTEL); vol->v_md_data = NULL; return (0); } static int g_raid_md_free_intel(struct g_raid_md_object *md) { struct g_raid_md_intel_object *mdi; mdi = (struct g_raid_md_intel_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_INTEL); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(intel, "Intel"); Index: head/sys/geom/raid/md_jmicron.c =================================================================== --- head/sys/geom/raid/md_jmicron.c (revision 308456) +++ head/sys/geom/raid/md_jmicron.c (revision 308457) @@ -1,1563 +1,1563 @@ /*- * Copyright (c) 2010 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_JMICRON, "md_jmicron_data", "GEOM_RAID JMicron metadata"); #define JMICRON_MAX_DISKS 8 #define JMICRON_MAX_SPARE 2 struct jmicron_raid_conf { u_int8_t signature[2]; #define JMICRON_MAGIC "JM" u_int16_t version; #define JMICRON_VERSION 0x0001 u_int16_t checksum; u_int8_t filler_1[10]; u_int32_t disk_id; u_int32_t offset; u_int32_t disk_sectors_high; u_int16_t disk_sectors_low; u_int8_t filler_2[2]; u_int8_t name[16]; u_int8_t type; #define JMICRON_T_RAID0 0 #define JMICRON_T_RAID1 1 #define JMICRON_T_RAID01 2 #define JMICRON_T_CONCAT 3 #define JMICRON_T_RAID5 5 u_int8_t stripe_shift; u_int16_t flags; #define JMICRON_F_READY 0x0001 #define JMICRON_F_BOOTABLE 0x0002 #define JMICRON_F_BADSEC 0x0004 #define JMICRON_F_ACTIVE 0x0010 #define JMICRON_F_UNSYNC 0x0020 #define JMICRON_F_NEWEST 0x0040 u_int8_t filler_3[4]; u_int32_t spare[JMICRON_MAX_SPARE]; u_int32_t disks[JMICRON_MAX_DISKS]; #define JMICRON_DISK_MASK 0xFFFFFFF0 #define JMICRON_SEG_MASK 0x0000000F u_int8_t filler_4[32]; u_int8_t filler_5[384]; }; struct g_raid_md_jmicron_perdisk { struct jmicron_raid_conf *pd_meta; int pd_disk_pos; int pd_disk_id; off_t pd_disk_size; }; struct g_raid_md_jmicron_object { struct g_raid_md_object mdio_base; uint32_t mdio_config_id; struct jmicron_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_total_disks; int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. */ }; static g_raid_md_create_t g_raid_md_create_jmicron; static g_raid_md_taste_t g_raid_md_taste_jmicron; static g_raid_md_event_t g_raid_md_event_jmicron; static g_raid_md_ctl_t g_raid_md_ctl_jmicron; static g_raid_md_write_t g_raid_md_write_jmicron; static g_raid_md_fail_disk_t g_raid_md_fail_disk_jmicron; static g_raid_md_free_disk_t g_raid_md_free_disk_jmicron; static g_raid_md_free_t g_raid_md_free_jmicron; static kobj_method_t g_raid_md_jmicron_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_jmicron), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_jmicron), KOBJMETHOD(g_raid_md_event, g_raid_md_event_jmicron), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_jmicron), KOBJMETHOD(g_raid_md_write, g_raid_md_write_jmicron), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_jmicron), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_jmicron), KOBJMETHOD(g_raid_md_free, g_raid_md_free_jmicron), { 0, 0 } }; static struct g_raid_md_class g_raid_md_jmicron_class = { "JMicron", g_raid_md_jmicron_methods, sizeof(struct g_raid_md_jmicron_object), .mdc_enable = 1, .mdc_priority = 100 }; static void g_raid_md_jmicron_print(struct jmicron_raid_conf *meta) { int k; if (g_raid_debug < 1) return; printf("********* ATA JMicron RAID Metadata *********\n"); printf("signature <%c%c>\n", meta->signature[0], meta->signature[1]); printf("version %04x\n", meta->version); printf("checksum 0x%04x\n", meta->checksum); printf("disk_id 0x%08x\n", meta->disk_id); printf("offset 0x%08x\n", meta->offset); printf("disk_sectors_high 0x%08x\n", meta->disk_sectors_high); printf("disk_sectors_low 0x%04x\n", meta->disk_sectors_low); printf("name <%.16s>\n", meta->name); printf("type %d\n", meta->type); printf("stripe_shift %d\n", meta->stripe_shift); printf("flags %04x\n", meta->flags); printf("spare "); for (k = 0; k < JMICRON_MAX_SPARE; k++) printf(" 0x%08x", meta->spare[k]); printf("\n"); printf("disks "); for (k = 0; k < JMICRON_MAX_DISKS; k++) printf(" 0x%08x", meta->disks[k]); printf("\n"); printf("=================================================\n"); } static struct jmicron_raid_conf * jmicron_meta_copy(struct jmicron_raid_conf *meta) { struct jmicron_raid_conf *nmeta; nmeta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK); memcpy(nmeta, meta, sizeof(*meta)); return (nmeta); } static int jmicron_meta_total_disks(struct jmicron_raid_conf *meta) { int pos; for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) { if (meta->disks[pos] == 0) break; } return (pos); } static int jmicron_meta_total_spare(struct jmicron_raid_conf *meta) { int pos, n; n = 0; for (pos = 0; pos < JMICRON_MAX_SPARE; pos++) { if (meta->spare[pos] != 0) n++; } return (n); } /* * Generate fake Configuration ID based on disk IDs. * Note: it will change after each disk set change. */ static uint32_t jmicron_meta_config_id(struct jmicron_raid_conf *meta) { int pos; uint32_t config_id; config_id = 0; for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) config_id += meta->disks[pos] << pos; return (config_id); } static void jmicron_meta_get_name(struct jmicron_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 16); buf[16] = 0; for (i = 15; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void jmicron_meta_put_name(struct jmicron_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 16); memcpy(meta->name, buf, MIN(strlen(buf), 16)); } static int jmicron_meta_find_disk(struct jmicron_raid_conf *meta, uint32_t id) { int pos; id &= JMICRON_DISK_MASK; for (pos = 0; pos < JMICRON_MAX_DISKS; pos++) { if ((meta->disks[pos] & JMICRON_DISK_MASK) == id) return (pos); } for (pos = 0; pos < JMICRON_MAX_SPARE; pos++) { if ((meta->spare[pos] & JMICRON_DISK_MASK) == id) return (-3); } return (-1); } static struct jmicron_raid_conf * jmicron_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct jmicron_raid_conf *meta; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct jmicron_raid_conf *)buf; /* Check if this is an JMicron RAID struct */ if (strncmp(meta->signature, JMICRON_MAGIC, strlen(JMICRON_MAGIC))) { G_RAID_DEBUG(1, "JMicron signature check failed on %s", pp->name); g_free(buf); return (NULL); } meta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK); memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); g_free(buf); /* Check metadata checksum. */ for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 64; i++) checksum += *ptr++; if (checksum != 0) { G_RAID_DEBUG(1, "JMicron checksum check failed on %s", pp->name); free(meta, M_MD_JMICRON); return (NULL); } return (meta); } static int jmicron_meta_write(struct g_consumer *cp, struct jmicron_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 64; i++) checksum += *ptr++; meta->checksum -= checksum; /* Create and fill buffer. */ buf = malloc(pp->sectorsize, M_MD_JMICRON, M_WAITOK | M_ZERO); memcpy(buf, meta, sizeof(*meta)); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_JMICRON); return (error); } static int jmicron_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_JMICRON, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_JMICRON); return (error); } static struct g_raid_disk * g_raid_md_jmicron_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_jmicron_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_jmicron_supported(int level, int qual, int disks, int force) { if (disks > 8) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (!force && (disks != 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); if (!force) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA) return (0); if (!force) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_jmicron_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd, *oldpd; struct jmicron_raid_conf *meta; int disk_pos, resurrection = 0; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; olddisk = NULL; - /* Find disk position in metadata by it's serial. */ + /* Find disk position in metadata by its serial. */ if (pd->pd_meta != NULL) disk_pos = jmicron_meta_find_disk(meta, pd->pd_disk_id); else disk_pos = -1; if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. */ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { if (sd->sd_offset + sd->sd_size + 512 > pd->pd_disk_size) { G_RAID_DEBUG1(1, sc, "Disk too small (%ju < %ju)", pd->pd_disk_size, sd->sd_offset + sd->sd_size + 512); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: if (disk_pos == -3 || pd->pd_disk_pos == -3) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } } oldpd = (struct g_raid_md_jmicron_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_jmicron_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More than one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_jmicron_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* Update global metadata just in case. */ meta->disks[disk_pos] = pd->pd_disk_id; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); } /* Welcome the new disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { /* * Different disks may have different sizes/offsets, * especially in concat mode. Update. */ if (!resurrection) { sd->sd_offset = (off_t)pd->pd_meta->offset * 16 * 512; //ZZZ sd->sd_size = (((off_t)pd->pd_meta->disk_sectors_high << 16) + pd->pd_meta->disk_sectors_low) * 512; } if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if ((meta->flags & JMICRON_F_BADSEC) != 0 && (pd->pd_meta->flags & JMICRON_F_BADSEC) == 0) { /* Cold-inserted or rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (pd->pd_meta->flags & JMICRON_F_UNSYNC) { /* Dirty or resyncing disk.. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. */ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); } return (resurrection); } static void g_disk_md_jmicron_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_JMICRON); } static void g_raid_md_jmicron_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; update = 0; do { /* Make sure we miss anything. */ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); if (na == mdi->mdio_total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, mdi->mdio_total_disks); /* Try to get use some of STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_jmicron_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } if (disk != NULL) continue; /* Try to get use some of SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_jmicron_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); /* Request retaste hoping to find spare. */ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_JMICRON, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_jmicron_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_jmicron_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; struct jmicron_raid_conf *meta; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; off_t size; int j, disk_pos; char buf[17]; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. */ jmicron_meta_get_name(meta, buf); vol = g_raid_create_volume(sc, buf, -1); size = ((off_t)meta->disk_sectors_high << 16) + meta->disk_sectors_low; size *= 512; //ZZZ vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == JMICRON_T_RAID0) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; vol->v_mediasize = size * mdi->mdio_total_disks; } else if (meta->type == JMICRON_T_RAID1) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; vol->v_mediasize = size; } else if (meta->type == JMICRON_T_RAID01) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; vol->v_mediasize = size * mdi->mdio_total_disks / 2; } else if (meta->type == JMICRON_T_CONCAT) { if (mdi->mdio_total_disks == 1) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; vol->v_mediasize = 0; } else if (meta->type == JMICRON_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; vol->v_mediasize = size * (mdi->mdio_total_disks - 1); } else { vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_mediasize = 0; } vol->v_strip_size = 1024 << meta->stripe_shift; //ZZZ vol->v_disks_count = mdi->mdio_total_disks; vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = (off_t)meta->offset * 16 * 512; //ZZZ sd->sd_size = size; } g_raid_start_volume(vol); /* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; pd->pd_disk_id = meta->disks[disk_pos]; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; sd = &vol->v_subdisks[disk_pos]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } /* Make all disks found till the moment take their places. */ do { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_NONE) { g_raid_md_jmicron_start_disk(disk); break; } } } while (disk != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_jmicron_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_jmicron_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; struct jmicron_raid_conf *pdmeta; struct g_raid_md_jmicron_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_jmicron_start_disk(disk)) g_raid_md_write_jmicron(md, NULL, NULL, NULL); } else { /* * If we haven't started yet - update common metadata * to get subdisks details, avoiding data from spare disks. */ if (mdi->mdio_meta == NULL || jmicron_meta_find_disk(mdi->mdio_meta, mdi->mdio_meta->disk_id) == -3) { if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_JMICRON); mdi->mdio_meta = jmicron_meta_copy(pdmeta); mdi->mdio_total_disks = jmicron_meta_total_disks(pdmeta); } mdi->mdio_meta->flags |= pdmeta->flags & JMICRON_F_BADSEC; mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d+%d up)", mdi->mdio_disks_present, mdi->mdio_total_disks, jmicron_meta_total_spare(mdi->mdio_meta)); /* If we collected all needed disks - start array. */ if (mdi->mdio_disks_present == mdi->mdio_total_disks + jmicron_meta_total_spare(mdi->mdio_meta)) g_raid_md_jmicron_start(sc); } } static void g_raid_jmicron_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_jmicron_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_jmicron_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_jmicron(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_jmicron_object *mdi; char name[16]; mdi = (struct g_raid_md_jmicron_object *)md; mdi->mdio_config_id = arc4random(); snprintf(name, sizeof(name), "JMicron-%08x", mdi->mdio_config_id); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_jmicron(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_jmicron_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct jmicron_raid_conf *meta; struct g_raid_md_jmicron_perdisk *pd; struct g_geom *geom; int disk_pos, result, spare, len; char name[16]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting JMicron on %s", cp->provider->name); mdi = (struct g_raid_md_jmicron_object *)md; pp = cp->provider; /* Read metadata from device. */ meta = NULL; g_topology_unlock(); vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = jmicron_meta_read(cp); g_topology_lock(); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor == 0x197b) { G_RAID_DEBUG(1, "No JMicron metadata, forcing spare."); spare = 2; goto search; } else { G_RAID_DEBUG(1, "JMicron vendor mismatch 0x%04x != 0x197b", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Check this disk position in obtained metadata. */ disk_pos = jmicron_meta_find_disk(meta, meta->disk_id); if (disk_pos == -1) { G_RAID_DEBUG(1, "JMicron disk_id %08x not found", meta->disk_id); goto fail1; } /* Metadata valid. Print it. */ g_raid_md_jmicron_print(meta); G_RAID_DEBUG(1, "JMicron disk position %d", disk_pos); spare = (disk_pos == -2) ? 1 : 0; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_jmicron_object *)sc->sc_md; if (spare == 2) { if (mdi1->mdio_incomplete) break; } else { if (mdi1->mdio_config_id == jmicron_meta_config_id(meta)) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; mdi->mdio_config_id = jmicron_meta_config_id(meta); snprintf(name, sizeof(name), "JMicron-%08x", mdi->mdio_config_id); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_jmicron_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-JMicron"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } /* There is no return after this point, so we close passed consumer. */ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_meta = meta; if (spare == 2) { pd->pd_disk_pos = -3; pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; } else { pd->pd_disk_pos = -1; pd->pd_disk_id = meta->disk_id; } pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_jmicron_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail1: free(meta, M_MD_JMICRON); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_jmicron(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_jmicron_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) g_raid_md_jmicron_start(sc); return (0); } return (-1); } pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } /* Write updated metadata to all disks. */ g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_jmicron_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_jmicron(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *verb, *volname, *levelname, *diskname; int *nargs, *force; off_t size, sectorsize, strip; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_jmicron_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_jmicron_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) continue; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); pd->pd_disk_size = pp->mediasize; if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve space for metadata. */ size -= sectorsize; /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ mdi->mdio_total_disks = numdisks; mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = (void *)(intptr_t)0; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_jmicron_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) jmicron_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_jmicron(md, NULL, disk); continue; } pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ jmicron_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_jmicron(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_jmicron_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_JMICRON, M_WAITOK | M_ZERO); pd->pd_disk_pos = -3; pd->pd_disk_id = arc4random() & JMICRON_DISK_MASK; pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ update += g_raid_md_jmicron_start_disk(disk); if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. */ if (update) g_raid_md_write_jmicron(md, NULL, NULL, NULL); return (error); } gctl_error(req, "Command '%s' is not supported.", verb); return (-100); } static int g_raid_md_write_jmicron(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_jmicron_object *mdi; struct g_raid_md_jmicron_perdisk *pd; struct jmicron_raid_conf *meta; int i, spares; sc = md->mdo_softc; mdi = (struct g_raid_md_jmicron_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* There is only one volume. */ vol = TAILQ_FIRST(&sc->sc_volumes); /* Fill global fields. */ meta = malloc(sizeof(*meta), M_MD_JMICRON, M_WAITOK | M_ZERO); strncpy(meta->signature, JMICRON_MAGIC, 2); meta->version = JMICRON_VERSION; jmicron_meta_put_name(meta, vol->v_name); if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) meta->type = JMICRON_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->type = JMICRON_T_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->type = JMICRON_T_RAID01; else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) meta->type = JMICRON_T_CONCAT; else meta->type = JMICRON_T_RAID5; meta->stripe_shift = fls(vol->v_strip_size / 2048); meta->flags = JMICRON_F_READY | JMICRON_F_BOOTABLE; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk == NULL || sd->sd_disk->d_md_data == NULL) meta->disks[i] = 0xffffffff; else { pd = (struct g_raid_md_jmicron_perdisk *) sd->sd_disk->d_md_data; meta->disks[i] = pd->pd_disk_id; } if (sd->sd_state < G_RAID_SUBDISK_S_STALE) meta->flags |= JMICRON_F_BADSEC; if (vol->v_dirty) meta->flags |= JMICRON_F_UNSYNC; } /* Put spares to their slots. */ spares = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_SPARE) continue; meta->spare[spares] = pd->pd_disk_id; if (++spares >= 2) break; } /* We are done. Print meta data and store them to disks. */ if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_JMICRON); mdi->mdio_meta = meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_JMICRON); pd->pd_meta = NULL; } pd->pd_meta = jmicron_meta_copy(meta); pd->pd_meta->disk_id = pd->pd_disk_id; if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { pd->pd_meta->offset = (sd->sd_offset / 512) / 16; pd->pd_meta->disk_sectors_high = (sd->sd_size / 512) >> 16; pd->pd_meta->disk_sectors_low = (sd->sd_size / 512) & 0xffff; if (sd->sd_state < G_RAID_SUBDISK_S_STALE) pd->pd_meta->flags &= ~JMICRON_F_BADSEC; else if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) pd->pd_meta->flags |= JMICRON_F_UNSYNC; } G_RAID_DEBUG(1, "Writing JMicron metadata to %s", g_raid_get_diskname(disk)); g_raid_md_jmicron_print(pd->pd_meta); jmicron_meta_write(disk->d_consumer, pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_jmicron(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_jmicron_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; pd = (struct g_raid_md_jmicron_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (pd->pd_disk_pos < 0) return (-1); if (tdisk->d_consumer) jmicron_meta_erase(tdisk->d_consumer); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_jmicron(md, NULL, NULL, tdisk); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_jmicron_refill(sc); return (0); } static int g_raid_md_free_disk_jmicron(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_jmicron_perdisk *pd; pd = (struct g_raid_md_jmicron_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_JMICRON); pd->pd_meta = NULL; } free(pd, M_MD_JMICRON); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_jmicron(struct g_raid_md_object *md) { struct g_raid_md_jmicron_object *mdi; mdi = (struct g_raid_md_jmicron_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_JMICRON); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(jmicron, "JMicron"); Index: head/sys/geom/raid/md_nvidia.c =================================================================== --- head/sys/geom/raid/md_nvidia.c (revision 308456) +++ head/sys/geom/raid/md_nvidia.c (revision 308457) @@ -1,1583 +1,1583 @@ /*- * Copyright (c) 2011 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_NVIDIA, "md_nvidia_data", "GEOM_RAID NVIDIA metadata"); struct nvidia_raid_conf { uint8_t nvidia_id[8]; #define NVIDIA_MAGIC "NVIDIA " uint32_t config_size; uint32_t checksum; uint16_t version; uint8_t disk_number; uint8_t dummy_0; uint32_t total_sectors; uint32_t sector_size; uint8_t name[16]; uint8_t revision[4]; uint32_t disk_status; uint32_t magic_0; #define NVIDIA_MAGIC0 0x00640044 uint64_t volume_id[2]; uint8_t state; #define NVIDIA_S_IDLE 0 #define NVIDIA_S_INIT 2 #define NVIDIA_S_REBUILD 3 #define NVIDIA_S_UPGRADE 4 #define NVIDIA_S_SYNC 5 uint8_t array_width; uint8_t total_disks; uint8_t orig_array_width; uint16_t type; #define NVIDIA_T_RAID0 0x0080 #define NVIDIA_T_RAID1 0x0081 #define NVIDIA_T_RAID3 0x0083 #define NVIDIA_T_RAID5 0x0085 /* RLQ = 00/02? */ #define NVIDIA_T_RAID5_SYM 0x0095 /* RLQ = 03 */ #define NVIDIA_T_RAID10 0x008a #define NVIDIA_T_RAID01 0x8180 #define NVIDIA_T_CONCAT 0x00ff uint16_t dummy_3; uint32_t strip_sectors; uint32_t strip_bytes; uint32_t strip_shift; uint32_t strip_mask; uint32_t stripe_sectors; uint32_t stripe_bytes; uint32_t rebuild_lba; uint32_t orig_type; uint32_t orig_total_sectors; uint32_t status; #define NVIDIA_S_BOOTABLE 0x00000001 #define NVIDIA_S_DEGRADED 0x00000002 uint32_t filler[98]; } __packed; struct g_raid_md_nvidia_perdisk { struct nvidia_raid_conf *pd_meta; int pd_disk_pos; off_t pd_disk_size; }; struct g_raid_md_nvidia_object { struct g_raid_md_object mdio_base; uint64_t mdio_volume_id[2]; struct nvidia_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_total_disks; int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. */ }; static g_raid_md_create_t g_raid_md_create_nvidia; static g_raid_md_taste_t g_raid_md_taste_nvidia; static g_raid_md_event_t g_raid_md_event_nvidia; static g_raid_md_ctl_t g_raid_md_ctl_nvidia; static g_raid_md_write_t g_raid_md_write_nvidia; static g_raid_md_fail_disk_t g_raid_md_fail_disk_nvidia; static g_raid_md_free_disk_t g_raid_md_free_disk_nvidia; static g_raid_md_free_t g_raid_md_free_nvidia; static kobj_method_t g_raid_md_nvidia_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_nvidia), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_nvidia), KOBJMETHOD(g_raid_md_event, g_raid_md_event_nvidia), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_nvidia), KOBJMETHOD(g_raid_md_write, g_raid_md_write_nvidia), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_nvidia), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_nvidia), KOBJMETHOD(g_raid_md_free, g_raid_md_free_nvidia), { 0, 0 } }; static struct g_raid_md_class g_raid_md_nvidia_class = { "NVIDIA", g_raid_md_nvidia_methods, sizeof(struct g_raid_md_nvidia_object), .mdc_enable = 1, .mdc_priority = 100 }; static int NVIDIANodeID = 1; static void g_raid_md_nvidia_print(struct nvidia_raid_conf *meta) { if (g_raid_debug < 1) return; printf("********* ATA NVIDIA RAID Metadata *********\n"); printf("nvidia_id <%.8s>\n", meta->nvidia_id); printf("config_size %u\n", meta->config_size); printf("checksum 0x%08x\n", meta->checksum); printf("version 0x%04x\n", meta->version); printf("disk_number %d\n", meta->disk_number); printf("dummy_0 0x%02x\n", meta->dummy_0); printf("total_sectors %u\n", meta->total_sectors); printf("sector_size %u\n", meta->sector_size); printf("name <%.16s>\n", meta->name); printf("revision 0x%02x%02x%02x%02x\n", meta->revision[0], meta->revision[1], meta->revision[2], meta->revision[3]); printf("disk_status 0x%08x\n", meta->disk_status); printf("magic_0 0x%08x\n", meta->magic_0); printf("volume_id 0x%016jx%016jx\n", meta->volume_id[1], meta->volume_id[0]); printf("state 0x%02x\n", meta->state); printf("array_width %u\n", meta->array_width); printf("total_disks %u\n", meta->total_disks); printf("orig_array_width %u\n", meta->orig_array_width); printf("type 0x%04x\n", meta->type); printf("dummy_3 0x%04x\n", meta->dummy_3); printf("strip_sectors %u\n", meta->strip_sectors); printf("strip_bytes %u\n", meta->strip_bytes); printf("strip_shift %u\n", meta->strip_shift); printf("strip_mask 0x%08x\n", meta->strip_mask); printf("stripe_sectors %u\n", meta->stripe_sectors); printf("stripe_bytes %u\n", meta->stripe_bytes); printf("rebuild_lba %u\n", meta->rebuild_lba); printf("orig_type 0x%04x\n", meta->orig_type); printf("orig_total_sectors %u\n", meta->orig_total_sectors); printf("status 0x%08x\n", meta->status); printf("=================================================\n"); } static struct nvidia_raid_conf * nvidia_meta_copy(struct nvidia_raid_conf *meta) { struct nvidia_raid_conf *nmeta; nmeta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK); memcpy(nmeta, meta, sizeof(*meta)); return (nmeta); } static int nvidia_meta_translate_disk(struct nvidia_raid_conf *meta, int md_disk_pos) { int disk_pos; if (md_disk_pos >= 0 && meta->type == NVIDIA_T_RAID01) { disk_pos = (md_disk_pos / meta->array_width) + (md_disk_pos % meta->array_width) * meta->array_width; } else disk_pos = md_disk_pos; return (disk_pos); } static void nvidia_meta_get_name(struct nvidia_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 16); buf[16] = 0; for (i = 15; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void nvidia_meta_put_name(struct nvidia_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 16); memcpy(meta->name, buf, MIN(strlen(buf), 16)); } static struct nvidia_raid_conf * nvidia_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct nvidia_raid_conf *meta; char *buf; int error, i; uint32_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - 2 * pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct nvidia_raid_conf *)buf; /* Check if this is an NVIDIA RAID struct */ if (strncmp(meta->nvidia_id, NVIDIA_MAGIC, strlen(NVIDIA_MAGIC))) { G_RAID_DEBUG(1, "NVIDIA signature check failed on %s", pp->name); g_free(buf); return (NULL); } if (meta->config_size > 128 || meta->config_size < 30) { G_RAID_DEBUG(1, "NVIDIA metadata size looks wrong: %d", meta->config_size); g_free(buf); return (NULL); } meta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK); memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); g_free(buf); /* Check metadata checksum. */ for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < meta->config_size; i++) checksum += *ptr++; if (checksum != 0) { G_RAID_DEBUG(1, "NVIDIA checksum check failed on %s", pp->name); free(meta, M_MD_NVIDIA); return (NULL); } /* Check volume state. */ if (meta->state != NVIDIA_S_IDLE && meta->state != NVIDIA_S_INIT && meta->state != NVIDIA_S_REBUILD && meta->state != NVIDIA_S_SYNC) { G_RAID_DEBUG(1, "NVIDIA unknown state on %s (0x%02x)", pp->name, meta->state); free(meta, M_MD_NVIDIA); return (NULL); } /* Check raid type. */ if (meta->type != NVIDIA_T_RAID0 && meta->type != NVIDIA_T_RAID1 && meta->type != NVIDIA_T_RAID3 && meta->type != NVIDIA_T_RAID5 && meta->type != NVIDIA_T_RAID5_SYM && meta->type != NVIDIA_T_RAID01 && meta->type != NVIDIA_T_CONCAT) { G_RAID_DEBUG(1, "NVIDIA unknown RAID level on %s (0x%02x)", pp->name, meta->type); free(meta, M_MD_NVIDIA); return (NULL); } return (meta); } static int nvidia_meta_write(struct g_consumer *cp, struct nvidia_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i; uint32_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < meta->config_size; i++) checksum += *ptr++; meta->checksum -= checksum; /* Create and fill buffer. */ buf = malloc(pp->sectorsize, M_MD_NVIDIA, M_WAITOK | M_ZERO); memcpy(buf, meta, sizeof(*meta)); /* Write metadata. */ error = g_write_data(cp, pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_NVIDIA); return (error); } static int nvidia_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_NVIDIA, M_WAITOK | M_ZERO); error = g_write_data(cp, pp->mediasize - 2 * pp->sectorsize, buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } free(buf, M_MD_NVIDIA); return (error); } static struct g_raid_disk * g_raid_md_nvidia_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_nvidia_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_nvidia_supported(int level, int qual, int disks, int force) { switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (disks % 2 != 0) return (0); if (!force && (disks < 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA && qual != G_RAID_VOLUME_RLQ_R5LS) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_nvidia_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd, *oldpd; struct nvidia_raid_conf *meta; int disk_pos, resurrection = 0; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; olddisk = NULL; - /* Find disk position in metadata by it's serial. */ + /* Find disk position in metadata by its serial. */ if (pd->pd_meta != NULL) { disk_pos = pd->pd_meta->disk_number; if (disk_pos >= meta->total_disks || mdi->mdio_started) disk_pos = -3; } else disk_pos = -3; /* For RAID0+1 we need to translate order. */ disk_pos = nvidia_meta_translate_disk(meta, disk_pos); if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. */ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { if (sd->sd_offset + sd->sd_size + 2 * 512 > pd->pd_disk_size) { G_RAID_DEBUG1(1, sc, "Disk too small (%ju < %ju)", pd->pd_disk_size, sd->sd_offset + sd->sd_size + 512); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } oldpd = (struct g_raid_md_nvidia_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_nvidia_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More than one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_nvidia_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); } /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else// if (pd->pd_meta->disk_status == NVIDIA_S_CURRENT || //pd->pd_meta->disk_status == NVIDIA_S_REBUILD) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); // else // g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { /* * Different disks may have different sizes, * in concat mode. Update from real disk size. */ if (meta->type == NVIDIA_T_CONCAT) sd->sd_size = pd->pd_disk_size - 0x800 * 512; if (resurrection) { /* New or ex-spare disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (meta->state == NVIDIA_S_REBUILD && (pd->pd_meta->disk_status & 0x100)) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); sd->sd_rebuild_pos = (off_t)pd->pd_meta->rebuild_lba / meta->array_width * pd->pd_meta->sector_size; } else if (meta->state == NVIDIA_S_SYNC) { /* Resyncing/dirty disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_RESYNC); sd->sd_rebuild_pos = (off_t)pd->pd_meta->rebuild_lba / meta->array_width * pd->pd_meta->sector_size; } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. */ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); } return (resurrection); } static void g_disk_md_nvidia_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_NVIDIA); } static void g_raid_md_nvidia_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; update = 0; do { /* Make sure we miss anything. */ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); if (na == mdi->mdio_total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, mdi->mdio_total_disks); /* Try to get use some of STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_nvidia_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } if (disk != NULL) continue; /* Try to get use some of SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_nvidia_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); /* Request retaste hoping to find spare. */ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_NVIDIA, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_nvidia_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_nvidia_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; struct nvidia_raid_conf *meta; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; off_t size; int j, disk_pos; char buf[17]; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. */ nvidia_meta_get_name(meta, buf); vol = g_raid_create_volume(sc, buf, -1); vol->v_mediasize = (off_t)meta->total_sectors * 512; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == NVIDIA_T_RAID0) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; size = vol->v_mediasize / mdi->mdio_total_disks; } else if (meta->type == NVIDIA_T_RAID1) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; size = vol->v_mediasize; } else if (meta->type == NVIDIA_T_RAID01) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; size = vol->v_mediasize / (mdi->mdio_total_disks / 2); } else if (meta->type == NVIDIA_T_CONCAT) { if (mdi->mdio_total_disks == 1) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; size = 0; } else if (meta->type == NVIDIA_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; size = vol->v_mediasize / (mdi->mdio_total_disks - 1); } else if (meta->type == NVIDIA_T_RAID5_SYM) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LS; size = vol->v_mediasize / (mdi->mdio_total_disks - 1); } else { vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; size = 0; } vol->v_strip_size = meta->strip_sectors * 512; //ZZZ vol->v_disks_count = mdi->mdio_total_disks; vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = 0; sd->sd_size = size; } g_raid_start_volume(vol); /* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; sd = &vol->v_subdisks[disk_pos]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } /* Make all disks found till the moment take their places. */ do { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_NONE) { g_raid_md_nvidia_start_disk(disk); break; } } } while (disk != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_nvidia_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_nvidia_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; struct nvidia_raid_conf *pdmeta; struct g_raid_md_nvidia_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_nvidia_start_disk(disk)) g_raid_md_write_nvidia(md, NULL, NULL, NULL); } else { if (mdi->mdio_meta == NULL || mdi->mdio_meta->disk_number >= mdi->mdio_meta->total_disks) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_NVIDIA); mdi->mdio_meta = nvidia_meta_copy(pdmeta); mdi->mdio_total_disks = pdmeta->total_disks; mdi->mdio_disks_present = 1; } else if (pdmeta->disk_number < mdi->mdio_meta->total_disks) { mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", mdi->mdio_disks_present, mdi->mdio_total_disks); } else G_RAID_DEBUG1(1, sc, "Spare disk"); /* If we collected all needed disks - start array. */ if (mdi->mdio_disks_present == mdi->mdio_total_disks) g_raid_md_nvidia_start(sc); } } static void g_raid_nvidia_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_nvidia_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_nvidia_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_nvidia(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_nvidia_object *mdi; char name[32]; mdi = (struct g_raid_md_nvidia_object *)md; arc4rand(&mdi->mdio_volume_id, 16, 0); snprintf(name, sizeof(name), "NVIDIA-%d", atomic_fetchadd_int(&NVIDIANodeID, 1)); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_nvidia(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_nvidia_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct nvidia_raid_conf *meta; struct g_raid_md_nvidia_perdisk *pd; struct g_geom *geom; int result, spare, len; char name[32]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting NVIDIA on %s", cp->provider->name); mdi = (struct g_raid_md_nvidia_object *)md; pp = cp->provider; /* Read metadata from device. */ meta = NULL; g_topology_unlock(); vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = nvidia_meta_read(cp); g_topology_lock(); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor == 0x10de) { G_RAID_DEBUG(1, "No NVIDIA metadata, forcing spare."); spare = 2; goto search; } else { G_RAID_DEBUG(1, "NVIDIA vendor mismatch 0x%04x != 0x10de", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Metadata valid. Print it. */ g_raid_md_nvidia_print(meta); G_RAID_DEBUG(1, "NVIDIA disk position %d", meta->disk_number); spare = 0;//(meta->type == NVIDIA_T_SPARE) ? 1 : 0; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_nvidia_object *)sc->sc_md; if (spare) { if (mdi1->mdio_incomplete) break; } else { if (memcmp(&mdi1->mdio_volume_id, &meta->volume_id, 16) == 0) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; memcpy(&mdi->mdio_volume_id, &meta->volume_id, 16); snprintf(name, sizeof(name), "NVIDIA-%d", atomic_fetchadd_int(&NVIDIANodeID, 1)); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_nvidia_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-NVIDIA"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } /* There is no return after this point, so we close passed consumer. */ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_meta = meta; if (spare == 2) { pd->pd_disk_pos = -3; } else { pd->pd_disk_pos = -1; } pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_nvidia_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail1: free(meta, M_MD_NVIDIA); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_nvidia(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_nvidia_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) { /* Bump volume ID to drop missing disks. */ arc4rand(&mdi->mdio_volume_id, 16, 0); g_raid_md_nvidia_start(sc); } return (0); } return (-1); } pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } if (mdi->mdio_started) { /* Bump volume ID to prevent disk resurrection. */ if (pd->pd_disk_pos >= 0) arc4rand(&mdi->mdio_volume_id, 16, 0); /* Write updated metadata to all disks. */ g_raid_md_write_nvidia(md, NULL, NULL, NULL); } /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_nvidia_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_nvidia(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *verb, *volname, *levelname, *diskname; int *nargs, *force; off_t size, sectorsize, strip, volsize; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_nvidia_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LS"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_nvidia_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) continue; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); pd->pd_disk_size = pp->mediasize; if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve space for metadata. */ size -= 2 * sectorsize; /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) volsize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) volsize = size; else if (level == G_RAID_VOLUME_RL_RAID5) volsize = size * (numdisks - 1); else { /* RAID1E */ volsize = ((size * numdisks) / strip / 2) * strip; } if (volsize > 0xffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ mdi->mdio_total_disks = numdisks; mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = (void *)(intptr_t)0; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; vol->v_mediasize = volsize; vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_nvidia_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) nvidia_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_nvidia(md, NULL, disk); continue; } pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ nvidia_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_nvidia(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_nvidia_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_NVIDIA, M_WAITOK | M_ZERO); pd->pd_disk_pos = -3; pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ update += g_raid_md_nvidia_start_disk(disk); if (disk->d_state != G_RAID_DISK_S_SPARE && disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. */ if (update) g_raid_md_write_nvidia(md, NULL, NULL, NULL); return (error); } gctl_error(req, "Command '%s' is not supported.", verb); return (-100); } static int g_raid_md_write_nvidia(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_nvidia_object *mdi; struct g_raid_md_nvidia_perdisk *pd; struct nvidia_raid_conf *meta; int i, spares; sc = md->mdo_softc; mdi = (struct g_raid_md_nvidia_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* There is only one volume. */ vol = TAILQ_FIRST(&sc->sc_volumes); /* Fill global fields. */ meta = malloc(sizeof(*meta), M_MD_NVIDIA, M_WAITOK | M_ZERO); if (mdi->mdio_meta) memcpy(meta, mdi->mdio_meta, sizeof(*meta)); memcpy(meta->nvidia_id, NVIDIA_MAGIC, sizeof(NVIDIA_MAGIC) - 1); meta->config_size = 30; meta->version = 0x0064; meta->total_sectors = vol->v_mediasize / vol->v_sectorsize; meta->sector_size = vol->v_sectorsize; nvidia_meta_put_name(meta, vol->v_name); meta->magic_0 = NVIDIA_MAGIC0; memcpy(&meta->volume_id, &mdi->mdio_volume_id, 16); meta->state = NVIDIA_S_IDLE; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->array_width = 1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->array_width = vol->v_disks_count / 2; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) meta->array_width = vol->v_disks_count - 1; else meta->array_width = vol->v_disks_count; meta->total_disks = vol->v_disks_count; meta->orig_array_width = meta->array_width; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) meta->type = NVIDIA_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) meta->type = NVIDIA_T_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->type = NVIDIA_T_RAID01; else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) meta->type = NVIDIA_T_CONCAT; else if (vol->v_raid_level_qualifier == G_RAID_VOLUME_RLQ_R5LA) meta->type = NVIDIA_T_RAID5; else meta->type = NVIDIA_T_RAID5_SYM; meta->strip_sectors = vol->v_strip_size / vol->v_sectorsize; meta->strip_bytes = vol->v_strip_size; meta->strip_shift = ffs(meta->strip_sectors) - 1; meta->strip_mask = meta->strip_sectors - 1; meta->stripe_sectors = meta->strip_sectors * meta->orig_array_width; meta->stripe_bytes = meta->stripe_sectors * vol->v_sectorsize; meta->rebuild_lba = 0; meta->orig_type = meta->type; meta->orig_total_sectors = meta->total_sectors; meta->status = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if ((sd->sd_state == G_RAID_SUBDISK_S_STALE || sd->sd_state == G_RAID_SUBDISK_S_RESYNC || vol->v_dirty) && meta->state != NVIDIA_S_REBUILD) meta->state = NVIDIA_S_SYNC; else if (sd->sd_state == G_RAID_SUBDISK_S_NEW || sd->sd_state == G_RAID_SUBDISK_S_REBUILD) meta->state = NVIDIA_S_REBUILD; } /* We are done. Print meta data and store them to disks. */ if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_NVIDIA); mdi->mdio_meta = meta; spares = 0; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE && disk->d_state != G_RAID_DISK_S_SPARE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_NVIDIA); pd->pd_meta = NULL; } pd->pd_meta = nvidia_meta_copy(meta); if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { /* For RAID0+1 we need to translate order. */ pd->pd_meta->disk_number = nvidia_meta_translate_disk(meta, sd->sd_pos); if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE) { pd->pd_meta->disk_status = 0x100; pd->pd_meta->rebuild_lba = sd->sd_rebuild_pos / vol->v_sectorsize * meta->array_width; } } else pd->pd_meta->disk_number = meta->total_disks + spares++; G_RAID_DEBUG(1, "Writing NVIDIA metadata to %s", g_raid_get_diskname(disk)); g_raid_md_nvidia_print(pd->pd_meta); nvidia_meta_write(disk->d_consumer, pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_nvidia(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_nvidia_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; pd = (struct g_raid_md_nvidia_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (pd->pd_disk_pos < 0) return (-1); /* Erase metadata to prevent disks's later resurrection. */ if (tdisk->d_consumer) nvidia_meta_erase(tdisk->d_consumer); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_nvidia(md, NULL, NULL, tdisk); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_nvidia_refill(sc); return (0); } static int g_raid_md_free_disk_nvidia(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_nvidia_perdisk *pd; pd = (struct g_raid_md_nvidia_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_NVIDIA); pd->pd_meta = NULL; } free(pd, M_MD_NVIDIA); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_nvidia(struct g_raid_md_object *md) { struct g_raid_md_nvidia_object *mdi; mdi = (struct g_raid_md_nvidia_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_NVIDIA); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(nvidia, "NVIDIA"); Index: head/sys/geom/raid/md_promise.c =================================================================== --- head/sys/geom/raid/md_promise.c (revision 308456) +++ head/sys/geom/raid/md_promise.c (revision 308457) @@ -1,2001 +1,2001 @@ /*- * Copyright (c) 2011 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_PROMISE, "md_promise_data", "GEOM_RAID Promise metadata"); #define PROMISE_MAX_DISKS 8 #define PROMISE_MAX_SUBDISKS 2 #define PROMISE_META_OFFSET 14 struct promise_raid_disk { uint8_t flags; /* Subdisk status. */ #define PROMISE_F_VALID 0x01 #define PROMISE_F_ONLINE 0x02 #define PROMISE_F_ASSIGNED 0x04 #define PROMISE_F_SPARE 0x08 #define PROMISE_F_DUPLICATE 0x10 #define PROMISE_F_REDIR 0x20 #define PROMISE_F_DOWN 0x40 #define PROMISE_F_READY 0x80 uint8_t number; /* Position in a volume. */ uint8_t channel; /* ATA channel number. */ uint8_t device; /* ATA device number. */ uint64_t id __packed; /* Subdisk ID. */ } __packed; struct promise_raid_conf { char promise_id[24]; #define PROMISE_MAGIC "Promise Technology, Inc." #define FREEBSD_MAGIC "FreeBSD ATA driver RAID " uint32_t dummy_0; uint64_t magic_0; #define PROMISE_MAGIC0(x) (((uint64_t)(x.channel) << 48) | \ ((uint64_t)(x.device != 0) << 56)) uint16_t magic_1; uint32_t magic_2; uint8_t filler1[470]; uint32_t integrity; #define PROMISE_I_VALID 0x00000080 struct promise_raid_disk disk; /* This subdisk info. */ uint32_t disk_offset; /* Subdisk offset. */ uint32_t disk_sectors; /* Subdisk size */ uint32_t disk_rebuild; /* Rebuild position. */ uint16_t generation; /* Generation number. */ uint8_t status; /* Volume status. */ #define PROMISE_S_VALID 0x01 #define PROMISE_S_ONLINE 0x02 #define PROMISE_S_INITED 0x04 #define PROMISE_S_READY 0x08 #define PROMISE_S_DEGRADED 0x10 #define PROMISE_S_MARKED 0x20 #define PROMISE_S_MIGRATING 0x40 #define PROMISE_S_FUNCTIONAL 0x80 uint8_t type; /* Voluem type. */ #define PROMISE_T_RAID0 0x00 #define PROMISE_T_RAID1 0x01 #define PROMISE_T_RAID3 0x02 #define PROMISE_T_RAID5 0x04 #define PROMISE_T_SPAN 0x08 #define PROMISE_T_JBOD 0x10 uint8_t total_disks; /* Disks in this volume. */ uint8_t stripe_shift; /* Strip size. */ uint8_t array_width; /* Number of RAID0 stripes. */ uint8_t array_number; /* Global volume number. */ uint32_t total_sectors; /* Volume size. */ uint16_t cylinders; /* Volume geometry: C. */ uint8_t heads; /* Volume geometry: H. */ uint8_t sectors; /* Volume geometry: S. */ uint64_t volume_id __packed; /* Volume ID, */ struct promise_raid_disk disks[PROMISE_MAX_DISKS]; /* Subdisks in this volume. */ char name[32]; /* Volume label. */ uint32_t filler2[8]; uint32_t magic_3; /* Something related to rebuild. */ uint64_t rebuild_lba64; /* Per-volume rebuild position. */ uint32_t magic_4; uint32_t magic_5; uint32_t total_sectors_high; uint8_t magic_6; uint8_t sector_size; uint16_t magic_7; uint32_t magic_8[31]; uint32_t backup_time; uint16_t magic_9; uint32_t disk_offset_high; uint32_t disk_sectors_high; uint32_t disk_rebuild_high; uint16_t magic_10; uint32_t magic_11[3]; uint32_t filler3[284]; uint32_t checksum; } __packed; struct g_raid_md_promise_perdisk { int pd_updated; int pd_subdisks; struct promise_raid_conf *pd_meta[PROMISE_MAX_SUBDISKS]; }; struct g_raid_md_promise_pervolume { struct promise_raid_conf *pv_meta; uint64_t pv_id; uint16_t pv_generation; int pv_disks_present; int pv_started; struct callout pv_start_co; /* STARTING state timer. */ }; static g_raid_md_create_t g_raid_md_create_promise; static g_raid_md_taste_t g_raid_md_taste_promise; static g_raid_md_event_t g_raid_md_event_promise; static g_raid_md_volume_event_t g_raid_md_volume_event_promise; static g_raid_md_ctl_t g_raid_md_ctl_promise; static g_raid_md_write_t g_raid_md_write_promise; static g_raid_md_fail_disk_t g_raid_md_fail_disk_promise; static g_raid_md_free_disk_t g_raid_md_free_disk_promise; static g_raid_md_free_volume_t g_raid_md_free_volume_promise; static g_raid_md_free_t g_raid_md_free_promise; static kobj_method_t g_raid_md_promise_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_promise), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_promise), KOBJMETHOD(g_raid_md_event, g_raid_md_event_promise), KOBJMETHOD(g_raid_md_volume_event, g_raid_md_volume_event_promise), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_promise), KOBJMETHOD(g_raid_md_write, g_raid_md_write_promise), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_promise), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_promise), KOBJMETHOD(g_raid_md_free_volume, g_raid_md_free_volume_promise), KOBJMETHOD(g_raid_md_free, g_raid_md_free_promise), { 0, 0 } }; static struct g_raid_md_class g_raid_md_promise_class = { "Promise", g_raid_md_promise_methods, sizeof(struct g_raid_md_object), .mdc_enable = 1, .mdc_priority = 100 }; static void g_raid_md_promise_print(struct promise_raid_conf *meta) { int i; if (g_raid_debug < 1) return; printf("********* ATA Promise Metadata *********\n"); printf("promise_id <%.24s>\n", meta->promise_id); printf("disk %02x %02x %02x %02x %016jx\n", meta->disk.flags, meta->disk.number, meta->disk.channel, meta->disk.device, meta->disk.id); printf("disk_offset %u\n", meta->disk_offset); printf("disk_sectors %u\n", meta->disk_sectors); printf("disk_rebuild %u\n", meta->disk_rebuild); printf("generation %u\n", meta->generation); printf("status 0x%02x\n", meta->status); printf("type %u\n", meta->type); printf("total_disks %u\n", meta->total_disks); printf("stripe_shift %u\n", meta->stripe_shift); printf("array_width %u\n", meta->array_width); printf("array_number %u\n", meta->array_number); printf("total_sectors %u\n", meta->total_sectors); printf("cylinders %u\n", meta->cylinders); printf("heads %u\n", meta->heads); printf("sectors %u\n", meta->sectors); printf("volume_id 0x%016jx\n", meta->volume_id); printf("disks:\n"); for (i = 0; i < PROMISE_MAX_DISKS; i++ ) { printf(" %02x %02x %02x %02x %016jx\n", meta->disks[i].flags, meta->disks[i].number, meta->disks[i].channel, meta->disks[i].device, meta->disks[i].id); } printf("name <%.32s>\n", meta->name); printf("magic_3 0x%08x\n", meta->magic_3); printf("rebuild_lba64 %ju\n", meta->rebuild_lba64); printf("magic_4 0x%08x\n", meta->magic_4); printf("magic_5 0x%08x\n", meta->magic_5); printf("total_sectors_high 0x%08x\n", meta->total_sectors_high); printf("sector_size %u\n", meta->sector_size); printf("backup_time %d\n", meta->backup_time); printf("disk_offset_high 0x%08x\n", meta->disk_offset_high); printf("disk_sectors_high 0x%08x\n", meta->disk_sectors_high); printf("disk_rebuild_high 0x%08x\n", meta->disk_rebuild_high); printf("=================================================\n"); } static struct promise_raid_conf * promise_meta_copy(struct promise_raid_conf *meta) { struct promise_raid_conf *nmeta; nmeta = malloc(sizeof(*nmeta), M_MD_PROMISE, M_WAITOK); memcpy(nmeta, meta, sizeof(*nmeta)); return (nmeta); } static int promise_meta_find_disk(struct promise_raid_conf *meta, uint64_t id) { int pos; for (pos = 0; pos < meta->total_disks; pos++) { if (meta->disks[pos].id == id) return (pos); } return (-1); } static int promise_meta_unused_range(struct promise_raid_conf **metaarr, int nsd, off_t sectors, off_t *off, off_t *size) { off_t coff, csize, tmp; int i, j; sectors -= 131072; *off = 0; *size = 0; coff = 0; csize = sectors; i = 0; while (1) { for (j = 0; j < nsd; j++) { tmp = ((off_t)metaarr[j]->disk_offset_high << 32) + metaarr[j]->disk_offset; if (tmp >= coff) csize = MIN(csize, tmp - coff); } if (csize > *size) { *off = coff; *size = csize; } if (i >= nsd) break; coff = ((off_t)metaarr[i]->disk_offset_high << 32) + metaarr[i]->disk_offset + ((off_t)metaarr[i]->disk_sectors_high << 32) + metaarr[i]->disk_sectors; csize = sectors - coff; i++; } return ((*size > 0) ? 1 : 0); } static int promise_meta_translate_disk(struct g_raid_volume *vol, int md_disk_pos) { int disk_pos, width; if (md_disk_pos >= 0 && vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { width = vol->v_disks_count / 2; disk_pos = (md_disk_pos / width) + (md_disk_pos % width) * width; } else disk_pos = md_disk_pos; return (disk_pos); } static void promise_meta_get_name(struct promise_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 32); buf[32] = 0; for (i = 31; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void promise_meta_put_name(struct promise_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 32); memcpy(meta->name, buf, MIN(strlen(buf), 32)); } static int promise_meta_read(struct g_consumer *cp, struct promise_raid_conf **metaarr) { struct g_provider *pp; struct promise_raid_conf *meta; char *buf; int error, i, subdisks; uint32_t checksum, *ptr; pp = cp->provider; subdisks = 0; next: /* Read metadata block. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize * (63 - subdisks * PROMISE_META_OFFSET), pp->sectorsize * 4, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (subdisks); } meta = (struct promise_raid_conf *)buf; /* Check if this is an Promise RAID struct */ if (strncmp(meta->promise_id, PROMISE_MAGIC, strlen(PROMISE_MAGIC)) && strncmp(meta->promise_id, FREEBSD_MAGIC, strlen(FREEBSD_MAGIC))) { if (subdisks == 0) G_RAID_DEBUG(1, "Promise signature check failed on %s", pp->name); g_free(buf); return (subdisks); } meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK); memcpy(meta, buf, MIN(sizeof(*meta), pp->sectorsize * 4)); g_free(buf); /* Check metadata checksum. */ for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++) checksum += *ptr++; if (checksum != meta->checksum) { G_RAID_DEBUG(1, "Promise checksum check failed on %s", pp->name); free(meta, M_MD_PROMISE); return (subdisks); } if ((meta->integrity & PROMISE_I_VALID) == 0) { G_RAID_DEBUG(1, "Promise metadata is invalid on %s", pp->name); free(meta, M_MD_PROMISE); return (subdisks); } if (meta->total_disks > PROMISE_MAX_DISKS) { G_RAID_DEBUG(1, "Wrong number of disks on %s (%d)", pp->name, meta->total_disks); free(meta, M_MD_PROMISE); return (subdisks); } /* Remove filler garbage from fields used in newer metadata. */ if (meta->disk_offset_high == 0x8b8c8d8e && meta->disk_sectors_high == 0x8788898a && meta->disk_rebuild_high == 0x83848586) { meta->disk_offset_high = 0; meta->disk_sectors_high = 0; if (meta->disk_rebuild == UINT32_MAX) meta->disk_rebuild_high = UINT32_MAX; else meta->disk_rebuild_high = 0; if (meta->total_sectors_high == 0x15161718) { meta->total_sectors_high = 0; meta->backup_time = 0; if (meta->rebuild_lba64 == 0x2122232425262728) meta->rebuild_lba64 = UINT64_MAX; } } if (meta->sector_size < 1 || meta->sector_size > 8) meta->sector_size = 1; /* Save this part and look for next. */ *metaarr = meta; metaarr++; subdisks++; if (subdisks < PROMISE_MAX_SUBDISKS) goto next; return (subdisks); } static int promise_meta_write(struct g_consumer *cp, struct promise_raid_conf **metaarr, int nsd) { struct g_provider *pp; struct promise_raid_conf *meta; char *buf; off_t off, size; int error, i, subdisk, fake; uint32_t checksum, *ptr; pp = cp->provider; subdisk = 0; fake = 0; next: buf = malloc(pp->sectorsize * 4, M_MD_PROMISE, M_WAITOK | M_ZERO); meta = NULL; if (subdisk < nsd) { meta = metaarr[subdisk]; } else if (!fake && promise_meta_unused_range(metaarr, nsd, cp->provider->mediasize / cp->provider->sectorsize, &off, &size)) { /* Optionally add record for unused space. */ meta = (struct promise_raid_conf *)buf; memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1); meta->dummy_0 = 0x00020000; meta->integrity = PROMISE_I_VALID; meta->disk.flags = PROMISE_F_ONLINE | PROMISE_F_VALID; meta->disk.number = 0xff; arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0); meta->disk_offset_high = off >> 32; meta->disk_offset = (uint32_t)off; meta->disk_sectors_high = size >> 32; meta->disk_sectors = (uint32_t)size; meta->disk_rebuild_high = UINT32_MAX; meta->disk_rebuild = UINT32_MAX; fake = 1; } if (meta != NULL) { /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint32_t *)meta, i = 0; i < 511; i++) checksum += *ptr++; meta->checksum = checksum; memcpy(buf, meta, MIN(pp->sectorsize * 4, sizeof(*meta))); } error = g_write_data(cp, pp->mediasize - pp->sectorsize * (63 - subdisk * PROMISE_META_OFFSET), buf, pp->sectorsize * 4); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); } free(buf, M_MD_PROMISE); subdisk++; if (subdisk < PROMISE_MAX_SUBDISKS) goto next; return (error); } static int promise_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error, subdisk; pp = cp->provider; buf = malloc(4 * pp->sectorsize, M_MD_PROMISE, M_WAITOK | M_ZERO); for (subdisk = 0; subdisk < PROMISE_MAX_SUBDISKS; subdisk++) { error = g_write_data(cp, pp->mediasize - pp->sectorsize * (63 - subdisk * PROMISE_META_OFFSET), buf, 4 * pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } } free(buf, M_MD_PROMISE); return (error); } static int promise_meta_write_spare(struct g_consumer *cp) { struct promise_raid_conf *meta; off_t tmp; int error; meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO); memcpy(&meta->promise_id[0], PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1); meta->dummy_0 = 0x00020000; meta->integrity = PROMISE_I_VALID; meta->disk.flags = PROMISE_F_SPARE | PROMISE_F_ONLINE | PROMISE_F_VALID; meta->disk.number = 0xff; arc4rand(&meta->disk.id, sizeof(meta->disk.id), 0); tmp = cp->provider->mediasize / cp->provider->sectorsize - 131072; meta->disk_sectors_high = tmp >> 32; meta->disk_sectors = (uint32_t)tmp; meta->disk_rebuild_high = UINT32_MAX; meta->disk_rebuild = UINT32_MAX; error = promise_meta_write(cp, &meta, 1); free(meta, M_MD_PROMISE); return (error); } static struct g_raid_volume * g_raid_md_promise_get_volume(struct g_raid_softc *sc, uint64_t id) { struct g_raid_volume *vol; struct g_raid_md_promise_pervolume *pv; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (pv->pv_id == id) break; } return (vol); } static int g_raid_md_promise_purge_volumes(struct g_raid_softc *sc) { struct g_raid_volume *vol, *tvol; struct g_raid_md_promise_pervolume *pv; int i, res; res = 0; TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tvol) { pv = vol->v_md_data; if (!pv->pv_started || vol->v_stopping) continue; for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_state != G_RAID_SUBDISK_S_NONE) break; } if (i >= vol->v_disks_count) { g_raid_destroy_volume(vol); res = 1; } } return (res); } static int g_raid_md_promise_purge_disks(struct g_raid_softc *sc) { struct g_raid_disk *disk, *tdisk; struct g_raid_volume *vol; struct g_raid_md_promise_perdisk *pd; int i, j, res; res = 0; TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) { if (disk->d_state == G_RAID_DISK_S_SPARE) continue; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; /* Scan for deleted volumes. */ for (i = 0; i < pd->pd_subdisks; ) { vol = g_raid_md_promise_get_volume(sc, pd->pd_meta[i]->volume_id); if (vol != NULL && !vol->v_stopping) { i++; continue; } free(pd->pd_meta[i], M_MD_PROMISE); for (j = i; j < pd->pd_subdisks - 1; j++) pd->pd_meta[j] = pd->pd_meta[j + 1]; pd->pd_meta[pd->pd_subdisks - 1] = NULL; pd->pd_subdisks--; pd->pd_updated = 1; } /* If there is no metadata left - erase and delete disk. */ if (pd->pd_subdisks == 0) { promise_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); res = 1; } } return (res); } static int g_raid_md_promise_supported(int level, int qual, int disks, int force) { if (disks > PROMISE_MAX_DISKS) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (disks % 2 != 0) return (0); if (!force && (disks != 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LA) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_promise_start_disk(struct g_raid_disk *disk, int sdn, struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct promise_raid_conf *meta; off_t eoff, esize, size; int disk_pos, md_disk_pos, i, resurrection = 0; sc = disk->d_softc; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; pv = vol->v_md_data; meta = pv->pv_meta; if (sdn >= 0) { - /* Find disk position in metadata by it's serial. */ + /* Find disk position in metadata by its serial. */ md_disk_pos = promise_meta_find_disk(meta, pd->pd_meta[sdn]->disk.id); /* For RAID0+1 we need to translate order. */ disk_pos = promise_meta_translate_disk(vol, md_disk_pos); } else { md_disk_pos = -1; disk_pos = -1; } if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Disk %s is not part of the volume %s", g_raid_get_diskname(disk), vol->v_name); /* Failed stale disk is useless for us. */ if (sdn >= 0 && pd->pd_meta[sdn]->disk.flags & PROMISE_F_DOWN) { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE_FAILED); return (0); } /* If we were given specific metadata subdisk - erase it. */ if (sdn >= 0) { free(pd->pd_meta[sdn], M_MD_PROMISE); for (i = sdn; i < pd->pd_subdisks - 1; i++) pd->pd_meta[i] = pd->pd_meta[i + 1]; pd->pd_meta[pd->pd_subdisks - 1] = NULL; pd->pd_subdisks--; } /* If we are in the start process, that's all for now. */ if (!pv->pv_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks, disk->d_consumer->provider->mediasize / disk->d_consumer->provider->sectorsize, &eoff, &esize); if (esize == 0) { G_RAID_DEBUG1(1, sc, "No free space on disk %s", g_raid_get_diskname(disk)); goto nofit; } size = INT64_MAX; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state != G_RAID_SUBDISK_S_NONE) size = sd->sd_size; if (sd->sd_state <= G_RAID_SUBDISK_S_FAILED && (disk_pos < 0 || vol->v_subdisks[i].sd_state < sd->sd_state)) disk_pos = i; } if (disk_pos >= 0 && vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT && (off_t)esize * 512 < size) { G_RAID_DEBUG1(1, sc, "Disk %s free space " "is too small (%ju < %ju)", g_raid_get_diskname(disk), (off_t)esize * 512, size); disk_pos = -1; } if (disk_pos >= 0) { if (vol->v_raid_level != G_RAID_VOLUME_RL_CONCAT) esize = size / 512; /* For RAID0+1 we need to translate order. */ md_disk_pos = promise_meta_translate_disk(vol, disk_pos); } else { nofit: if (pd->pd_subdisks == 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); } return (0); } G_RAID_DEBUG1(1, sc, "Disk %s takes pos %d in the volume %s", g_raid_get_diskname(disk), disk_pos, vol->v_name); resurrection = 1; } sd = &vol->v_subdisks[disk_pos]; if (resurrection && sd->sd_disk != NULL) { g_raid_change_disk_state(sd->sd_disk, G_RAID_DISK_S_STALE_FAILED); TAILQ_REMOVE(&sd->sd_disk->d_subdisks, sd, sd_next); } vol->v_subdisks[disk_pos].sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); else g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); if (resurrection) { sd->sd_offset = (off_t)eoff * 512; sd->sd_size = (off_t)esize * 512; } else { sd->sd_offset = (((off_t)pd->pd_meta[sdn]->disk_offset_high << 32) + pd->pd_meta[sdn]->disk_offset) * 512; sd->sd_size = (((off_t)pd->pd_meta[sdn]->disk_sectors_high << 32) + pd->pd_meta[sdn]->disk_sectors) * 512; } if (resurrection) { /* Stale disk, almost same as new. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (meta->disks[md_disk_pos].flags & PROMISE_F_DOWN) { /* Failed disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } else if (meta->disks[md_disk_pos].flags & PROMISE_F_REDIR) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); if (pd->pd_meta[sdn]->generation != meta->generation) sd->sd_rebuild_pos = 0; else { sd->sd_rebuild_pos = (((off_t)pd->pd_meta[sdn]->disk_rebuild_high << 32) + pd->pd_meta[sdn]->disk_rebuild) * 512; } } else if (!(meta->disks[md_disk_pos].flags & PROMISE_F_ONLINE)) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (pd->pd_meta[sdn]->generation != meta->generation || (meta->status & PROMISE_S_MARKED)) { /* Stale disk or dirty volume (unclean shutdown). */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); return (resurrection); } static void g_raid_md_promise_refill(struct g_raid_softc *sc) { struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; int update, updated, i, bad; md = sc->sc_md; restart: updated = 0; TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { pv = vol->v_md_data; if (!pv->pv_started || vol->v_stopping) continue; /* Search for subdisk that needs replacement. */ bad = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_NONE || sd->sd_state == G_RAID_SUBDISK_S_FAILED) bad = 1; } if (!bad) continue; G_RAID_DEBUG1(1, sc, "Volume %s is not complete, " "trying to refill.", vol->v_name); TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { /* Skip failed. */ if (disk->d_state < G_RAID_DISK_S_SPARE) continue; /* Skip already used by this volume. */ for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_disk == disk) break; } if (i < vol->v_disks_count) continue; /* Try to use disk if it has empty extents. */ pd = disk->d_md_data; if (pd->pd_subdisks < PROMISE_MAX_SUBDISKS) { update = g_raid_md_promise_start_disk(disk, -1, vol); } else update = 0; if (update) { updated = 1; g_raid_md_write_promise(md, vol, NULL, disk); break; } } } if (updated) goto restart; } static void g_raid_md_promise_start(struct g_raid_volume *vol) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_object *md; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct promise_raid_conf *meta; u_int i; sc = vol->v_softc; md = sc->sc_md; pv = vol->v_md_data; meta = pv->pv_meta; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == PROMISE_T_RAID0) vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; else if (meta->type == PROMISE_T_RAID1) { if (meta->array_width == 1) vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; else vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; } else if (meta->type == PROMISE_T_RAID3) vol->v_raid_level = G_RAID_VOLUME_RL_RAID3; else if (meta->type == PROMISE_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LA; } else if (meta->type == PROMISE_T_SPAN) vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; else if (meta->type == PROMISE_T_JBOD) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; vol->v_strip_size = 512 << meta->stripe_shift; //ZZZ vol->v_disks_count = meta->total_disks; vol->v_mediasize = (off_t)meta->total_sectors * 512; //ZZZ if (meta->total_sectors_high < 256) /* If value looks sane. */ vol->v_mediasize += ((off_t)meta->total_sectors_high << 32) * 512; //ZZZ vol->v_sectorsize = 512 * meta->sector_size; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; sd->sd_offset = (((off_t)meta->disk_offset_high << 32) + meta->disk_offset) * 512; sd->sd_size = (((off_t)meta->disk_sectors_high << 32) + meta->disk_sectors) * 512; } g_raid_start_volume(vol); /* Make all disks found till the moment take their places. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = disk->d_md_data; for (i = 0; i < pd->pd_subdisks; i++) { if (pd->pd_meta[i]->volume_id == meta->volume_id) g_raid_md_promise_start_disk(disk, i, vol); } } pv->pv_started = 1; callout_stop(&pv->pv_start_co); G_RAID_DEBUG1(0, sc, "Volume started."); g_raid_md_write_promise(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_promise_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); } static void g_raid_promise_go(void *arg) { struct g_raid_volume *vol; struct g_raid_softc *sc; struct g_raid_md_promise_pervolume *pv; vol = arg; pv = vol->v_md_data; sc = vol->v_softc; if (!pv->pv_started) { G_RAID_DEBUG1(0, sc, "Force volume start due to timeout."); g_raid_event_send(vol, G_RAID_VOLUME_E_STARTMD, G_RAID_EVENT_VOLUME); } } static void g_raid_md_promise_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct promise_raid_conf *pdmeta; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct g_raid_volume *vol; int i; char buf[33]; sc = disk->d_softc; md = sc->sc_md; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; if (pd->pd_subdisks == 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); g_raid_md_promise_refill(sc); return; } for (i = 0; i < pd->pd_subdisks; i++) { pdmeta = pd->pd_meta[i]; /* Look for volume with matching ID. */ vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id); if (vol == NULL) { promise_meta_get_name(pdmeta, buf); vol = g_raid_create_volume(sc, buf, pdmeta->array_number); pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO); pv->pv_id = pdmeta->volume_id; vol->v_md_data = pv; callout_init(&pv->pv_start_co, 1); callout_reset(&pv->pv_start_co, g_raid_start_timeout * hz, g_raid_promise_go, vol); } else pv = vol->v_md_data; /* If we haven't started yet - check metadata freshness. */ if (pv->pv_meta == NULL || !pv->pv_started) { if (pv->pv_meta == NULL || ((int16_t)(pdmeta->generation - pv->pv_generation)) > 0) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (pv->pv_meta != NULL) free(pv->pv_meta, M_MD_PROMISE); pv->pv_meta = promise_meta_copy(pdmeta); pv->pv_generation = pv->pv_meta->generation; pv->pv_disks_present = 1; } else if (pdmeta->generation == pv->pv_generation) { pv->pv_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", pv->pv_disks_present, pv->pv_meta->total_disks); } else { G_RAID_DEBUG1(1, sc, "Older disk"); } } } for (i = 0; i < pd->pd_subdisks; i++) { pdmeta = pd->pd_meta[i]; /* Look for volume with matching ID. */ vol = g_raid_md_promise_get_volume(sc, pdmeta->volume_id); if (vol == NULL) continue; pv = vol->v_md_data; if (pv->pv_started) { if (g_raid_md_promise_start_disk(disk, i, vol)) g_raid_md_write_promise(md, vol, NULL, NULL); } else { /* If we collected all needed disks - start array. */ if (pv->pv_disks_present == pv->pv_meta->total_disks) g_raid_md_promise_start(vol); } } } static int g_raid_md_create_promise(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_geom *geom; struct g_raid_softc *sc; /* Search for existing node. */ LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; break; } if (geom != NULL) { *gp = geom; return (G_RAID_MD_TASTE_EXISTING); } /* Create new one if not found. */ sc = g_raid_create_node(mp, "Promise", md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_promise(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_softc *sc; struct g_raid_disk *disk; struct promise_raid_conf *meta, *metaarr[4]; struct g_raid_md_promise_perdisk *pd; struct g_geom *geom; int i, j, result, len, subdisks; char name[16]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting Promise on %s", cp->provider->name); pp = cp->provider; /* Read metadata from device. */ meta = NULL; g_topology_unlock(); vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); subdisks = promise_meta_read(cp, metaarr); g_topology_lock(); if (subdisks == 0) { if (g_raid_aggressive_spare) { if (vendor == 0x105a || vendor == 0x1002) { G_RAID_DEBUG(1, "No Promise metadata, forcing spare."); goto search; } else { G_RAID_DEBUG(1, "Promise/ATI vendor mismatch " "0x%04x != 0x105a/0x1002", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Metadata valid. Print it. */ for (i = 0; i < subdisks; i++) g_raid_md_promise_print(metaarr[i]); /* Purge meaningless (empty/spare) records. */ for (i = 0; i < subdisks; ) { if (metaarr[i]->disk.flags & PROMISE_F_ASSIGNED) { i++; continue; } free(metaarr[i], M_MD_PROMISE); for (j = i; j < subdisks - 1; j++) metaarr[i] = metaarr[j + 1]; metaarr[subdisks - 1] = NULL; subdisks--; } search: /* Search for matching node. */ sc = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; break; } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; snprintf(name, sizeof(name), "Promise"); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; } /* There is no return after this point, so we close passed consumer. */ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); pd->pd_subdisks = subdisks; for (i = 0; i < subdisks; i++) pd->pd_meta[i] = metaarr[i]; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_promise_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); } static int g_raid_md_event_promise(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; sc = md->mdo_softc; if (disk == NULL) return (-1); switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* Delete disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); g_raid_md_promise_purge_volumes(sc); /* Write updated metadata to all disks. */ g_raid_md_write_promise(md, NULL, NULL, NULL); /* Check if anything left. */ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_promise_refill(sc); return (0); } return (-2); } static int g_raid_md_volume_event_promise(struct g_raid_md_object *md, struct g_raid_volume *vol, u_int event) { struct g_raid_md_promise_pervolume *pv; pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; switch (event) { case G_RAID_VOLUME_E_STARTMD: if (!pv->pv_started) g_raid_md_promise_start(vol); return (0); } return (-2); } static int g_raid_md_ctl_promise(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol, *vol1; struct g_raid_subdisk *sd; struct g_raid_disk *disk, *disks[PROMISE_MAX_DISKS]; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *nodename, *verb, *volname, *levelname, *diskname; char *tmp; int *nargs, *force; off_t esize, offs[PROMISE_MAX_DISKS], size, sectorsize, strip; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual; int error; sc = md->mdo_softc; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LA"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_promise_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = INT64_MAX; sectorsize = 0; bzero(disks, sizeof(disks)); bzero(offs, sizeof(offs)); for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) continue; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk != NULL) { if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' is in a " "wrong state (%s).", diskname, g_raid_disk_state2str(disk->d_state)); error = -7; break; } pd = disk->d_md_data; if (pd->pd_subdisks >= PROMISE_MAX_SUBDISKS) { gctl_error(req, "Disk '%s' already " "used by %d volumes.", diskname, pd->pd_subdisks); error = -7; break; } pp = disk->d_consumer->provider; disks[i] = disk; promise_meta_unused_range(pd->pd_meta, pd->pd_subdisks, pp->mediasize / pp->sectorsize, &offs[i], &esize); size = MIN(size, (off_t)esize * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); continue; } g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -8; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; disks[i] = disk; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); /* Reserve some space for metadata. */ size = MIN(size, pp->mediasize - 131072llu * pp->sectorsize); sectorsize = MAX(sectorsize, pp->sectorsize); } if (error != 0) { for (i = 0; i < numdisks; i++) { if (disks[i] != NULL && disks[i]->d_state == G_RAID_DISK_S_NONE) g_raid_destroy_disk(disks[i]); } return (error); } if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1 || level == G_RAID_VOLUME_RL_SINGLE || level == G_RAID_VOLUME_RL_CONCAT) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } /* We have all we need, create things: volume, ... */ pv = malloc(sizeof(*pv), M_MD_PROMISE, M_WAITOK | M_ZERO); arc4rand(&pv->pv_id, sizeof(pv->pv_id), 0); pv->pv_generation = 0; pv->pv_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = pv; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID3 || level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ for (i = 0; i < numdisks; i++) { disk = disks[i]; sd = &vol->v_subdisks[i]; sd->sd_disk = disk; sd->sd_offset = (off_t)offs[i] * 512; sd->sd_size = size; if (disk == NULL) continue; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_promise(md, vol, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_promise_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "add") == 0) { gctl_error(req, "`add` command is not applicable, " "use `label` instead."); return (-99); } if (strcmp(verb, "delete") == 0) { nodename = gctl_get_asciiparam(req, "arg0"); if (nodename != NULL && strcasecmp(sc->sc_name, nodename) != 0) nodename = NULL; /* Full node destruction. */ if (*nargs == 1 && nodename != NULL) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) promise_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } /* Destroy specified volume. If it was last - all node. */ if (*nargs > 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, nodename != NULL ? "arg1" : "arg0"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } /* Search for volume. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (strcmp(vol->v_name, volname) == 0) break; pp = vol->v_provider; if (pp == NULL) continue; if (strcmp(pp->name, volname) == 0) break; if (strncmp(pp->name, "raid/", 5) == 0 && strcmp(pp->name + 5, volname) == 0) break; } if (vol == NULL) { i = strtol(volname, &tmp, 10); if (verb != volname && tmp[0] == 0) { TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_global_id == i) break; } } } if (vol == NULL) { gctl_error(req, "Volume '%s' not found.", volname); return (-3); } /* Check if volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && vol->v_provider_open != 0) { gctl_error(req, "Volume is still open."); return (-4); } /* Destroy volume and potentially node. */ i = 0; TAILQ_FOREACH(vol1, &sc->sc_volumes, v_next) i++; if (i >= 2) { g_raid_destroy_volume(vol); g_raid_md_promise_purge_disks(sc); g_raid_md_write_promise(md, NULL, NULL, NULL); } else { TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) promise_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); } return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_promise(md, NULL, disk); continue; } /* Erase metadata on deleting disk and destroy it. */ promise_meta_erase(disk->d_consumer); g_raid_destroy_disk(disk); } g_raid_md_promise_purge_volumes(sc); /* Write updated metadata to remaining disks. */ g_raid_md_write_promise(md, NULL, NULL, NULL); /* Check if anything left. */ if (g_raid_ndisks(sc, -1) == 0) g_raid_destroy_node(sc, 0); else g_raid_md_promise_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; g_topology_unlock(); pd = malloc(sizeof(*pd), M_MD_PROMISE, M_WAITOK | M_ZERO); disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); promise_meta_write_spare(cp); g_raid_md_promise_refill(sc); } return (error); } return (-100); } static int g_raid_md_write_promise(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_promise_perdisk *pd; struct g_raid_md_promise_pervolume *pv; struct promise_raid_conf *meta; off_t rebuild_lba64; int i, j, pos, rebuild; sc = md->mdo_softc; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* Generate new per-volume metadata for affected volumes. */ TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { if (vol->v_stopping) continue; /* Skip volumes not related to specified targets. */ if (tvol != NULL && vol != tvol) continue; if (tsd != NULL && vol != tsd->sd_volume) continue; if (tdisk != NULL) { for (i = 0; i < vol->v_disks_count; i++) { if (vol->v_subdisks[i].sd_disk == tdisk) break; } if (i >= vol->v_disks_count) continue; } pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; pv->pv_generation++; meta = malloc(sizeof(*meta), M_MD_PROMISE, M_WAITOK | M_ZERO); if (pv->pv_meta != NULL) memcpy(meta, pv->pv_meta, sizeof(*meta)); memcpy(meta->promise_id, PROMISE_MAGIC, sizeof(PROMISE_MAGIC) - 1); meta->dummy_0 = 0x00020000; meta->integrity = PROMISE_I_VALID; meta->generation = pv->pv_generation; meta->status = PROMISE_S_VALID | PROMISE_S_ONLINE | PROMISE_S_INITED | PROMISE_S_READY; if (vol->v_state <= G_RAID_VOLUME_S_DEGRADED) meta->status |= PROMISE_S_DEGRADED; if (vol->v_dirty) meta->status |= PROMISE_S_MARKED; /* XXX: INVENTED! */ if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0 || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) meta->type = PROMISE_T_RAID0; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->type = PROMISE_T_RAID1; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) meta->type = PROMISE_T_RAID3; else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) meta->type = PROMISE_T_RAID5; else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) meta->type = PROMISE_T_SPAN; else meta->type = PROMISE_T_JBOD; meta->total_disks = vol->v_disks_count; meta->stripe_shift = ffs(vol->v_strip_size / 1024); meta->array_width = vol->v_disks_count; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) meta->array_width /= 2; meta->array_number = vol->v_global_id; meta->total_sectors = vol->v_mediasize / 512; meta->total_sectors_high = (vol->v_mediasize / 512) >> 32; meta->sector_size = vol->v_sectorsize / 512; meta->cylinders = meta->total_sectors / (255 * 63) - 1; meta->heads = 254; meta->sectors = 63; meta->volume_id = pv->pv_id; rebuild_lba64 = UINT64_MAX; rebuild = 0; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; /* For RAID0+1 we need to translate order. */ pos = promise_meta_translate_disk(vol, i); meta->disks[pos].flags = PROMISE_F_VALID | PROMISE_F_ASSIGNED; if (sd->sd_state == G_RAID_SUBDISK_S_NONE) { meta->disks[pos].flags |= 0; } else if (sd->sd_state == G_RAID_SUBDISK_S_FAILED) { meta->disks[pos].flags |= PROMISE_F_DOWN | PROMISE_F_REDIR; } else if (sd->sd_state <= G_RAID_SUBDISK_S_REBUILD) { meta->disks[pos].flags |= PROMISE_F_ONLINE | PROMISE_F_REDIR; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) { rebuild_lba64 = MIN(rebuild_lba64, sd->sd_rebuild_pos / 512); } else rebuild_lba64 = 0; rebuild = 1; } else { meta->disks[pos].flags |= PROMISE_F_ONLINE; if (sd->sd_state < G_RAID_SUBDISK_S_ACTIVE) { meta->status |= PROMISE_S_MARKED; if (sd->sd_state == G_RAID_SUBDISK_S_RESYNC) { rebuild_lba64 = MIN(rebuild_lba64, sd->sd_rebuild_pos / 512); } else rebuild_lba64 = 0; } } if (pv->pv_meta != NULL) { meta->disks[pos].id = pv->pv_meta->disks[pos].id; } else { meta->disks[pos].number = i * 2; arc4rand(&meta->disks[pos].id, sizeof(meta->disks[pos].id), 0); } } promise_meta_put_name(meta, vol->v_name); /* Try to mimic AMD BIOS rebuild/resync behavior. */ if (rebuild_lba64 != UINT64_MAX) { if (rebuild) meta->magic_3 = 0x03040010UL; /* Rebuild? */ else meta->magic_3 = 0x03040008UL; /* Resync? */ /* Translate from per-disk to per-volume LBA. */ if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { rebuild_lba64 *= meta->array_width; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 || vol->v_raid_level == G_RAID_VOLUME_RL_RAID5) { rebuild_lba64 *= meta->array_width - 1; } else rebuild_lba64 = 0; } else meta->magic_3 = 0x03000000UL; meta->rebuild_lba64 = rebuild_lba64; meta->magic_4 = 0x04010101UL; /* Replace per-volume metadata with new. */ if (pv->pv_meta != NULL) free(pv->pv_meta, M_MD_PROMISE); pv->pv_meta = meta; /* Copy new metadata to the disks, adding or replacing old. */ for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; disk = sd->sd_disk; if (disk == NULL) continue; /* For RAID0+1 we need to translate order. */ pos = promise_meta_translate_disk(vol, i); pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; for (j = 0; j < pd->pd_subdisks; j++) { if (pd->pd_meta[j]->volume_id == meta->volume_id) break; } if (j == pd->pd_subdisks) pd->pd_subdisks++; if (pd->pd_meta[j] != NULL) free(pd->pd_meta[j], M_MD_PROMISE); pd->pd_meta[j] = promise_meta_copy(meta); pd->pd_meta[j]->disk = meta->disks[pos]; pd->pd_meta[j]->disk.number = pos; pd->pd_meta[j]->disk_offset_high = (sd->sd_offset / 512) >> 32; pd->pd_meta[j]->disk_offset = sd->sd_offset / 512; pd->pd_meta[j]->disk_sectors_high = (sd->sd_size / 512) >> 32; pd->pd_meta[j]->disk_sectors = sd->sd_size / 512; if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD) { pd->pd_meta[j]->disk_rebuild_high = (sd->sd_rebuild_pos / 512) >> 32; pd->pd_meta[j]->disk_rebuild = sd->sd_rebuild_pos / 512; } else if (sd->sd_state < G_RAID_SUBDISK_S_REBUILD) { pd->pd_meta[j]->disk_rebuild_high = 0; pd->pd_meta[j]->disk_rebuild = 0; } else { pd->pd_meta[j]->disk_rebuild_high = UINT32_MAX; pd->pd_meta[j]->disk_rebuild = UINT32_MAX; } pd->pd_updated = 1; } } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE) continue; if (!pd->pd_updated) continue; G_RAID_DEBUG(1, "Writing Promise metadata to %s", g_raid_get_diskname(disk)); for (i = 0; i < pd->pd_subdisks; i++) g_raid_md_promise_print(pd->pd_meta[i]); promise_meta_write(disk->d_consumer, pd->pd_meta, pd->pd_subdisks); pd->pd_updated = 0; } return (0); } static int g_raid_md_fail_disk_promise(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_promise_perdisk *pd; struct g_raid_subdisk *sd; int i, pos; sc = md->mdo_softc; pd = (struct g_raid_md_promise_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (tdisk->d_state != G_RAID_DISK_S_ACTIVE) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent it's later resurrection as STALE. */ if (pd->pd_subdisks > 0 && tdisk->d_consumer != NULL) G_RAID_DEBUG(1, "Writing Promise metadata to %s", g_raid_get_diskname(tdisk)); for (i = 0; i < pd->pd_subdisks; i++) { pd->pd_meta[i]->disk.flags |= PROMISE_F_DOWN | PROMISE_F_REDIR; pos = pd->pd_meta[i]->disk.number; if (pos >= 0 && pos < PROMISE_MAX_DISKS) { pd->pd_meta[i]->disks[pos].flags |= PROMISE_F_DOWN | PROMISE_F_REDIR; } g_raid_md_promise_print(pd->pd_meta[i]); } if (tdisk->d_consumer != NULL) promise_meta_write(tdisk->d_consumer, pd->pd_meta, pd->pd_subdisks); /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_promise(md, NULL, NULL, tdisk); g_raid_md_promise_refill(sc); return (0); } static int g_raid_md_free_disk_promise(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_promise_perdisk *pd; int i; pd = (struct g_raid_md_promise_perdisk *)disk->d_md_data; for (i = 0; i < pd->pd_subdisks; i++) { if (pd->pd_meta[i] != NULL) { free(pd->pd_meta[i], M_MD_PROMISE); pd->pd_meta[i] = NULL; } } free(pd, M_MD_PROMISE); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_volume_promise(struct g_raid_md_object *md, struct g_raid_volume *vol) { struct g_raid_md_promise_pervolume *pv; pv = (struct g_raid_md_promise_pervolume *)vol->v_md_data; if (pv && pv->pv_meta != NULL) { free(pv->pv_meta, M_MD_PROMISE); pv->pv_meta = NULL; } if (pv && !pv->pv_started) { pv->pv_started = 1; callout_stop(&pv->pv_start_co); } free(pv, M_MD_PROMISE); vol->v_md_data = NULL; return (0); } static int g_raid_md_free_promise(struct g_raid_md_object *md) { return (0); } G_RAID_MD_DECLARE(promise, "Promise"); Index: head/sys/geom/raid/md_sii.c =================================================================== --- head/sys/geom/raid/md_sii.c (revision 308456) +++ head/sys/geom/raid/md_sii.c (revision 308457) @@ -1,1671 +1,1671 @@ /*- * Copyright (c) 2011 Alexander Motin * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "geom/raid/g_raid.h" #include "g_raid_md_if.h" static MALLOC_DEFINE(M_MD_SII, "md_sii_data", "GEOM_RAID SiI metadata"); struct sii_raid_conf { uint16_t ata_params_00_53[54]; uint64_t total_sectors; /* 54 - 57 */ uint16_t ata_params_58_81[72]; uint16_t product_id; /* 130 */ uint16_t vendor_id; /* 131 */ uint16_t version_minor; /* 132 */ uint16_t version_major; /* 133 */ uint8_t timestamp[6]; /* 134 - 136 */ uint16_t strip_sectors; /* 137 */ uint16_t dummy_2; uint8_t disk_number; /* 139 */ uint8_t type; #define SII_T_RAID0 0x00 #define SII_T_RAID1 0x01 #define SII_T_RAID01 0x02 #define SII_T_SPARE 0x03 #define SII_T_CONCAT 0x04 #define SII_T_RAID5 0x10 #define SII_T_RESERVED 0xfd #define SII_T_JBOD 0xff uint8_t raid0_disks; /* 140 */ uint8_t raid0_ident; uint8_t raid1_disks; /* 141 */ uint8_t raid1_ident; uint64_t rebuild_lba; /* 142 - 145 */ uint32_t generation; /* 146 - 147 */ uint8_t disk_status; /* 148 */ #define SII_S_CURRENT 0x01 #define SII_S_REBUILD 0x02 #define SII_S_DROPPED 0x03 #define SII_S_REMOVED 0x04 uint8_t raid_status; #define SII_S_ONLINE 0x01 #define SII_S_AVAILABLE 0x02 uint8_t raid_location; /* 149 */ uint8_t disk_location; uint8_t auto_rebuild; /* 150 */ #define SII_R_REBUILD 0x00 #define SII_R_NOREBUILD 0xff uint8_t dummy_3; uint8_t name[16]; /* 151 - 158 */ uint16_t checksum; /* 159 */ uint16_t ata_params_160_255[96]; } __packed; struct g_raid_md_sii_perdisk { struct sii_raid_conf *pd_meta; int pd_disk_pos; off_t pd_disk_size; }; struct g_raid_md_sii_object { struct g_raid_md_object mdio_base; uint8_t mdio_timestamp[6]; uint8_t mdio_location; uint32_t mdio_generation; struct sii_raid_conf *mdio_meta; struct callout mdio_start_co; /* STARTING state timer. */ int mdio_total_disks; int mdio_disks_present; int mdio_started; int mdio_incomplete; struct root_hold_token *mdio_rootmount; /* Root mount delay token. */ }; static g_raid_md_create_t g_raid_md_create_sii; static g_raid_md_taste_t g_raid_md_taste_sii; static g_raid_md_event_t g_raid_md_event_sii; static g_raid_md_ctl_t g_raid_md_ctl_sii; static g_raid_md_write_t g_raid_md_write_sii; static g_raid_md_fail_disk_t g_raid_md_fail_disk_sii; static g_raid_md_free_disk_t g_raid_md_free_disk_sii; static g_raid_md_free_t g_raid_md_free_sii; static kobj_method_t g_raid_md_sii_methods[] = { KOBJMETHOD(g_raid_md_create, g_raid_md_create_sii), KOBJMETHOD(g_raid_md_taste, g_raid_md_taste_sii), KOBJMETHOD(g_raid_md_event, g_raid_md_event_sii), KOBJMETHOD(g_raid_md_ctl, g_raid_md_ctl_sii), KOBJMETHOD(g_raid_md_write, g_raid_md_write_sii), KOBJMETHOD(g_raid_md_fail_disk, g_raid_md_fail_disk_sii), KOBJMETHOD(g_raid_md_free_disk, g_raid_md_free_disk_sii), KOBJMETHOD(g_raid_md_free, g_raid_md_free_sii), { 0, 0 } }; static struct g_raid_md_class g_raid_md_sii_class = { "SiI", g_raid_md_sii_methods, sizeof(struct g_raid_md_sii_object), .mdc_enable = 1, .mdc_priority = 100 }; static void g_raid_md_sii_print(struct sii_raid_conf *meta) { if (g_raid_debug < 1) return; printf("********* ATA SiI RAID Metadata *********\n"); printf("total_sectors %llu\n", (long long unsigned)meta->total_sectors); printf("product_id 0x%04x\n", meta->product_id); printf("vendor_id 0x%04x\n", meta->vendor_id); printf("version_minor 0x%04x\n", meta->version_minor); printf("version_major 0x%04x\n", meta->version_major); printf("timestamp 0x%02x%02x%02x%02x%02x%02x\n", meta->timestamp[5], meta->timestamp[4], meta->timestamp[3], meta->timestamp[2], meta->timestamp[1], meta->timestamp[0]); printf("strip_sectors %d\n", meta->strip_sectors); printf("disk_number %d\n", meta->disk_number); printf("type 0x%02x\n", meta->type); printf("raid0_disks %d\n", meta->raid0_disks); printf("raid0_ident %d\n", meta->raid0_ident); printf("raid1_disks %d\n", meta->raid1_disks); printf("raid1_ident %d\n", meta->raid1_ident); printf("rebuild_lba %llu\n", (long long unsigned)meta->rebuild_lba); printf("generation %d\n", meta->generation); printf("disk_status %d\n", meta->disk_status); printf("raid_status %d\n", meta->raid_status); printf("raid_location %d\n", meta->raid_location); printf("disk_location %d\n", meta->disk_location); printf("auto_rebuild %d\n", meta->auto_rebuild); printf("name <%.16s>\n", meta->name); printf("checksum 0x%04x\n", meta->checksum); printf("=================================================\n"); } static struct sii_raid_conf * sii_meta_copy(struct sii_raid_conf *meta) { struct sii_raid_conf *nmeta; nmeta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK); memcpy(nmeta, meta, sizeof(*meta)); return (nmeta); } static int sii_meta_total_disks(struct sii_raid_conf *meta) { switch (meta->type) { case SII_T_RAID0: case SII_T_RAID5: case SII_T_CONCAT: return (meta->raid0_disks); case SII_T_RAID1: return (meta->raid1_disks); case SII_T_RAID01: return (meta->raid0_disks * meta->raid1_disks); case SII_T_SPARE: case SII_T_JBOD: return (1); } return (0); } static int sii_meta_disk_pos(struct sii_raid_conf *meta, struct sii_raid_conf *pdmeta) { if (pdmeta->type == SII_T_SPARE) return (-3); if (memcmp(&meta->timestamp, &pdmeta->timestamp, 6) != 0) return (-1); switch (pdmeta->type) { case SII_T_RAID0: case SII_T_RAID1: case SII_T_RAID5: case SII_T_CONCAT: return (pdmeta->disk_number); case SII_T_RAID01: return (pdmeta->raid1_ident * pdmeta->raid1_disks + pdmeta->raid0_ident); case SII_T_JBOD: return (0); } return (-1); } static void sii_meta_get_name(struct sii_raid_conf *meta, char *buf) { int i; strncpy(buf, meta->name, 16); buf[16] = 0; for (i = 15; i >= 0; i--) { if (buf[i] > 0x20) break; buf[i] = 0; } } static void sii_meta_put_name(struct sii_raid_conf *meta, char *buf) { memset(meta->name, 0x20, 16); memcpy(meta->name, buf, MIN(strlen(buf), 16)); } static struct sii_raid_conf * sii_meta_read(struct g_consumer *cp) { struct g_provider *pp; struct sii_raid_conf *meta; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Read the anchor sector. */ buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { G_RAID_DEBUG(1, "Cannot read metadata from %s (error=%d).", pp->name, error); return (NULL); } meta = (struct sii_raid_conf *)buf; /* Check vendor ID. */ if (meta->vendor_id != 0x1095) { G_RAID_DEBUG(1, "SiI vendor ID check failed on %s (0x%04x)", pp->name, meta->vendor_id); g_free(buf); return (NULL); } /* Check metadata major version. */ if (meta->version_major != 2) { G_RAID_DEBUG(1, "SiI version check failed on %s (%d.%d)", pp->name, meta->version_major, meta->version_minor); g_free(buf); return (NULL); } meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK); memcpy(meta, buf, min(sizeof(*meta), pp->sectorsize)); g_free(buf); /* Check metadata checksum. */ for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i <= 159; i++) checksum += *ptr++; if (checksum != 0) { G_RAID_DEBUG(1, "SiI checksum check failed on %s", pp->name); free(meta, M_MD_SII); return (NULL); } /* Check raid type. */ if (meta->type != SII_T_RAID0 && meta->type != SII_T_RAID1 && meta->type != SII_T_RAID01 && meta->type != SII_T_SPARE && meta->type != SII_T_RAID5 && meta->type != SII_T_CONCAT && meta->type != SII_T_JBOD) { G_RAID_DEBUG(1, "SiI unknown RAID level on %s (0x%02x)", pp->name, meta->type); free(meta, M_MD_SII); return (NULL); } return (meta); } static int sii_meta_write(struct g_consumer *cp, struct sii_raid_conf *meta) { struct g_provider *pp; char *buf; int error, i; uint16_t checksum, *ptr; pp = cp->provider; /* Recalculate checksum for case if metadata were changed. */ meta->checksum = 0; for (checksum = 0, ptr = (uint16_t *)meta, i = 0; i < 159; i++) checksum += *ptr++; meta->checksum -= checksum; /* Create and fill buffer. */ buf = malloc(pp->sectorsize, M_MD_SII, M_WAITOK | M_ZERO); memcpy(buf, meta, sizeof(*meta)); /* Write 4 copies of metadata. */ for (i = 0; i < 4; i++) { error = g_write_data(cp, pp->mediasize - (pp->sectorsize * (1 + 0x200 * i)), buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot write metadata to %s (error=%d).", pp->name, error); break; } } free(buf, M_MD_SII); return (error); } static int sii_meta_erase(struct g_consumer *cp) { struct g_provider *pp; char *buf; int error, i; pp = cp->provider; buf = malloc(pp->sectorsize, M_MD_SII, M_WAITOK | M_ZERO); /* Write 4 copies of metadata. */ for (i = 0; i < 4; i++) { error = g_write_data(cp, pp->mediasize - (pp->sectorsize * (1 + 0x200 * i)), buf, pp->sectorsize); if (error != 0) { G_RAID_DEBUG(1, "Cannot erase metadata on %s (error=%d).", pp->name, error); } } free(buf, M_MD_SII); return (error); } static int sii_meta_write_spare(struct g_consumer *cp) { struct sii_raid_conf *meta; int error; meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK | M_ZERO); meta->total_sectors = cp->provider->mediasize / cp->provider->sectorsize - 0x800; meta->vendor_id = 0x1095; meta->version_minor = 0; meta->version_major = 2; meta->timestamp[0] = arc4random(); meta->timestamp[1] = arc4random(); meta->timestamp[2] = arc4random(); meta->timestamp[3] = arc4random(); meta->timestamp[4] = arc4random(); meta->timestamp[5] = arc4random(); meta->type = SII_T_SPARE; meta->generation = 1; meta->raid1_ident = 0xff; meta->raid_location = arc4random(); error = sii_meta_write(cp, meta); free(meta, M_MD_SII); return (error); } static struct g_raid_disk * g_raid_md_sii_get_disk(struct g_raid_softc *sc, int id) { struct g_raid_disk *disk; struct g_raid_md_sii_perdisk *pd; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; if (pd->pd_disk_pos == id) break; } return (disk); } static int g_raid_md_sii_supported(int level, int qual, int disks, int force) { if (disks > 8) return (0); switch (level) { case G_RAID_VOLUME_RL_RAID0: if (disks < 1) return (0); if (!force && (disks < 2 || disks > 6)) return (0); break; case G_RAID_VOLUME_RL_RAID1: if (disks < 1) return (0); if (!force && (disks != 2)) return (0); break; case G_RAID_VOLUME_RL_RAID1E: if (disks < 2) return (0); if (disks % 2 != 0) return (0); if (!force && (disks < 4)) return (0); break; case G_RAID_VOLUME_RL_SINGLE: if (disks != 1) return (0); break; case G_RAID_VOLUME_RL_CONCAT: if (disks < 2) return (0); break; case G_RAID_VOLUME_RL_RAID5: if (disks < 3) return (0); if (qual != G_RAID_VOLUME_RLQ_R5LS) return (0); break; default: return (0); } if (level != G_RAID_VOLUME_RL_RAID5 && qual != G_RAID_VOLUME_RLQ_NONE) return (0); return (1); } static int g_raid_md_sii_start_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_subdisk *sd, *tmpsd; struct g_raid_disk *olddisk, *tmpdisk; struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd, *oldpd; struct sii_raid_conf *meta; int disk_pos, resurrection = 0; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; meta = mdi->mdio_meta; pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; olddisk = NULL; - /* Find disk position in metadata by it's serial. */ + /* Find disk position in metadata by its serial. */ if (pd->pd_meta != NULL) disk_pos = sii_meta_disk_pos(meta, pd->pd_meta); else disk_pos = -3; if (disk_pos < 0) { G_RAID_DEBUG1(1, sc, "Unknown, probably new or stale disk"); /* If we are in the start process, that's all for now. */ if (!mdi->mdio_started) goto nofit; /* * If we have already started - try to get use of the disk. * Try to replace OFFLINE disks first, then FAILED. */ TAILQ_FOREACH(tmpdisk, &sc->sc_disks, d_next) { if (tmpdisk->d_state != G_RAID_DISK_S_OFFLINE && tmpdisk->d_state != G_RAID_DISK_S_FAILED) continue; /* Make sure this disk is big enough. */ TAILQ_FOREACH(sd, &tmpdisk->d_subdisks, sd_next) { if (sd->sd_offset + sd->sd_size + 512 > pd->pd_disk_size) { G_RAID_DEBUG1(1, sc, "Disk too small (%ju < %ju)", pd->pd_disk_size, sd->sd_offset + sd->sd_size + 512); break; } } if (sd != NULL) continue; if (tmpdisk->d_state == G_RAID_DISK_S_OFFLINE) { olddisk = tmpdisk; break; } else if (olddisk == NULL) olddisk = tmpdisk; } if (olddisk == NULL) { nofit: if (disk_pos == -3 || pd->pd_disk_pos == -3) { g_raid_change_disk_state(disk, G_RAID_DISK_S_SPARE); return (1); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } } oldpd = (struct g_raid_md_sii_perdisk *)olddisk->d_md_data; disk_pos = oldpd->pd_disk_pos; resurrection = 1; } if (olddisk == NULL) { /* Find placeholder by position. */ olddisk = g_raid_md_sii_get_disk(sc, disk_pos); if (olddisk == NULL) panic("No disk at position %d!", disk_pos); if (olddisk->d_state != G_RAID_DISK_S_OFFLINE) { G_RAID_DEBUG1(1, sc, "More than one disk for pos %d", disk_pos); g_raid_change_disk_state(disk, G_RAID_DISK_S_STALE); return (0); } oldpd = (struct g_raid_md_sii_perdisk *)olddisk->d_md_data; } /* Replace failed disk or placeholder with new disk. */ TAILQ_FOREACH_SAFE(sd, &olddisk->d_subdisks, sd_next, tmpsd) { TAILQ_REMOVE(&olddisk->d_subdisks, sd, sd_next); TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); sd->sd_disk = disk; } oldpd->pd_disk_pos = -2; pd->pd_disk_pos = disk_pos; /* If it was placeholder -- destroy it. */ if (olddisk->d_state == G_RAID_DISK_S_OFFLINE) { g_raid_destroy_disk(olddisk); } else { /* Otherwise, make it STALE_FAILED. */ g_raid_change_disk_state(olddisk, G_RAID_DISK_S_STALE_FAILED); } /* Welcome the new disk. */ if (resurrection) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else if (pd->pd_meta->disk_status == SII_S_CURRENT || pd->pd_meta->disk_status == SII_S_REBUILD) g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); else g_raid_change_disk_state(disk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { /* * Different disks may have different sizes, * in concat mode. Update from real disk size. */ if (meta->type == SII_T_CONCAT || meta->type == SII_T_JBOD) sd->sd_size = pd->pd_disk_size - 0x800 * 512; if (resurrection) { /* New or ex-spare disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NEW); } else if (pd->pd_meta->disk_status == SII_S_REBUILD) { /* Rebuilding disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_REBUILD); if (pd->pd_meta->generation == meta->generation) sd->sd_rebuild_pos = pd->pd_meta->rebuild_lba * 512; else sd->sd_rebuild_pos = 0; } else if (pd->pd_meta->disk_status == SII_S_CURRENT) { if (pd->pd_meta->raid_status == SII_S_ONLINE || pd->pd_meta->generation != meta->generation) { /* Dirty or resyncing disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_STALE); } else { /* Up to date disk. */ g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); } } else { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); } g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } /* Update status of our need for spare. */ if (mdi->mdio_started) { mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); } return (resurrection); } static void g_disk_md_sii_retaste(void *arg, int pending) { G_RAID_DEBUG(1, "Array is not complete, trying to retaste."); g_retaste(&g_raid_class); free(arg, M_MD_SII); } static void g_raid_md_sii_refill(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct g_raid_disk *disk; struct task *task; int update, na; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; update = 0; do { /* Make sure we miss anything. */ na = g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE); if (na == mdi->mdio_total_disks) break; G_RAID_DEBUG1(1, md->mdo_softc, "Array is not complete (%d of %d), " "trying to refill.", na, mdi->mdio_total_disks); /* Try to get use some of STALE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_STALE) { update += g_raid_md_sii_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } if (disk != NULL) continue; /* Try to get use some of SPARE disks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state == G_RAID_DISK_S_SPARE) { update += g_raid_md_sii_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_ACTIVE) break; } } } while (disk != NULL); /* Write new metadata if we changed something. */ if (update) g_raid_md_write_sii(md, NULL, NULL, NULL); /* Update status of our need for spare. */ mdi->mdio_incomplete = (g_raid_ndisks(sc, G_RAID_DISK_S_ACTIVE) < mdi->mdio_total_disks); /* Request retaste hoping to find spare. */ if (mdi->mdio_incomplete) { task = malloc(sizeof(struct task), M_MD_SII, M_WAITOK | M_ZERO); TASK_INIT(task, 0, g_disk_md_sii_retaste, task); taskqueue_enqueue(taskqueue_swi, task); } } static void g_raid_md_sii_start(struct g_raid_softc *sc) { struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; struct sii_raid_conf *meta; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk, *best; off_t size; int j, disk_pos; uint32_t gendiff, bestgendiff; char buf[17]; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; meta = mdi->mdio_meta; /* Create volumes and subdisks. */ sii_meta_get_name(meta, buf); vol = g_raid_create_volume(sc, buf, -1); vol->v_mediasize = (off_t)meta->total_sectors * 512; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_NONE; if (meta->type == SII_T_RAID0) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID0; size = vol->v_mediasize / mdi->mdio_total_disks; } else if (meta->type == SII_T_RAID1) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1; size = vol->v_mediasize; } else if (meta->type == SII_T_RAID01) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID1E; size = vol->v_mediasize / (mdi->mdio_total_disks / 2); } else if (meta->type == SII_T_CONCAT) { if (mdi->mdio_total_disks == 1) vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; else vol->v_raid_level = G_RAID_VOLUME_RL_CONCAT; size = 0; } else if (meta->type == SII_T_RAID5) { vol->v_raid_level = G_RAID_VOLUME_RL_RAID5; vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_R5LS; size = vol->v_mediasize / (mdi->mdio_total_disks - 1); } else if (meta->type == SII_T_JBOD) { vol->v_raid_level = G_RAID_VOLUME_RL_SINGLE; size = 0; } else { vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; size = 0; } vol->v_strip_size = meta->strip_sectors * 512; //ZZZ vol->v_disks_count = mdi->mdio_total_disks; vol->v_sectorsize = 512; //ZZZ for (j = 0; j < vol->v_disks_count; j++) { sd = &vol->v_subdisks[j]; sd->sd_offset = 0; sd->sd_size = size; } g_raid_start_volume(vol); /* Create disk placeholders to store data for later writing. */ for (disk_pos = 0; disk_pos < mdi->mdio_total_disks; disk_pos++) { pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_disk_pos = disk_pos; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_state = G_RAID_DISK_S_OFFLINE; sd = &vol->v_subdisks[disk_pos]; sd->sd_disk = disk; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); } /* * Make all disks found till the moment take their places * in order of their generation numbers. */ do { best = NULL; bestgendiff = 0xffffffff; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_state != G_RAID_DISK_S_NONE) continue; pd = disk->d_md_data; if (pd->pd_meta == NULL) gendiff = 0xfffffffe; else gendiff = meta->generation - pd->pd_meta->generation; if (gendiff < bestgendiff) { best = disk; bestgendiff = gendiff; } } if (best != NULL) g_raid_md_sii_start_disk(best); } while (best != NULL); mdi->mdio_started = 1; G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_sii(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_sii_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, sc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } static void g_raid_md_sii_new_disk(struct g_raid_disk *disk) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; struct sii_raid_conf *pdmeta; struct g_raid_md_sii_perdisk *pd; sc = disk->d_softc; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; pdmeta = pd->pd_meta; if (mdi->mdio_started) { if (g_raid_md_sii_start_disk(disk)) g_raid_md_write_sii(md, NULL, NULL, NULL); } else { if (mdi->mdio_meta == NULL || ((int32_t)(pdmeta->generation - mdi->mdio_generation)) > 0) { G_RAID_DEBUG1(1, sc, "Newer disk"); if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_SII); mdi->mdio_meta = sii_meta_copy(pdmeta); mdi->mdio_generation = mdi->mdio_meta->generation; mdi->mdio_total_disks = sii_meta_total_disks(pdmeta); mdi->mdio_disks_present = 1; } else if (pdmeta->generation == mdi->mdio_generation) { mdi->mdio_disks_present++; G_RAID_DEBUG1(1, sc, "Matching disk (%d of %d up)", mdi->mdio_disks_present, mdi->mdio_total_disks); } else { G_RAID_DEBUG1(1, sc, "Older disk"); } /* If we collected all needed disks - start array. */ if (mdi->mdio_disks_present == mdi->mdio_total_disks) g_raid_md_sii_start(sc); } } static void g_raid_sii_go(void *arg) { struct g_raid_softc *sc; struct g_raid_md_object *md; struct g_raid_md_sii_object *mdi; sc = arg; md = sc->sc_md; mdi = (struct g_raid_md_sii_object *)md; if (!mdi->mdio_started) { G_RAID_DEBUG1(0, sc, "Force array start due to timeout."); g_raid_event_send(sc, G_RAID_NODE_E_START, 0); } } static int g_raid_md_create_sii(struct g_raid_md_object *md, struct g_class *mp, struct g_geom **gp) { struct g_raid_softc *sc; struct g_raid_md_sii_object *mdi; char name[32]; mdi = (struct g_raid_md_sii_object *)md; mdi->mdio_timestamp[5] = arc4random(); mdi->mdio_timestamp[4] = arc4random(); mdi->mdio_timestamp[3] = arc4random(); mdi->mdio_timestamp[2] = arc4random(); mdi->mdio_timestamp[1] = arc4random(); mdi->mdio_timestamp[0] = arc4random(); mdi->mdio_location = arc4random(); mdi->mdio_generation = 0; snprintf(name, sizeof(name), "SiI-%02x%02x%02x%02x%02x%02x", mdi->mdio_timestamp[5], mdi->mdio_timestamp[4], mdi->mdio_timestamp[3], mdi->mdio_timestamp[2], mdi->mdio_timestamp[1], mdi->mdio_timestamp[0]); sc = g_raid_create_node(mp, name, md); if (sc == NULL) return (G_RAID_MD_TASTE_FAIL); md->mdo_softc = sc; *gp = sc->sc_geom; return (G_RAID_MD_TASTE_NEW); } static int g_raid_md_taste_sii(struct g_raid_md_object *md, struct g_class *mp, struct g_consumer *cp, struct g_geom **gp) { struct g_consumer *rcp; struct g_provider *pp; struct g_raid_md_sii_object *mdi, *mdi1; struct g_raid_softc *sc; struct g_raid_disk *disk; struct sii_raid_conf *meta; struct g_raid_md_sii_perdisk *pd; struct g_geom *geom; int disk_pos, result, spare, len; char name[32]; uint16_t vendor; G_RAID_DEBUG(1, "Tasting SiI on %s", cp->provider->name); mdi = (struct g_raid_md_sii_object *)md; pp = cp->provider; /* Read metadata from device. */ meta = NULL; g_topology_unlock(); vendor = 0xffff; len = sizeof(vendor); if (pp->geom->rank == 1) g_io_getattr("GEOM::hba_vendor", cp, &len, &vendor); meta = sii_meta_read(cp); g_topology_lock(); if (meta == NULL) { if (g_raid_aggressive_spare) { if (vendor == 0x1095) { G_RAID_DEBUG(1, "No SiI metadata, forcing spare."); spare = 2; goto search; } else { G_RAID_DEBUG(1, "SiI vendor mismatch 0x%04x != 0x1095", vendor); } } return (G_RAID_MD_TASTE_FAIL); } /* Check this disk position in obtained metadata. */ disk_pos = sii_meta_disk_pos(meta, meta); if (disk_pos == -1) { G_RAID_DEBUG(1, "SiI disk position not found"); goto fail1; } /* Metadata valid. Print it. */ g_raid_md_sii_print(meta); G_RAID_DEBUG(1, "SiI disk position %d", disk_pos); spare = (meta->type == SII_T_SPARE) ? 1 : 0; search: /* Search for matching node. */ sc = NULL; mdi1 = NULL; LIST_FOREACH(geom, &mp->geom, geom) { sc = geom->softc; if (sc == NULL) continue; if (sc->sc_stopping != 0) continue; if (sc->sc_md->mdo_class != md->mdo_class) continue; mdi1 = (struct g_raid_md_sii_object *)sc->sc_md; if (spare) { if (mdi1->mdio_incomplete) break; } else { if (mdi1->mdio_location == meta->raid_location && memcmp(&mdi1->mdio_timestamp, &meta->timestamp, 6) == 0) break; } } /* Found matching node. */ if (geom != NULL) { G_RAID_DEBUG(1, "Found matching array %s", sc->sc_name); result = G_RAID_MD_TASTE_EXISTING; } else if (spare) { /* Not found needy node -- left for later. */ G_RAID_DEBUG(1, "Spare is not needed at this time"); goto fail1; } else { /* Not found matching node -- create one. */ result = G_RAID_MD_TASTE_NEW; memcpy(&mdi->mdio_timestamp, &meta->timestamp, 6); mdi->mdio_location = meta->raid_location; snprintf(name, sizeof(name), "SiI-%02x%02x%02x%02x%02x%02x", mdi->mdio_timestamp[5], mdi->mdio_timestamp[4], mdi->mdio_timestamp[3], mdi->mdio_timestamp[2], mdi->mdio_timestamp[1], mdi->mdio_timestamp[0]); sc = g_raid_create_node(mp, name, md); md->mdo_softc = sc; geom = sc->sc_geom; callout_init(&mdi->mdio_start_co, 1); callout_reset(&mdi->mdio_start_co, g_raid_start_timeout * hz, g_raid_sii_go, sc); mdi->mdio_rootmount = root_mount_hold("GRAID-SiI"); G_RAID_DEBUG1(1, sc, "root_mount_hold %p", mdi->mdio_rootmount); } /* There is no return after this point, so we close passed consumer. */ g_access(cp, -1, 0, 0); rcp = g_new_consumer(geom); rcp->flags |= G_CF_DIRECT_RECEIVE; g_attach(rcp, pp); if (g_access(rcp, 1, 1, 1) != 0) ; //goto fail1; g_topology_unlock(); sx_xlock(&sc->sc_lock); pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_meta = meta; if (spare == 2) { pd->pd_disk_pos = -3; } else { pd->pd_disk_pos = -1; } pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = rcp; rcp->private = disk; g_raid_get_disk_info(disk); g_raid_md_sii_new_disk(disk); sx_xunlock(&sc->sc_lock); g_topology_lock(); *gp = geom; return (result); fail1: free(meta, M_MD_SII); return (G_RAID_MD_TASTE_FAIL); } static int g_raid_md_event_sii(struct g_raid_md_object *md, struct g_raid_disk *disk, u_int event) { struct g_raid_softc *sc; struct g_raid_subdisk *sd; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; sc = md->mdo_softc; mdi = (struct g_raid_md_sii_object *)md; if (disk == NULL) { switch (event) { case G_RAID_NODE_E_START: if (!mdi->mdio_started) g_raid_md_sii_start(sc); return (0); } return (-1); } pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; switch (event) { case G_RAID_DISK_E_DISCONNECTED: /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); if (disk->d_consumer) { g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; } TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } /* Write updated metadata to all disks. */ g_raid_md_write_sii(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_sii_refill(sc); return (0); } return (-2); } static int g_raid_md_ctl_sii(struct g_raid_md_object *md, struct gctl_req *req) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; struct g_consumer *cp; struct g_provider *pp; char arg[16]; const char *verb, *volname, *levelname, *diskname; int *nargs, *force; off_t size, sectorsize, strip; intmax_t *sizearg, *striparg; int numdisks, i, len, level, qual, update; int error; sc = md->mdo_softc; mdi = (struct g_raid_md_sii_object *)md; verb = gctl_get_param(req, "verb", NULL); nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs)); error = 0; if (strcmp(verb, "label") == 0) { if (*nargs < 4) { gctl_error(req, "Invalid number of arguments."); return (-1); } volname = gctl_get_asciiparam(req, "arg1"); if (volname == NULL) { gctl_error(req, "No volume name."); return (-2); } levelname = gctl_get_asciiparam(req, "arg2"); if (levelname == NULL) { gctl_error(req, "No RAID level."); return (-3); } if (strcasecmp(levelname, "RAID5") == 0) levelname = "RAID5-LS"; if (g_raid_volume_str2level(levelname, &level, &qual)) { gctl_error(req, "Unknown RAID level '%s'.", levelname); return (-4); } numdisks = *nargs - 3; force = gctl_get_paraml(req, "force", sizeof(*force)); if (!g_raid_md_sii_supported(level, qual, numdisks, force ? *force : 0)) { gctl_error(req, "Unsupported RAID level " "(0x%02x/0x%02x), or number of disks (%d).", level, qual, numdisks); return (-5); } /* Search for disks, connect them and probe. */ size = 0x7fffffffffffffffllu; sectorsize = 0; for (i = 0; i < numdisks; i++) { snprintf(arg, sizeof(arg), "arg%d", i + 3); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -6; break; } if (strcmp(diskname, "NONE") == 0) { cp = NULL; pp = NULL; } else { g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open '%s'.", diskname); g_topology_unlock(); error = -7; break; } pp = cp->provider; } pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_disk_pos = i; disk = g_raid_create_disk(sc); disk->d_md_data = (void *)pd; disk->d_consumer = cp; if (cp == NULL) continue; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); pd->pd_disk_size = pp->mediasize; if (size > pp->mediasize) size = pp->mediasize; if (sectorsize < pp->sectorsize) sectorsize = pp->sectorsize; } if (error != 0) return (error); if (sectorsize <= 0) { gctl_error(req, "Can't get sector size."); return (-8); } /* Reserve space for metadata. */ size -= 0x800 * sectorsize; /* Handle size argument. */ len = sizeof(*sizearg); sizearg = gctl_get_param(req, "size", &len); if (sizearg != NULL && len == sizeof(*sizearg) && *sizearg > 0) { if (*sizearg > size) { gctl_error(req, "Size too big %lld > %lld.", (long long)*sizearg, (long long)size); return (-9); } size = *sizearg; } /* Handle strip argument. */ strip = 131072; len = sizeof(*striparg); striparg = gctl_get_param(req, "strip", &len); if (striparg != NULL && len == sizeof(*striparg) && *striparg > 0) { if (*striparg < sectorsize) { gctl_error(req, "Strip size too small."); return (-10); } if (*striparg % sectorsize != 0) { gctl_error(req, "Incorrect strip size."); return (-11); } if (strip > 65535 * sectorsize) { gctl_error(req, "Strip size too big."); return (-12); } strip = *striparg; } /* Round size down to strip or sector. */ if (level == G_RAID_VOLUME_RL_RAID1) size -= (size % sectorsize); else if (level == G_RAID_VOLUME_RL_RAID1E && (numdisks & 1) != 0) size -= (size % (2 * strip)); else size -= (size % strip); if (size <= 0) { gctl_error(req, "Size too small."); return (-13); } if (size > 0xffffffffffffllu * sectorsize) { gctl_error(req, "Size too big."); return (-14); } /* We have all we need, create things: volume, ... */ mdi->mdio_total_disks = numdisks; mdi->mdio_started = 1; vol = g_raid_create_volume(sc, volname, -1); vol->v_md_data = (void *)(intptr_t)0; vol->v_raid_level = level; vol->v_raid_level_qualifier = qual; vol->v_strip_size = strip; vol->v_disks_count = numdisks; if (level == G_RAID_VOLUME_RL_RAID0 || level == G_RAID_VOLUME_RL_CONCAT || level == G_RAID_VOLUME_RL_SINGLE) vol->v_mediasize = size * numdisks; else if (level == G_RAID_VOLUME_RL_RAID1) vol->v_mediasize = size; else if (level == G_RAID_VOLUME_RL_RAID5) vol->v_mediasize = size * (numdisks - 1); else { /* RAID1E */ vol->v_mediasize = ((size * numdisks) / strip / 2) * strip; } vol->v_sectorsize = sectorsize; g_raid_start_volume(vol); /* , and subdisks. */ TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; sd = &vol->v_subdisks[pd->pd_disk_pos]; sd->sd_disk = disk; sd->sd_offset = 0; sd->sd_size = size; TAILQ_INSERT_TAIL(&disk->d_subdisks, sd, sd_next); if (sd->sd_disk->d_consumer != NULL) { g_raid_change_disk_state(disk, G_RAID_DISK_S_ACTIVE); g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE); g_raid_event_send(sd, G_RAID_SUBDISK_E_NEW, G_RAID_EVENT_SUBDISK); } else { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); } } /* Write metadata based on created entities. */ G_RAID_DEBUG1(0, sc, "Array started."); g_raid_md_write_sii(md, NULL, NULL, NULL); /* Pickup any STALE/SPARE disks to refill array if needed. */ g_raid_md_sii_refill(sc); g_raid_event_send(vol, G_RAID_VOLUME_E_START, G_RAID_EVENT_VOLUME); return (0); } if (strcmp(verb, "delete") == 0) { /* Check if some volume is still open. */ force = gctl_get_paraml(req, "force", sizeof(*force)); if (force != NULL && *force == 0 && g_raid_nopens(sc) != 0) { gctl_error(req, "Some volume is still open."); return (-4); } TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer) sii_meta_erase(disk->d_consumer); } g_raid_destroy_node(sc, 0); return (0); } if (strcmp(verb, "remove") == 0 || strcmp(verb, "fail") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } for (i = 1; i < *nargs; i++) { snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -2; break; } if (strncmp(diskname, "/dev/", 5) == 0) diskname += 5; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL && strcmp(disk->d_consumer->provider->name, diskname) == 0) break; } if (disk == NULL) { gctl_error(req, "Disk '%s' not found.", diskname); error = -3; break; } if (strcmp(verb, "fail") == 0) { g_raid_md_fail_disk_sii(md, NULL, disk); continue; } pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; /* Erase metadata on deleting disk. */ sii_meta_erase(disk->d_consumer); /* If disk was assigned, just update statuses. */ if (pd->pd_disk_pos >= 0) { g_raid_change_disk_state(disk, G_RAID_DISK_S_OFFLINE); g_raid_kill_consumer(sc, disk->d_consumer); disk->d_consumer = NULL; TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, G_RAID_EVENT_SUBDISK); } } else { /* Otherwise -- delete. */ g_raid_change_disk_state(disk, G_RAID_DISK_S_NONE); g_raid_destroy_disk(disk); } } /* Write updated metadata to remaining disks. */ g_raid_md_write_sii(md, NULL, NULL, NULL); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_sii_refill(sc); return (error); } if (strcmp(verb, "insert") == 0) { if (*nargs < 2) { gctl_error(req, "Invalid number of arguments."); return (-1); } update = 0; for (i = 1; i < *nargs; i++) { /* Get disk name. */ snprintf(arg, sizeof(arg), "arg%d", i); diskname = gctl_get_asciiparam(req, arg); if (diskname == NULL) { gctl_error(req, "No disk name (%s).", arg); error = -3; break; } /* Try to find provider with specified name. */ g_topology_lock(); cp = g_raid_open_consumer(sc, diskname); if (cp == NULL) { gctl_error(req, "Can't open disk '%s'.", diskname); g_topology_unlock(); error = -4; break; } pp = cp->provider; pd = malloc(sizeof(*pd), M_MD_SII, M_WAITOK | M_ZERO); pd->pd_disk_pos = -3; pd->pd_disk_size = pp->mediasize; disk = g_raid_create_disk(sc); disk->d_consumer = cp; disk->d_md_data = (void *)pd; cp->private = disk; g_topology_unlock(); g_raid_get_disk_info(disk); /* Welcome the "new" disk. */ update += g_raid_md_sii_start_disk(disk); if (disk->d_state == G_RAID_DISK_S_SPARE) { sii_meta_write_spare(cp); g_raid_destroy_disk(disk); } else if (disk->d_state != G_RAID_DISK_S_ACTIVE) { gctl_error(req, "Disk '%s' doesn't fit.", diskname); g_raid_destroy_disk(disk); error = -8; break; } } /* Write new metadata if we changed something. */ if (update) g_raid_md_write_sii(md, NULL, NULL, NULL); return (error); } gctl_error(req, "Command '%s' is not supported.", verb); return (-100); } static int g_raid_md_write_sii(struct g_raid_md_object *md, struct g_raid_volume *tvol, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_volume *vol; struct g_raid_subdisk *sd; struct g_raid_disk *disk; struct g_raid_md_sii_object *mdi; struct g_raid_md_sii_perdisk *pd; struct sii_raid_conf *meta; u_int i; sc = md->mdo_softc; mdi = (struct g_raid_md_sii_object *)md; if (sc->sc_stopping == G_RAID_DESTROY_HARD) return (0); /* Bump generation. Newly written metadata may differ from previous. */ mdi->mdio_generation++; /* There is only one volume. */ vol = TAILQ_FIRST(&sc->sc_volumes); /* Fill global fields. */ meta = malloc(sizeof(*meta), M_MD_SII, M_WAITOK | M_ZERO); if (mdi->mdio_meta) memcpy(meta, mdi->mdio_meta, sizeof(*meta)); meta->total_sectors = vol->v_mediasize / vol->v_sectorsize; meta->vendor_id = 0x1095; meta->version_minor = 0; meta->version_major = 2; memcpy(&meta->timestamp, &mdi->mdio_timestamp, 6); meta->strip_sectors = vol->v_strip_size / vol->v_sectorsize; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID0) { meta->type = SII_T_RAID0; meta->raid0_disks = vol->v_disks_count; meta->raid1_disks = 0xff; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) { meta->type = SII_T_RAID1; meta->raid0_disks = 0xff; meta->raid1_disks = vol->v_disks_count; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { meta->type = SII_T_RAID01; meta->raid0_disks = vol->v_disks_count / 2; meta->raid1_disks = 2; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT || vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE) { meta->type = SII_T_JBOD; meta->raid0_disks = vol->v_disks_count; meta->raid1_disks = 0xff; } else { meta->type = SII_T_RAID5; meta->raid0_disks = vol->v_disks_count; meta->raid1_disks = 0xff; } meta->generation = mdi->mdio_generation; meta->raid_status = vol->v_dirty ? SII_S_ONLINE : SII_S_AVAILABLE; for (i = 0; i < vol->v_disks_count; i++) { sd = &vol->v_subdisks[i]; if (sd->sd_state == G_RAID_SUBDISK_S_STALE || sd->sd_state == G_RAID_SUBDISK_S_RESYNC) meta->raid_status = SII_S_ONLINE; } meta->raid_location = mdi->mdio_location; sii_meta_put_name(meta, vol->v_name); /* We are done. Print meta data and store them to disks. */ if (mdi->mdio_meta != NULL) free(mdi->mdio_meta, M_MD_SII); mdi->mdio_meta = meta; TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; if (disk->d_state != G_RAID_DISK_S_ACTIVE) continue; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_SII); pd->pd_meta = NULL; } pd->pd_meta = sii_meta_copy(meta); if ((sd = TAILQ_FIRST(&disk->d_subdisks)) != NULL) { if (sd->sd_state < G_RAID_SUBDISK_S_NEW) pd->pd_meta->disk_status = SII_S_DROPPED; else if (sd->sd_state < G_RAID_SUBDISK_S_STALE) { pd->pd_meta->disk_status = SII_S_REBUILD; pd->pd_meta->rebuild_lba = sd->sd_rebuild_pos / vol->v_sectorsize; } else pd->pd_meta->disk_status = SII_S_CURRENT; if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1) { pd->pd_meta->disk_number = sd->sd_pos; pd->pd_meta->raid0_ident = 0xff; pd->pd_meta->raid1_ident = 0; } else if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1E) { pd->pd_meta->disk_number = sd->sd_pos / meta->raid1_disks; pd->pd_meta->raid0_ident = sd->sd_pos % meta->raid1_disks; pd->pd_meta->raid1_ident = sd->sd_pos / meta->raid1_disks; } else { pd->pd_meta->disk_number = sd->sd_pos; pd->pd_meta->raid0_ident = 0; pd->pd_meta->raid1_ident = 0xff; } } G_RAID_DEBUG(1, "Writing SiI metadata to %s", g_raid_get_diskname(disk)); g_raid_md_sii_print(pd->pd_meta); sii_meta_write(disk->d_consumer, pd->pd_meta); } return (0); } static int g_raid_md_fail_disk_sii(struct g_raid_md_object *md, struct g_raid_subdisk *tsd, struct g_raid_disk *tdisk) { struct g_raid_softc *sc; struct g_raid_md_sii_perdisk *pd; struct g_raid_subdisk *sd; sc = md->mdo_softc; pd = (struct g_raid_md_sii_perdisk *)tdisk->d_md_data; /* We can't fail disk that is not a part of array now. */ if (pd->pd_disk_pos < 0) return (-1); /* * Mark disk as failed in metadata and try to write that metadata * to the disk itself to prevent it's later resurrection as STALE. */ if (tdisk->d_consumer) { if (pd->pd_meta) { pd->pd_meta->disk_status = SII_S_REMOVED; sii_meta_write(tdisk->d_consumer, pd->pd_meta); } else sii_meta_erase(tdisk->d_consumer); } /* Change states. */ g_raid_change_disk_state(tdisk, G_RAID_DISK_S_FAILED); TAILQ_FOREACH(sd, &tdisk->d_subdisks, sd_next) { g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_FAILED); g_raid_event_send(sd, G_RAID_SUBDISK_E_FAILED, G_RAID_EVENT_SUBDISK); } /* Write updated metadata to remaining disks. */ g_raid_md_write_sii(md, NULL, NULL, tdisk); /* Check if anything left except placeholders. */ if (g_raid_ndisks(sc, -1) == g_raid_ndisks(sc, G_RAID_DISK_S_OFFLINE)) g_raid_destroy_node(sc, 0); else g_raid_md_sii_refill(sc); return (0); } static int g_raid_md_free_disk_sii(struct g_raid_md_object *md, struct g_raid_disk *disk) { struct g_raid_md_sii_perdisk *pd; pd = (struct g_raid_md_sii_perdisk *)disk->d_md_data; if (pd->pd_meta != NULL) { free(pd->pd_meta, M_MD_SII); pd->pd_meta = NULL; } free(pd, M_MD_SII); disk->d_md_data = NULL; return (0); } static int g_raid_md_free_sii(struct g_raid_md_object *md) { struct g_raid_md_sii_object *mdi; mdi = (struct g_raid_md_sii_object *)md; if (!mdi->mdio_started) { mdi->mdio_started = 0; callout_stop(&mdi->mdio_start_co); G_RAID_DEBUG1(1, md->mdo_softc, "root_mount_rel %p", mdi->mdio_rootmount); root_mount_rel(mdi->mdio_rootmount); mdi->mdio_rootmount = NULL; } if (mdi->mdio_meta != NULL) { free(mdi->mdio_meta, M_MD_SII); mdi->mdio_meta = NULL; } return (0); } G_RAID_MD_DECLARE(sii, "SiI"); Index: head/sys/i386/include/cserial.h =================================================================== --- head/sys/i386/include/cserial.h (revision 308456) +++ head/sys/i386/include/cserial.h (revision 308457) @@ -1,519 +1,519 @@ /*- * Ioctl interface to Cronyx serial drivers. * * Copyright (C) 1997-2002 Cronyx Engineering. * Author: Serge Vakulenko, * * Copyright (C) 2001-2005 Cronyx Engineering. * Author: Roman Kurakin, * * Copyright (C) 2004-2005 Cronyx Engineering. * Author: Leo Yuriev, * * This software is distributed with NO WARRANTIES, not even the implied * warranties for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * Authors grant any other persons or organisations permission to use * or modify this software as long as this message is kept with the software, * all derivative works or modified versions. * * Cronyx Id: cserial.h,v 1.4.2.2 2005/11/09 13:01:35 rik Exp $ * $FreeBSD$ */ /* * General channel statistics. */ struct serial_statistics { unsigned long rintr; /* receive interrupts */ unsigned long tintr; /* transmit interrupts */ unsigned long mintr; /* modem interrupts */ unsigned long ibytes; /* input bytes */ unsigned long ipkts; /* input packets */ unsigned long ierrs; /* input errors */ unsigned long obytes; /* output bytes */ unsigned long opkts; /* output packets */ unsigned long oerrs; /* output errors */ }; /* * Statistics for E1/G703 channels. */ struct e1_counters { unsigned long bpv; /* bipolar violations */ unsigned long fse; /* frame sync errors */ unsigned long crce; /* CRC errors */ unsigned long rcrce; /* remote CRC errors (E-bit) */ unsigned long uas; /* unavailable seconds */ unsigned long les; /* line errored seconds */ unsigned long es; /* errored seconds */ unsigned long bes; /* bursty errored seconds */ unsigned long ses; /* severely errored seconds */ unsigned long oofs; /* out of frame seconds */ unsigned long css; /* controlled slip seconds */ unsigned long dm; /* degraded minutes */ }; struct e1_statistics { unsigned long status; /* line status bit mask */ unsigned long cursec; /* seconds in current interval */ unsigned long totsec; /* total seconds elapsed */ struct e1_counters currnt; /* current 15-min interval data */ struct e1_counters total; /* total statistics data */ struct e1_counters interval [48]; /* 12 hour period data */ }; struct e3_statistics { unsigned long status; unsigned long cursec; unsigned long totsec; unsigned long ccv; unsigned long tcv; unsigned long icv[48]; }; #define M_ASYNC 0 /* asynchronous mode */ #define M_HDLC 1 /* bit-sync mode (HDLC) */ #define M_G703 2 #define M_E1 3 /* * Receive error codes. */ #define ER_FRAMING 1 /* framing error */ #define ER_CHECKSUM 2 /* parity/CRC error */ #define ER_BREAK 3 /* break state */ #define ER_OVERFLOW 4 /* receive buffer overflow */ #define ER_OVERRUN 5 /* receive fifo overrun */ #define ER_UNDERRUN 6 /* transmit fifo underrun */ #define ER_SCC_FRAMING 7 /* subchannel framing error */ #define ER_SCC_OVERFLOW 8 /* subchannel receive buffer overflow */ #define ER_SCC_OVERRUN 9 /* subchannel receiver overrun */ #define ER_SCC_UNDERRUN 10 /* subchannel transmiter underrun */ #define ER_BUS 11 /* system bus is too busy (e.g PCI) */ /* * E1 channel status. */ #define E1_NOALARM 0x0001 /* no alarm present */ #define E1_FARLOF 0x0002 /* receiving far loss of framing */ #define E1_CRC4E 0x0004 /* crc4 errors */ #define E1_AIS 0x0008 /* receiving all ones */ #define E1_LOF 0x0020 /* loss of framing */ #define E1_LOS 0x0040 /* loss of signal */ #define E1_AIS16 0x0100 /* receiving all ones in timeslot 16 */ #define E1_FARLOMF 0x0200 /* receiving alarm in timeslot 16 */ #define E1_LOMF 0x0400 /* loss of multiframe sync */ #define E1_TSTREQ 0x0800 /* test code detected */ #define E1_TSTERR 0x1000 /* test error */ #define E3_LOS 0x00000002 /* Lost of synchronization */ #define E3_TXE 0x00000004 /* Transmit error */ /* * Query the mask of all registered channels, max 128. */ #define SERIAL_GETREGISTERED _IOR ('x', 0, char[16]) /* * Attach/detach the protocol to the channel. - * The protocol is given by it's name, char[8]. + * The protocol is given by its name, char[8]. * For example "async", "hdlc", "cisco", "fr", "ppp". */ #define SERIAL_GETPROTO _IOR ('x', 1, char [8]) #define SERIAL_SETPROTO _IOW ('x', 1, char [8]) /* * Query/set the hardware mode for the channel. */ #define SERIAL_GETMODE _IOR ('x', 2, int) #define SERIAL_SETMODE _IOW ('x', 2, int) #define SERIAL_ASYNC 1 #define SERIAL_HDLC 2 #define SERIAL_RAW 3 /* * Get/clear the channel statistics. */ #define SERIAL_GETSTAT _IOR ('x', 3, struct serial_statistics) #define SERIAL_GETESTAT _IOR ('x', 3, struct e1_statistics) #define SERIAL_GETE3STAT _IOR ('x', 3, struct e3_statistics) #define SERIAL_CLRSTAT _IO ('x', 3) /* * Query/set the synchronization mode and baud rate. * If baud==0 then the external clock is used. */ #define SERIAL_GETBAUD _IOR ('x', 4, long) #define SERIAL_SETBAUD _IOW ('x', 4, long) /* * Query/set the internal loopback mode, * useful for debugging purposes. */ #define SERIAL_GETLOOP _IOR ('x', 5, int) #define SERIAL_SETLOOP _IOW ('x', 5, int) /* * Query/set the DPLL mode, commonly used with NRZI * for channels lacking synchro signals. */ #define SERIAL_GETDPLL _IOR ('x', 6, int) #define SERIAL_SETDPLL _IOW ('x', 6, int) /* * Query/set the NRZI encoding (default is NRZ). */ #define SERIAL_GETNRZI _IOR ('x', 7, int) #define SERIAL_SETNRZI _IOW ('x', 7, int) /* * Invert receive and transmit clock. */ #define SERIAL_GETINVCLK _IOR ('x', 8, int) #define SERIAL_SETINVCLK _IOW ('x', 8, int) /* * Query/set the E1/G703 synchronization mode. */ #define SERIAL_GETCLK _IOR ('x', 9, int) #define SERIAL_SETCLK _IOW ('x', 9, int) #define E1CLK_RECOVERY -1 #define E1CLK_INTERNAL 0 #define E1CLK_RECEIVE 1 #define E1CLK_RECEIVE_CHAN0 2 #define E1CLK_RECEIVE_CHAN1 3 #define E1CLK_RECEIVE_CHAN2 4 #define E1CLK_RECEIVE_CHAN3 5 /* * Query/set the E1 timeslot mask. */ #define SERIAL_GETTIMESLOTS _IOR ('x', 10, long) #define SERIAL_SETTIMESLOTS _IOW ('x', 10, long) /* * Query/set the E1 subchannel timeslot mask. */ #define SERIAL_GETSUBCHAN _IOR ('x', 11, long) #define SERIAL_SETSUBCHAN _IOW ('x', 11, long) /* * Query/set the high input sensitivity mode (E1). */ #define SERIAL_GETHIGAIN _IOR ('x', 12, int) #define SERIAL_SETHIGAIN _IOW ('x', 12, int) /* * Query the input signal level in santibells. */ #define SERIAL_GETLEVEL _IOR ('x', 13, int) /* * Get the channel name. */ #define SERIAL_GETNAME _IOR ('x', 14, char [32]) /* * Get version string. */ #define SERIAL_GETVERSIONSTRING _IOR ('x', 15, char [256]) /* * Query/set master channel. */ #define SERIAL_GETMASTER _IOR ('x', 16, char [16]) #define SERIAL_SETMASTER _IOW ('x', 16, char [16]) /* * Query/set keepalive. */ #define SERIAL_GETKEEPALIVE _IOR ('x', 17, int) #define SERIAL_SETKEEPALIVE _IOW ('x', 17, int) /* * Query/set E1 configuration. */ #define SERIAL_GETCFG _IOR ('x', 18, char) #define SERIAL_SETCFG _IOW ('x', 18, char) /* * Query/set debug. */ #define SERIAL_GETDEBUG _IOR ('x', 19, int) #define SERIAL_SETDEBUG _IOW ('x', 19, int) /* * Query/set phony mode (E1). */ #define SERIAL_GETPHONY _IOR ('x', 20, int) #define SERIAL_SETPHONY _IOW ('x', 20, int) /* * Query/set timeslot 16 usage mode (E1). */ #define SERIAL_GETUSE16 _IOR ('x', 21, int) #define SERIAL_SETUSE16 _IOW ('x', 21, int) /* * Query/set crc4 mode (E1). */ #define SERIAL_GETCRC4 _IOR ('x', 22, int) #define SERIAL_SETCRC4 _IOW ('x', 22, int) /* * Query/set the timeout to recover after transmit interrupt loss. * If timo==0 recover will be disabled. */ #define SERIAL_GETTIMO _IOR ('x', 23, long) #define SERIAL_SETTIMO _IOW ('x', 23, long) /* * Query/set port type for old models of Sigma * -1 Fixed or cable select * 0 RS-232 * 1 V35 * 2 RS-449 * 3 E1 (only for Windows 2000) * 4 G.703 (only for Windows 2000) * 5 DATA (only for Windows 2000) * 6 E3 (only for Windows 2000) * 7 T3 (only for Windows 2000) * 8 STS1 (only for Windows 2000) */ #define SERIAL_GETPORT _IOR ('x', 25, int) #define SERIAL_SETPORT _IOW ('x', 25, int) /* * Add the virtual channel DLCI (Frame Relay). */ #define SERIAL_ADDDLCI _IOW ('x', 26, int) /* * Invert receive clock. */ #define SERIAL_GETINVRCLK _IOR ('x', 27, int) #define SERIAL_SETINVRCLK _IOW ('x', 27, int) /* * Invert transmit clock. */ #define SERIAL_GETINVTCLK _IOR ('x', 28, int) #define SERIAL_SETINVTCLK _IOW ('x', 28, int) /* * Unframed E1 mode. */ #define SERIAL_GETUNFRAM _IOR ('x', 29, int) #define SERIAL_SETUNFRAM _IOW ('x', 29, int) /* * E1 monitoring mode. */ #define SERIAL_GETMONITOR _IOR ('x', 30, int) #define SERIAL_SETMONITOR _IOW ('x', 30, int) /* * Interrupt number. */ #define SERIAL_GETIRQ _IOR ('x', 31, int) /* * Reset. */ #define SERIAL_RESET _IO ('x', 32) /* * Hard reset. */ #define SERIAL_HARDRESET _IO ('x', 33) /* * Query cable type. */ #define SERIAL_GETCABLE _IOR ('x', 34, int) /* * Assignment of HDLC ports to E1 channels. */ #define SERIAL_GETDIR _IOR ('x', 35, int) #define SERIAL_SETDIR _IOW ('x', 35, int) struct dxc_table { /* cross-connector parameters */ unsigned char ts [32]; /* timeslot number */ unsigned char link [32]; /* E1 link number */ }; /* * DXC cross-connector settings for E1 channels. */ #define SERIAL_GETDXC _IOR ('x', 36, struct dxc_table) #define SERIAL_SETDXC _IOW ('x', 36, struct dxc_table) /* * Scrambler for G.703. */ #define SERIAL_GETSCRAMBLER _IOR ('x', 37, int) #define SERIAL_SETSCRAMBLER _IOW ('x', 37, int) /* * Length of cable for T3 and STS-1. */ #define SERIAL_GETCABLEN _IOR ('x', 38, int) #define SERIAL_SETCABLEN _IOW ('x', 38, int) /* * Remote loopback for E3, T3 and STS-1. */ #define SERIAL_GETRLOOP _IOR ('x', 39, int) #define SERIAL_SETRLOOP _IOW ('x', 39, int) /* * G.703 line code */ #define SERIAL_GETLCODE _IOR ('x', 40, int) #define SERIAL_SETLCODE _IOW ('x', 40, int) /* * MTU */ #define SERIAL_GETMTU _IOR ('x', 41, int) #define SERIAL_SETMTU _IOW ('x', 41, int) /* * Receive Queue Length */ #define SERIAL_GETRQLEN _IOR ('x', 42, int) #define SERIAL_SETRQLEN _IOW ('x', 42, int) #ifdef __KERNEL__ #ifdef CRONYX_LYSAP # define LYSAP_PEER_ADD _IOWR('x', 101, lysap_peer_config_t) # define LYSAP_PEER_REMOVE _IOW('x', 102, unsigned) # define LYSAP_PEER_INFO _IOWR('x', 103, lysap_peer_info_t) # define LYSAP_PEER_COUNT _IOR('x', 104, unsigned) # define LYSAP_PEER_ENUM _IOWR('x', 105, unsigned) # define LYSAP_PEER_CLEAR _IOW('x', 106, unsigned) # define LYSAP_CHAN_ADD _IOWR('x', 111, lysap_channel_config_t) # define LYSAP_CHAN_REMOVE _IO('x', 112) # define LYSAP_CHAN_INFO _IOR('x', 113, lysap_channel_info_t) # define LYSAP_CHAN_COUNT _IOR('x', 114, unsigned) # define LYSAP_CHAN_ENUM _IOWR('x', 115, unsigned) # define LYSAP_CHAN_CLEAR _IO('x', 116) # include "lysap-linux.h" #else /* CRONYX_LYSAP */ typedef struct _lysap_channel_t lysap_channel_t; typedef struct _lysap_channel_config_t lysap_channel_config_t; typedef struct _LYSAP_DeviceInterfaceConfig LYSAP_DeviceInterfaceConfig; typedef struct _LYSAP_ChannelConfig LYSAP_ChannelConfig; typedef struct _lysap_buf_t lysap_buf_t; #endif /* !CRONYX_LYSAP */ /* * Dynamic binder interface. */ typedef struct _chan_t chan_t; typedef struct _proto_t proto_t; void binder_register_protocol (proto_t *p); void binder_unregister_protocol (proto_t *p); int binder_register_channel (chan_t *h, char *prefix, int minor); void binder_unregister_channel (chan_t *h); /* * Hardware channel driver structure. */ struct sk_buff; struct _chan_t { char name [16]; int mtu; /* max packet size */ int fifosz; /* total hardware i/o buffer size */ int port; /* hardware base i/o port */ int irq; /* hardware interrupt line */ int minor; /* minor number 0..127, assigned by binder */ int debug; /* debug level, 0..2 */ int running; /* running, 0..1 */ struct _proto_t *proto; /* protocol interface data */ void *sw; /* protocol private data */ void *hw; /* hardware layer private data */ /* Interface to protocol */ int (*up) (chan_t *h); void (*down) (chan_t *h); int (*transmit) (chan_t *h, struct sk_buff *skb); void (*set_dtr) (chan_t *h, int val); void (*set_rts) (chan_t *h, int val); int (*query_dtr) (chan_t *h); int (*query_rts) (chan_t *h); int (*query_dsr) (chan_t *h); int (*query_cts) (chan_t *h); int (*query_dcd) (chan_t *h); /* Interface to async protocol */ void (*set_async_param) (chan_t *h, int baud, int bits, int parity, int stop2, int ignpar, int rtscts, int ixon, int ixany, int symstart, int symstop); void (*send_break) (chan_t *h, int msec); void (*send_xon) (chan_t *h); void (*send_xoff) (chan_t *h); void (*start_transmitter) (chan_t *h); void (*stop_transmitter) (chan_t *h); void (*flush_transmit_buffer) (chan_t *h); /* Control interface */ int (*control) (chan_t *h, unsigned int cmd, unsigned long arg); /* LYSAP interface */ struct lysap_t { lysap_channel_t *link; int (*inspect_config)(chan_t *h, lysap_channel_config_t *, LYSAP_DeviceInterfaceConfig *, LYSAP_ChannelConfig *); unsigned long (*probe_freq)(chan_t *h, unsigned long freq); unsigned long (*set_freq)(chan_t *h, unsigned long freq); unsigned (*get_status)(chan_t *h); int (*transmit) (chan_t *h, lysap_buf_t *b); lysap_buf_t* (*alloc_buf) (chan_t *h, unsigned len); int (*set_clock_master)(chan_t *h, int enable); unsigned long (*get_master_freq)(chan_t *h); } lysap; }; /* * Protocol driver structure. */ struct _proto_t { char *name; struct _proto_t *next; /* Interface to channel */ void (*receive) (chan_t *h, struct sk_buff *skb); void (*receive_error) (chan_t *h, int errcode); void (*transmit) (chan_t *h); void (*modem_event) (chan_t *h); /* Interface to binder */ int (*open) (chan_t *h); void (*close) (chan_t *h); int (*read) (chan_t *h, unsigned short flg, char *buf, int len); int (*write) (chan_t *h, unsigned short flg, const char *buf, int len); int (*select) (chan_t *h, int type, void *st, struct file *filp); struct fasync_struct *fasync; /* Control interface */ int (*attach) (chan_t *h); int (*detach) (chan_t *h); int (*control) (chan_t *h, unsigned int cmd, unsigned long arg); /* LYSAP interface */ void (*transmit_error) (chan_t *h, int errcode); void (*lysap_notify_receive) (chan_t *h, lysap_buf_t *b); void (*lysap_notify_transmit) (chan_t *h); lysap_buf_t* (*lysap_get_data)(chan_t *h); }; #endif /* KERNEL */ Index: head/sys/kern/init_main.c =================================================================== --- head/sys/kern/init_main.c (revision 308456) +++ head/sys/kern/init_main.c (revision 308457) @@ -1,858 +1,858 @@ /*- * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_init_path.h" #include "opt_verbose_sysinit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void mi_startup(void); /* Should be elsewhere */ /* Components of the first process -- never freed. */ static struct session session0; static struct pgrp pgrp0; struct proc proc0; struct thread0_storage thread0_st __aligned(16); struct vmspace vmspace0; struct proc *initproc; #ifndef BOOTHOWTO #define BOOTHOWTO 0 #endif int boothowto = BOOTHOWTO; /* initialized so that it can be patched */ SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, "Boot control flags, passed from loader"); #ifndef BOOTVERBOSE #define BOOTVERBOSE 0 #endif int bootverbose = BOOTVERBOSE; SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, "Control the output of verbose kernel messages"); #ifdef INVARIANTS FEATURE(invariants, "Kernel compiled with INVARIANTS, may affect performance"); #endif /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL); /* * The sysinit table itself. Items are checked off as the are run. * If we want to register new sysinit types, add them to newsysinit. */ SET_DECLARE(sysinit_set, struct sysinit); struct sysinit **sysinit, **sysinit_end; struct sysinit **newsysinit, **newsysinit_end; /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running. */ void sysinit_add(struct sysinit **set, struct sysinit **set_end) { struct sysinit **newset; struct sysinit **sipp; struct sysinit **xipp; int count; count = set_end - set; if (newsysinit) count += newsysinit_end - newsysinit; else count += sysinit_end - sysinit; newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); if (newset == NULL) panic("cannot malloc for sysinit"); xipp = newset; if (newsysinit) for (sipp = newsysinit; sipp < newsysinit_end; sipp++) *xipp++ = *sipp; else for (sipp = sysinit; sipp < sysinit_end; sipp++) *xipp++ = *sipp; for (sipp = set; sipp < set_end; sipp++) *xipp++ = *sipp; if (newsysinit) free(newsysinit, M_TEMP); newsysinit = newset; newsysinit_end = newset + count; } #if defined (DDB) && defined(VERBOSE_SYSINIT) static const char * symbol_name(vm_offset_t va, db_strategy_t strategy) { const char *name; c_db_sym_t sym; db_expr_t offset; if (va == 0) return (NULL); sym = db_search_symbol(va, strategy, &offset); if (offset != 0) return (NULL); db_symbol_values(sym, &name, NULL); return (name); } #endif /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads". */ void mi_startup(void) { struct sysinit **sipp; /* system initialization*/ struct sysinit **xipp; /* interior loop of sort*/ struct sysinit *save; /* bubble*/ #if defined(VERBOSE_SYSINIT) int last; int verbose; #endif if (boothowto & RB_VERBOSE) bootverbose++; if (sysinit == NULL) { sysinit = SET_BEGIN(sysinit_set); sysinit_end = SET_LIMIT(sysinit_set); } restart: /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { for (xipp = sipp + 1; xipp < sysinit_end; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } #if defined(VERBOSE_SYSINIT) last = SI_SUB_COPYRIGHT; verbose = 0; #if !defined(DDB) printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n"); #endif #endif /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ if ((*sipp)->subsystem == SI_SUB_DONE) continue; #if defined(VERBOSE_SYSINIT) if ((*sipp)->subsystem > last) { verbose = 1; last = (*sipp)->subsystem; printf("subsystem %x\n", last); } if (verbose) { #if defined(DDB) const char *func, *data; func = symbol_name((vm_offset_t)(*sipp)->func, DB_STGY_PROC); data = symbol_name((vm_offset_t)(*sipp)->udata, DB_STGY_ANY); if (func != NULL && data != NULL) printf(" %s(&%s)... ", func, data); else if (func != NULL) printf(" %s(%p)... ", func, (*sipp)->udata); else #endif printf(" %p(%p)... ", (*sipp)->func, (*sipp)->udata); } #endif /* Call function */ (*((*sipp)->func))((*sipp)->udata); #if defined(VERBOSE_SYSINIT) if (verbose) printf("done.\n"); #endif /* Check off the one we're just done */ (*sipp)->subsystem = SI_SUB_DONE; /* Check if we've installed more sysinit items via KLD */ if (newsysinit != NULL) { if (sysinit != SET_BEGIN(sysinit_set)) free(sysinit, M_TEMP); sysinit = newsysinit; sysinit_end = newsysinit_end; newsysinit = NULL; newsysinit_end = NULL; goto restart; } } mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED); mtx_unlock(&Giant); /* * Now hand over this thread to swapper. */ swapper(); /* NOTREACHED*/ } static void print_caddr_t(void *data) { printf("%s", (char *)data); } static void print_version(void *data __unused) { int len; /* Strip a trailing newline from version. */ len = strlen(version); while (len > 0 && version[len - 1] == '\n') len--; printf("%.*s %s\n", len, version, machine); printf("%s\n", compiler_version); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright); SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, trademark); SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL); #ifdef WITNESS static char wit_warn[] = "WARNING: WITNESS option enabled, expect reduced performance.\n"; SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1, print_caddr_t, wit_warn); SYSINIT(witwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 1, print_caddr_t, wit_warn); #endif #ifdef DIAGNOSTIC static char diag_warn[] = "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n"; SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 2, print_caddr_t, diag_warn); SYSINIT(diagwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 2, print_caddr_t, diag_warn); #endif static int null_fetch_syscall_args(struct thread *td __unused, struct syscall_args *sa __unused) { panic("null_fetch_syscall_args"); } static void null_set_syscall_retval(struct thread *td __unused, int error __unused) { panic("null_set_syscall_retval"); } struct sysentvec null_sysvec = { .sv_size = 0, .sv_table = NULL, .sv_mask = 0, .sv_errsize = 0, .sv_errtbl = NULL, .sv_transtrap = NULL, .sv_fixup = NULL, .sv_sendsig = NULL, .sv_sigcode = NULL, .sv_szsigcode = NULL, .sv_name = "null", .sv_coredump = NULL, .sv_imgact_try = NULL, .sv_minsigstksz = 0, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = USRSTACK, .sv_psstrings = PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = NULL, .sv_setregs = NULL, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = 0, .sv_set_syscall_retval = null_set_syscall_retval, .sv_fetch_syscall_args = null_fetch_syscall_args, .sv_syscallnames = NULL, .sv_schedtail = NULL, .sv_thread_detach = NULL, .sv_trap = NULL, }; /* * The two following SYSINIT's are proc0 specific glue code. I am not * convinced that they can not be safely combined, but their order of * operation has been maintained as the same as the original init_main.c * for right now. */ /* ARGSUSED*/ static void proc0_init(void *dummy __unused) { struct proc *p; struct thread *td; struct ucred *newcred; vm_paddr_t pageablemem; int i; GIANT_REQUIRED; p = &proc0; td = &thread0; /* * Initialize magic number and osrel. */ p->p_magic = P_MAGIC; p->p_osrel = osreldate; /* * Initialize thread and process structures. */ procinit(); /* set up proc zone */ threadinit(); /* set up UMA zones */ /* * Initialise scheduler resources. * Add scheduler specific parts to proc, thread as needed. */ schedinit(); /* scheduler gets its house in order */ /* * Create process 0 (the swapper). */ LIST_INSERT_HEAD(&allproc, p, p_list); LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); p->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); pgrp0.pg_session = &session0; mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF); refcount_init(&session0.s_count, 1); session0.s_leader = p; p->p_sysent = &null_sysvec; p->p_flag = P_SYSTEM | P_INMEM | P_KPROC; p->p_flag2 = 0; p->p_state = PRS_NORMAL; p->p_klist = knlist_alloc(&p->p_mtx); STAILQ_INIT(&p->p_ktr); p->p_nice = NZERO; /* pid_max cannot be greater than PID_MAX */ td->td_tid = PID_MAX + 1; LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash); td->td_state = TDS_RUNNING; td->td_pri_class = PRI_TIMESHARE; td->td_user_pri = PUSER; td->td_base_user_pri = PUSER; td->td_lend_user_pri = PRI_MAX; td->td_priority = PVM; td->td_base_pri = PVM; td->td_oncpu = curcpu; td->td_flags = TDF_INMEM; td->td_pflags = TDP_KTHREAD; td->td_cpuset = cpuset_thread0(); vm_domain_policy_init(&td->td_vm_dom_policy); vm_domain_policy_set(&td->td_vm_dom_policy, VM_POLICY_NONE, -1); vm_domain_policy_init(&p->p_vm_dom_policy); vm_domain_policy_set(&p->p_vm_dom_policy, VM_POLICY_NONE, -1); prison0_init(); p->p_peers = 0; p->p_leader = p; p->p_reaper = p; LIST_INIT(&p->p_reaplist); strncpy(p->p_comm, "kernel", sizeof (p->p_comm)); strncpy(td->td_name, "swapper", sizeof (td->td_name)); callout_init_mtx(&p->p_itcallout, &p->p_mtx, 0); callout_init_mtx(&p->p_limco, &p->p_mtx, 0); callout_init(&td->td_slpcallout, 1); /* Create credentials. */ newcred = crget(); newcred->cr_ngroups = 1; /* group 0 */ newcred->cr_uidinfo = uifind(0); newcred->cr_ruidinfo = uifind(0); newcred->cr_prison = &prison0; newcred->cr_loginclass = loginclass_find("default"); proc_set_cred_init(p, newcred); #ifdef AUDIT audit_cred_kproc0(newcred); #endif #ifdef MAC mac_cred_create_swapper(newcred); #endif /* Create sigacts. */ p->p_sigacts = sigacts_alloc(); /* Initialize signal state for process 0. */ siginit(&proc0); /* Create the file descriptor table. */ p->p_fd = fdinit(NULL, false); p->p_fdtol = NULL; /* Create the limits structures. */ p->p_limit = lim_alloc(); for (i = 0; i < RLIM_NLIMITS; i++) p->p_limit->pl_rlimit[i].rlim_cur = p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY; p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz; p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz; p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz; /* Cast to avoid overflow on i386/PAE. */ pageablemem = ptoa((vm_paddr_t)vm_cnt.v_free_count); p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem; p->p_cpulimit = RLIM_INFINITY; PROC_LOCK(p); thread_cow_get_proc(td, p); PROC_UNLOCK(p); /* Initialize resource accounting structures. */ racct_create(&p->p_racct); p->p_stats = pstats_alloc(); /* Allocate a prototype map so we have something to fork. */ p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; pmap_pinit0(vmspace_pmap(&vmspace0)); /* * proc0 is not expected to enter usermode, so there is no special * handling for sv_minuser here, like is done for exec_new_vmspace(). */ vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0), p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser); /* * Call the init and ctor for the new thread and proc. We wait * to do this until all other structures are fairly sane. */ EVENTHANDLER_INVOKE(process_init, p); EVENTHANDLER_INVOKE(thread_init, td); EVENTHANDLER_INVOKE(process_ctor, p); EVENTHANDLER_INVOKE(thread_ctor, td); /* * Charge root for one process. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0); PROC_LOCK(p); racct_add_force(p, RACCT_NPROC, 1); PROC_UNLOCK(p); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL); /* ARGSUSED*/ static void proc0_post(void *dummy __unused) { struct timespec ts; struct proc *p; struct rusage ru; struct thread *td; /* * Now we can look at the time, having had a chance to verify the * time from the filesystem. Pretend that proc0 started now. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { microuptime(&p->p_stats->p_start); PROC_STATLOCK(p); rufetch(p, &ru); /* Clears thread stats */ PROC_STATUNLOCK(p); p->p_rux.rux_runtime = 0; p->p_rux.rux_uticks = 0; p->p_rux.rux_sticks = 0; p->p_rux.rux_iticks = 0; FOREACH_THREAD_IN_PROC(p, td) { td->td_runtime = 0; } } sx_sunlock(&allproc_lock); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); /* * Give the ``random'' number generator a thump. */ nanotime(&ts); srandom(ts.tv_sec ^ ts.tv_nsec); } SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL); static void random_init(void *dummy __unused) { /* * After CPU has been started we have some randomness on most * platforms via get_cyclecount(). For platforms that don't * we will reseed random(9) in proc0_post() as well. */ srandom(get_cyclecount()); } SYSINIT(random, SI_SUB_RANDOM, SI_ORDER_FIRST, random_init, NULL); /* *************************************************************************** **** **** The following SYSINIT's and glue code should be moved to the **** respective files on a per subsystem basis. **** *************************************************************************** */ /* * List of paths to try when searching for "init". */ static char init_path[MAXPATHLEN] = #ifdef INIT_PATH __XSTRING(INIT_PATH); #else "/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init"; #endif SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, "Path used to search the init process"); /* * Shutdown timeout of init(8). * Unused within kernel, but used to control init(8), hence do not remove. */ #ifndef INIT_SHUTDOWN_TIMEOUT #define INIT_SHUTDOWN_TIMEOUT 120 #endif static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT; SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout, CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). " "Unused within kernel, but used to control init(8)"); /* * Start the initial user process; try exec'ing each pathname in init_path. * The program is invoked with one argument containing the boot flags. */ static void start_init(void *dummy) { vm_offset_t addr; struct execve_args args; int options, error; char *var, *path, *next, *s; char *ucp, **uap, *arg0, *arg1; struct thread *td; struct proc *p; mtx_lock(&Giant); GIANT_REQUIRED; td = curthread; p = td->td_proc; vfs_mountroot(); /* Wipe GELI passphrase from the environment. */ kern_unsetenv("kern.geom.eli.passphrase"); /* * Need just enough stack to hold the faked-up "execve()" arguments. */ addr = p->p_sysent->sv_usrstack - PAGE_SIZE; if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; p->p_vmspace->vm_ssize = 1; if ((var = kern_getenv("init_path")) != NULL) { strlcpy(init_path, var, sizeof(init_path)); freeenv(var); } for (path = init_path; *path != '\0'; path = next) { while (*path == ':') path++; if (*path == '\0') break; for (next = path; *next != '\0' && *next != ':'; next++) /* nothing */ ; if (bootverbose) printf("start_init: trying %.*s\n", (int)(next - path), path); /* * Move out the boot flag argument. */ options = 0; ucp = (char *)p->p_sysent->sv_usrstack; (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { (void)subyte(--ucp, 'f'); options = 1; } #endif #ifdef BOOTCDROM (void)subyte(--ucp, 'C'); options = 1; #endif if (options == 0) (void)subyte(--ucp, '-'); (void)subyte(--ucp, '-'); /* leading hyphen */ arg1 = ucp; /* * Move out the file name (also arg 0). */ (void)subyte(--ucp, 0); for (s = next - 1; s >= path; s--) (void)subyte(--ucp, *s); arg0 = ucp; /* * Move out the arg pointers. */ uap = (char **)rounddown2((intptr_t)ucp, sizeof(intptr_t)); (void)suword((caddr_t)--uap, (long)0); /* terminator */ (void)suword((caddr_t)--uap, (long)(intptr_t)arg1); (void)suword((caddr_t)--uap, (long)(intptr_t)arg0); /* * Point at the arguments. */ args.fname = arg0; args.argv = uap; args.envv = NULL; /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. * * Otherwise, return via fork_trampoline() all the way * to user mode as init! */ if ((error = sys_execve(td, &args)) == 0) { mtx_unlock(&Giant); return; } if (error != ENOENT) printf("exec %.*s: error %d\n", (int)(next - path), path, error); } printf("init: not found in path %s\n", init_path); panic("no init"); } /* - * Like kproc_create(), but runs in it's own address space. + * Like kproc_create(), but runs in its own address space. * We do this early to reserve pid 1. * * Note special case - do not make it runnable yet. Other work * in progress will change this more. */ static void create_init(const void *udata __unused) { struct fork_req fr; struct ucred *newcred, *oldcred; struct thread *td; int error; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFSTOPPED; fr.fr_procp = &initproc; error = fork1(&thread0, &fr); if (error) panic("cannot fork init: %d\n", error); KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1")); /* divorce init's credentials from the kernel's */ newcred = crget(); sx_xlock(&proctree_lock); PROC_LOCK(initproc); initproc->p_flag |= P_SYSTEM | P_INMEM; initproc->p_treeflag |= P_TREE_REAPER; LIST_INSERT_HEAD(&initproc->p_reaplist, &proc0, p_reapsibling); oldcred = initproc->p_ucred; crcopy(newcred, oldcred); #ifdef MAC mac_cred_create_init(newcred); #endif #ifdef AUDIT audit_cred_proc1(newcred); #endif proc_set_cred(initproc, newcred); td = FIRST_THREAD_IN_PROC(initproc); crfree(td->td_ucred); td->td_ucred = crhold(initproc->p_ucred); PROC_UNLOCK(initproc); sx_xunlock(&proctree_lock); crfree(oldcred); cpu_fork_kthread_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL); /* * Make it runnable now. */ static void kick_init(const void *udata __unused) { struct thread *td; td = FIRST_THREAD_IN_PROC(initproc); thread_lock(td); TD_SET_CAN_RUN(td); sched_add(td, SRQ_BORING); thread_unlock(td); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE, kick_init, NULL); Index: head/sys/kern/kern_ctf.c =================================================================== --- head/sys/kern/kern_ctf.c (revision 308456) +++ head/sys/kern/kern_ctf.c (revision 308457) @@ -1,324 +1,324 @@ /*- * Copyright (c) 2008 John Birrell * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Note this file is included by both link_elf.c and link_elf_obj.c. * * The CTF header structure definition can't be used here because it's * (annoyingly) covered by the CDDL. We will just use a few bytes from * it as an integer array where we 'know' what they mean. */ #define CTF_HDR_SIZE 36 #define CTF_HDR_STRTAB_U32 7 #define CTF_HDR_STRLEN_U32 8 #ifdef DDB_CTF static void * z_alloc(void *nil, u_int items, u_int size) { void *ptr; ptr = malloc(items * size, M_TEMP, M_NOWAIT); return ptr; } static void z_free(void *nil, void *ptr) { free(ptr, M_TEMP); } #endif static int link_elf_ctf_get(linker_file_t lf, linker_ctf_t *lc) { #ifdef DDB_CTF Elf_Ehdr *hdr = NULL; Elf_Shdr *shdr = NULL; caddr_t ctftab = NULL; caddr_t raw = NULL; caddr_t shstrtab = NULL; elf_file_t ef = (elf_file_t) lf; int flags; int i; int nbytes; size_t sz; struct nameidata nd; struct thread *td = curthread; uint8_t ctf_hdr[CTF_HDR_SIZE]; #endif int error = 0; if (lf == NULL || lc == NULL) return (EINVAL); /* Set the defaults for no CTF present. That's not a crime! */ bzero(lc, sizeof(*lc)); #ifdef DDB_CTF /* * First check if we've tried to load CTF data previously and the * CTF ELF section wasn't found. We flag that condition by setting * ctfcnt to -1. See below. */ if (ef->ctfcnt < 0) return (EFTYPE); /* Now check if we've already loaded the CTF data.. */ if (ef->ctfcnt > 0) { /* We only need to load once. */ lc->ctftab = ef->ctftab; lc->ctfcnt = ef->ctfcnt; lc->symtab = ef->ddbsymtab; lc->strtab = ef->ddbstrtab; lc->strcnt = ef->ddbstrcnt; lc->nsym = ef->ddbsymcnt; lc->ctfoffp = (uint32_t **) &ef->ctfoff; lc->typoffp = (uint32_t **) &ef->typoff; lc->typlenp = &ef->typlen; return (0); } /* * We need to try reading the CTF data. Flag no CTF data present * by default and if we actually succeed in reading it, we'll * update ctfcnt to the number of bytes read. */ ef->ctfcnt = -1; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, lf->pathname, td); flags = FREAD; error = vn_open(&nd, &flags, 0, NULL); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); /* Allocate memory for the FLF header. */ hdr = malloc(sizeof(*hdr), M_LINKER, M_WAITOK); /* Read the ELF header. */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, hdr, sizeof(*hdr), 0, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, NULL, td)) != 0) goto out; /* Sanity check. */ if (!IS_ELF(*hdr)) { error = ENOEXEC; goto out; } nbytes = hdr->e_shnum * hdr->e_shentsize; if (nbytes == 0 || hdr->e_shoff == 0 || hdr->e_shentsize != sizeof(Elf_Shdr)) { error = ENOEXEC; goto out; } /* Allocate memory for all the section headers */ shdr = malloc(nbytes, M_LINKER, M_WAITOK); /* Read all the section headers */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)shdr, nbytes, hdr->e_shoff, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, NULL, td)) != 0) goto out; /* * We need to search for the CTF section by name, so if the * section names aren't present, then we can't locate the * .SUNW_ctf section containing the CTF data. */ if (hdr->e_shstrndx == 0 || shdr[hdr->e_shstrndx].sh_type != SHT_STRTAB) { printf("%s(%d): module %s e_shstrndx is %d, sh_type is %d\n", __func__, __LINE__, lf->pathname, hdr->e_shstrndx, shdr[hdr->e_shstrndx].sh_type); error = EFTYPE; goto out; } /* Allocate memory to buffer the section header strings. */ shstrtab = malloc(shdr[hdr->e_shstrndx].sh_size, M_LINKER, M_WAITOK); /* Read the section header strings. */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, shstrtab, shdr[hdr->e_shstrndx].sh_size, shdr[hdr->e_shstrndx].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, NULL, td)) != 0) goto out; /* Search for the section containing the CTF data. */ for (i = 0; i < hdr->e_shnum; i++) if (strcmp(".SUNW_ctf", shstrtab + shdr[i].sh_name) == 0) break; /* Check if the CTF section wasn't found. */ if (i >= hdr->e_shnum) { printf("%s(%d): module %s has no .SUNW_ctf section\n", __func__, __LINE__, lf->pathname); error = EFTYPE; goto out; } /* Read the CTF header. */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, ctf_hdr, sizeof(ctf_hdr), shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, NULL, td)) != 0) goto out; /* Check the CTF magic number. (XXX check for big endian!) */ if (ctf_hdr[0] != 0xf1 || ctf_hdr[1] != 0xcf) { printf("%s(%d): module %s has invalid format\n", __func__, __LINE__, lf->pathname); error = EFTYPE; goto out; } /* Check if version 2. */ if (ctf_hdr[2] != 2) { printf("%s(%d): module %s CTF format version is %d " "(2 expected)\n", __func__, __LINE__, lf->pathname, ctf_hdr[2]); error = EFTYPE; goto out; } /* Check if the data is compressed. */ if ((ctf_hdr[3] & 0x1) != 0) { uint32_t *u32 = (uint32_t *) ctf_hdr; /* * The last two fields in the CTF header are the offset * from the end of the header to the start of the string * data and the length of that string data. se this * information to determine the decompressed CTF data * buffer required. */ sz = u32[CTF_HDR_STRTAB_U32] + u32[CTF_HDR_STRLEN_U32] + sizeof(ctf_hdr); /* * Allocate memory for the compressed CTF data, including * the header (which isn't compressed). */ raw = malloc(shdr[i].sh_size, M_LINKER, M_WAITOK); } else { /* * The CTF data is not compressed, so the ELF section * size is the same as the buffer size required. */ sz = shdr[i].sh_size; } /* - * Allocate memory to buffer the CTF data in it's decompressed + * Allocate memory to buffer the CTF data in its decompressed * form. */ ctftab = malloc(sz, M_LINKER, M_WAITOK); /* * Read the CTF data into the raw buffer if compressed, or * directly into the CTF buffer otherwise. */ if ((error = vn_rdwr(UIO_READ, nd.ni_vp, raw == NULL ? ctftab : raw, shdr[i].sh_size, shdr[i].sh_offset, UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED, NULL, td)) != 0) goto out; /* Check if decompression is required. */ if (raw != NULL) { z_stream zs; int ret; /* * The header isn't compressed, so copy that into the * CTF buffer first. */ bcopy(ctf_hdr, ctftab, sizeof(ctf_hdr)); /* Initialise the zlib structure. */ bzero(&zs, sizeof(zs)); zs.zalloc = z_alloc; zs.zfree = z_free; if (inflateInit(&zs) != Z_OK) { error = EIO; goto out; } zs.avail_in = shdr[i].sh_size - sizeof(ctf_hdr); zs.next_in = ((uint8_t *) raw) + sizeof(ctf_hdr); zs.avail_out = sz - sizeof(ctf_hdr); zs.next_out = ((uint8_t *) ctftab) + sizeof(ctf_hdr); ret = inflate(&zs, Z_FINISH); inflateEnd(&zs); if (ret != Z_STREAM_END) { printf("%s(%d): zlib inflate returned %d\n", __func__, __LINE__, ret); error = EIO; goto out; } } /* Got the CTF data! */ ef->ctftab = ctftab; ef->ctfcnt = shdr[i].sh_size; /* We'll retain the memory allocated for the CTF data. */ ctftab = NULL; /* Let the caller use the CTF data read. */ lc->ctftab = ef->ctftab; lc->ctfcnt = ef->ctfcnt; lc->symtab = ef->ddbsymtab; lc->strtab = ef->ddbstrtab; lc->strcnt = ef->ddbstrcnt; lc->nsym = ef->ddbsymcnt; lc->ctfoffp = (uint32_t **) &ef->ctfoff; lc->typoffp = (uint32_t **) &ef->typoff; lc->typlenp = &ef->typlen; out: VOP_UNLOCK(nd.ni_vp, 0); vn_close(nd.ni_vp, FREAD, td->td_ucred, td); if (hdr != NULL) free(hdr, M_LINKER); if (shdr != NULL) free(shdr, M_LINKER); if (shstrtab != NULL) free(shstrtab, M_LINKER); if (ctftab != NULL) free(ctftab, M_LINKER); if (raw != NULL) free(raw, M_LINKER); #else error = EOPNOTSUPP; #endif return (error); } Index: head/sys/kern/subr_firmware.c =================================================================== --- head/sys/kern/subr_firmware.c (revision 308456) +++ head/sys/kern/subr_firmware.c (revision 308457) @@ -1,526 +1,526 @@ /*- * Copyright (c) 2005-2008, Sam Leffler * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Loadable firmware support. See sys/sys/firmware.h and firmware(9) * form more details on the subsystem. * * 'struct firmware' is the user-visible part of the firmware table. * Additional internal information is stored in a 'struct priv_fw' * (currently a static array). A slot is in use if FW_INUSE is true: */ #define FW_INUSE(p) ((p)->file != NULL || (p)->fw.name != NULL) /* * fw.name != NULL when an image is registered; file != NULL for * autoloaded images whose handling has not been completed. * * The state of a slot evolves as follows: * firmware_register --> fw.name = image_name * (autoloaded image) --> file = module reference * firmware_unregister --> fw.name = NULL * (unloadentry complete) --> file = NULL * * In order for the above to work, the 'file' field must remain * unchanged in firmware_unregister(). * * Images residing in the same module are linked to each other * through the 'parent' argument of firmware_register(). * One image (typically, one with the same name as the module to let * the autoloading mechanism work) is considered the parent image for * all other images in the same module. Children affect the refcount * on the parent image preventing improper unloading of the image itself. */ struct priv_fw { int refcnt; /* reference count */ /* * parent entry, see above. Set on firmware_register(), * cleared on firmware_unregister(). */ struct priv_fw *parent; int flags; /* record FIRMWARE_UNLOAD requests */ #define FW_UNLOAD 0x100 /* * 'file' is private info managed by the autoload/unload code. * Set at the end of firmware_get(), cleared only in the * firmware_unload_task, so the latter can depend on its value even * while the lock is not held. */ linker_file_t file; /* module file, if autoloaded */ /* * 'fw' is the externally visible image information. * We do not make it the first field in priv_fw, to avoid the * temptation of casting pointers to each other. * Use PRIV_FW(fw) to get a pointer to the cointainer of fw. * Beware, PRIV_FW does not work for a NULL pointer. */ struct firmware fw; /* externally visible information */ }; /* * PRIV_FW returns the pointer to the container of struct firmware *x. * Cast to intptr_t to override the 'const' attribute of x */ #define PRIV_FW(x) ((struct priv_fw *) \ ((intptr_t)(x) - offsetof(struct priv_fw, fw)) ) /* * At the moment we use a static array as backing store for the registry. * Should we move to a dynamic structure, keep in mind that we cannot * reallocate the array because pointers are held externally. * A list may work, though. */ #define FIRMWARE_MAX 50 static struct priv_fw firmware_table[FIRMWARE_MAX]; /* * Firmware module operations are handled in a separate task as they * might sleep and they require directory context to do i/o. */ static struct taskqueue *firmware_tq; static struct task firmware_unload_task; /* * This mutex protects accesses to the firmware table. */ static struct mtx firmware_mtx; MTX_SYSINIT(firmware, &firmware_mtx, "firmware table", MTX_DEF); /* * Helper function to lookup a name. * As a side effect, it sets the pointer to a free slot, if any. * This way we can concentrate most of the registry scanning in * this function, which makes it easier to replace the registry * with some other data structure. */ static struct priv_fw * lookup(const char *name, struct priv_fw **empty_slot) { struct priv_fw *fp = NULL; struct priv_fw *dummy; int i; if (empty_slot == NULL) empty_slot = &dummy; *empty_slot = NULL; for (i = 0; i < FIRMWARE_MAX; i++) { fp = &firmware_table[i]; if (fp->fw.name != NULL && strcasecmp(name, fp->fw.name) == 0) break; else if (!FW_INUSE(fp)) *empty_slot = fp; } return (i < FIRMWARE_MAX ) ? fp : NULL; } /* * Register a firmware image with the specified name. The * image name must not already be registered. If this is a * subimage then parent refers to a previously registered * image that this should be associated with. */ const struct firmware * firmware_register(const char *imagename, const void *data, size_t datasize, unsigned int version, const struct firmware *parent) { struct priv_fw *match, *frp; char *str; str = strdup(imagename, M_TEMP); mtx_lock(&firmware_mtx); /* * Do a lookup to make sure the name is unique or find a free slot. */ match = lookup(imagename, &frp); if (match != NULL) { mtx_unlock(&firmware_mtx); printf("%s: image %s already registered!\n", __func__, imagename); free(str, M_TEMP); return NULL; } if (frp == NULL) { mtx_unlock(&firmware_mtx); printf("%s: cannot register image %s, firmware table full!\n", __func__, imagename); free(str, M_TEMP); return NULL; } bzero(frp, sizeof(*frp)); /* start from a clean record */ frp->fw.name = str; frp->fw.data = data; frp->fw.datasize = datasize; frp->fw.version = version; if (parent != NULL) frp->parent = PRIV_FW(parent); mtx_unlock(&firmware_mtx); if (bootverbose) printf("firmware: '%s' version %u: %zu bytes loaded at %p\n", imagename, version, datasize, data); return &frp->fw; } /* * Unregister/remove a firmware image. If there are outstanding * references an error is returned and the image is not removed * from the registry. */ int firmware_unregister(const char *imagename) { struct priv_fw *fp; int err; mtx_lock(&firmware_mtx); fp = lookup(imagename, NULL); if (fp == NULL) { /* * It is ok for the lookup to fail; this can happen * when a module is unloaded on last reference and the - * module unload handler unregister's each of it's + * module unload handler unregister's each of its * firmware images. */ err = 0; } else if (fp->refcnt != 0) { /* cannot unregister */ err = EBUSY; } else { linker_file_t x = fp->file; /* save value */ /* * Clear the whole entry with bzero to make sure we * do not forget anything. Then restore 'file' which is * non-null for autoloaded images. */ free((void *) (uintptr_t) fp->fw.name, M_TEMP); bzero(fp, sizeof(struct priv_fw)); fp->file = x; err = 0; } mtx_unlock(&firmware_mtx); return err; } static void loadimage(void *arg, int npending) { struct thread *td = curthread; char *imagename = arg; struct priv_fw *fp; linker_file_t result; int error; /* synchronize with the thread that dispatched us */ mtx_lock(&firmware_mtx); mtx_unlock(&firmware_mtx); if (td->td_proc->p_fd->fd_rdir == NULL) { printf("%s: root not mounted yet, no way to load image\n", imagename); goto done; } error = linker_reference_module(imagename, NULL, &result); if (error != 0) { printf("%s: could not load firmware image, error %d\n", imagename, error); goto done; } mtx_lock(&firmware_mtx); fp = lookup(imagename, NULL); if (fp == NULL || fp->file != NULL) { mtx_unlock(&firmware_mtx); if (fp == NULL) printf("%s: firmware image loaded, " "but did not register\n", imagename); (void) linker_release_module(imagename, NULL, NULL); goto done; } fp->file = result; /* record the module identity */ mtx_unlock(&firmware_mtx); done: wakeup_one(imagename); /* we're done */ } /* * Lookup and potentially load the specified firmware image. * If the firmware is not found in the registry, try to load a kernel * module named as the image name. * If the firmware is located, a reference is returned. The caller must * release this reference for the image to be eligible for removal/unload. */ const struct firmware * firmware_get(const char *imagename) { struct task fwload_task; struct thread *td; struct priv_fw *fp; mtx_lock(&firmware_mtx); fp = lookup(imagename, NULL); if (fp != NULL) goto found; /* * Image not present, try to load the module holding it. */ td = curthread; if (priv_check(td, PRIV_FIRMWARE_LOAD) != 0 || securelevel_gt(td->td_ucred, 0) != 0) { mtx_unlock(&firmware_mtx); printf("%s: insufficient privileges to " "load firmware image %s\n", __func__, imagename); return NULL; } /* * Defer load to a thread with known context. linker_reference_module * may do filesystem i/o which requires root & current dirs, etc. * Also we must not hold any mtx's over this call which is problematic. */ if (!cold) { TASK_INIT(&fwload_task, 0, loadimage, __DECONST(void *, imagename)); taskqueue_enqueue(firmware_tq, &fwload_task); msleep(__DECONST(void *, imagename), &firmware_mtx, 0, "fwload", 0); } /* * After attempting to load the module, see if the image is registered. */ fp = lookup(imagename, NULL); if (fp == NULL) { mtx_unlock(&firmware_mtx); return NULL; } found: /* common exit point on success */ if (fp->refcnt == 0 && fp->parent != NULL) fp->parent->refcnt++; fp->refcnt++; mtx_unlock(&firmware_mtx); return &fp->fw; } /* * Release a reference to a firmware image returned by firmware_get. * The caller may specify, with the FIRMWARE_UNLOAD flag, its desire * to release the resource, but the flag is only advisory. * * If this is the last reference to the firmware image, and this is an * autoloaded module, wake up the firmware_unload_task to figure out * what to do with the associated module. */ void firmware_put(const struct firmware *p, int flags) { struct priv_fw *fp = PRIV_FW(p); mtx_lock(&firmware_mtx); fp->refcnt--; if (fp->refcnt == 0) { if (fp->parent != NULL) fp->parent->refcnt--; if (flags & FIRMWARE_UNLOAD) fp->flags |= FW_UNLOAD; if (fp->file) taskqueue_enqueue(firmware_tq, &firmware_unload_task); } mtx_unlock(&firmware_mtx); } /* * Setup directory state for the firmware_tq thread so we can do i/o. */ static void set_rootvnode(void *arg, int npending) { pwd_ensure_dirs(); free(arg, M_TEMP); } /* * Event handler called on mounting of /; bounce a task * into the task queue thread to setup it's directories. */ static void firmware_mountroot(void *arg) { struct task *setroot_task; setroot_task = malloc(sizeof(struct task), M_TEMP, M_NOWAIT); if (setroot_task != NULL) { TASK_INIT(setroot_task, 0, set_rootvnode, setroot_task); taskqueue_enqueue(firmware_tq, setroot_task); } else printf("%s: no memory for task!\n", __func__); } EVENTHANDLER_DEFINE(mountroot, firmware_mountroot, NULL, 0); /* * The body of the task in charge of unloading autoloaded modules * that are not needed anymore. * Images can be cross-linked so we may need to make multiple passes, * but the time we spend in the loop is bounded because we clear entries * as we touch them. */ static void unloadentry(void *unused1, int unused2) { int limit = FIRMWARE_MAX; int i; /* current cycle */ mtx_lock(&firmware_mtx); /* * Scan the table. limit is set to make sure we make another * full sweep after matching an entry that requires unloading. */ for (i = 0; i < limit; i++) { struct priv_fw *fp; int err; fp = &firmware_table[i % FIRMWARE_MAX]; if (fp->fw.name == NULL || fp->file == NULL || fp->refcnt != 0 || (fp->flags & FW_UNLOAD) == 0) continue; /* * Found an entry. Now: * 1. bump up limit to make sure we make another full round; * 2. clear FW_UNLOAD so we don't try this entry again. * 3. release the lock while trying to unload the module. * 'file' remains set so that the entry cannot be reused * in the meantime (it also means that fp->file will * not change while we release the lock). */ limit = i + FIRMWARE_MAX; /* make another full round */ fp->flags &= ~FW_UNLOAD; /* do not try again */ mtx_unlock(&firmware_mtx); err = linker_release_module(NULL, NULL, fp->file); mtx_lock(&firmware_mtx); /* * We rely on the module to call firmware_unregister() * on unload to actually release the entry. * If err = 0 we can drop our reference as the system * accepted it. Otherwise unloading failed (e.g. the * module itself gave an error) so our reference is * still valid. */ if (err == 0) fp->file = NULL; } mtx_unlock(&firmware_mtx); } /* * Module glue. */ static int firmware_modevent(module_t mod, int type, void *unused) { struct priv_fw *fp; int i, err; switch (type) { case MOD_LOAD: TASK_INIT(&firmware_unload_task, 0, unloadentry, NULL); firmware_tq = taskqueue_create("taskqueue_firmware", M_WAITOK, taskqueue_thread_enqueue, &firmware_tq); /* NB: use our own loop routine that sets up context */ (void) taskqueue_start_threads(&firmware_tq, 1, PWAIT, "firmware taskq"); if (rootvnode != NULL) { /* * Root is already mounted so we won't get an event; * simulate one here. */ firmware_mountroot(NULL); } return 0; case MOD_UNLOAD: /* request all autoloaded modules to be released */ mtx_lock(&firmware_mtx); for (i = 0; i < FIRMWARE_MAX; i++) { fp = &firmware_table[i]; fp->flags |= FW_UNLOAD; } mtx_unlock(&firmware_mtx); taskqueue_enqueue(firmware_tq, &firmware_unload_task); taskqueue_drain(firmware_tq, &firmware_unload_task); err = 0; for (i = 0; i < FIRMWARE_MAX; i++) { fp = &firmware_table[i]; if (fp->fw.name != NULL) { printf("%s: image %p ref %d still active slot %d\n", __func__, fp->fw.name, fp->refcnt, i); err = EINVAL; } } if (err == 0) taskqueue_free(firmware_tq); return err; } return EINVAL; } static moduledata_t firmware_mod = { "firmware", firmware_modevent, NULL }; DECLARE_MODULE(firmware, firmware_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); MODULE_VERSION(firmware, 1); Index: head/sys/kern/uipc_mbuf.c =================================================================== --- head/sys/kern/uipc_mbuf.c (revision 308456) +++ head/sys/kern/uipc_mbuf.c (revision 308457) @@ -1,1869 +1,1869 @@ /*- * Copyright (c) 1982, 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_param.h" #include "opt_mbuf_stress_test.h" #include "opt_mbuf_profiling.h" #include #include #include #include #include #include #include #include #include #include #include #include SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init, "struct mbuf *", "mbufinfo_t *", "uint32_t", "uint32_t", "uint16_t", "uint16_t", "uint32_t", "uint32_t", "uint32_t", "uint32_t"); SDT_PROBE_DEFINE3_XLATE(sdt, , , m__gethdr, "uint32_t", "uint32_t", "uint16_t", "uint16_t", "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE3_XLATE(sdt, , , m__get, "uint32_t", "uint32_t", "uint16_t", "uint16_t", "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE4_XLATE(sdt, , , m__getcl, "uint32_t", "uint32_t", "uint16_t", "uint16_t", "uint32_t", "uint32_t", "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE3_XLATE(sdt, , , m__clget, "struct mbuf *", "mbufinfo_t *", "uint32_t", "uint32_t", "uint32_t", "uint32_t"); SDT_PROBE_DEFINE4_XLATE(sdt, , , m__cljget, "struct mbuf *", "mbufinfo_t *", "uint32_t", "uint32_t", "uint32_t", "uint32_t", "void*", "void*"); SDT_PROBE_DEFINE(sdt, , , m__cljset); SDT_PROBE_DEFINE1_XLATE(sdt, , , m__free, "struct mbuf *", "mbufinfo_t *"); SDT_PROBE_DEFINE1_XLATE(sdt, , , m__freem, "struct mbuf *", "mbufinfo_t *"); #include int max_linkhdr; int max_protohdr; int max_hdr; int max_datalen; #ifdef MBUF_STRESS_TEST int m_defragpackets; int m_defragbytes; int m_defraguseless; int m_defragfailure; int m_defragrandomfailures; #endif /* * sysctl(8) exported objects */ SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD, &max_linkhdr, 0, "Size of largest link layer header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD, &max_protohdr, 0, "Size of largest protocol layer header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RD, &max_hdr, 0, "Size of largest link plus protocol header"); SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RD, &max_datalen, 0, "Minimum space left in mbuf after max_hdr"); #ifdef MBUF_STRESS_TEST SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragpackets, CTLFLAG_RD, &m_defragpackets, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragbytes, CTLFLAG_RD, &m_defragbytes, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defraguseless, CTLFLAG_RD, &m_defraguseless, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragfailure, CTLFLAG_RD, &m_defragfailure, 0, ""); SYSCTL_INT(_kern_ipc, OID_AUTO, m_defragrandomfailures, CTLFLAG_RW, &m_defragrandomfailures, 0, ""); #endif /* * Ensure the correct size of various mbuf parameters. It could be off due * to compiler-induced padding and alignment artifacts. */ CTASSERT(MSIZE - offsetof(struct mbuf, m_dat) == MLEN); CTASSERT(MSIZE - offsetof(struct mbuf, m_pktdat) == MHLEN); /* * mbuf data storage should be 64-bit aligned regardless of architectural * pointer size; check this is the case with and without a packet header. */ CTASSERT(offsetof(struct mbuf, m_dat) % 8 == 0); CTASSERT(offsetof(struct mbuf, m_pktdat) % 8 == 0); /* * While the specific values here don't matter too much (i.e., +/- a few * words), we do want to ensure that changes to these values are carefully * reasoned about and properly documented. This is especially the case as * network-protocol and device-driver modules encode these layouts, and must * be recompiled if the structures change. Check these values at compile time * against the ones documented in comments in mbuf.h. * * NB: Possibly they should be documented there via #define's and not just * comments. */ #if defined(__LP64__) CTASSERT(offsetof(struct mbuf, m_dat) == 32); CTASSERT(sizeof(struct pkthdr) == 56); CTASSERT(sizeof(struct m_ext) == 48); #else CTASSERT(offsetof(struct mbuf, m_dat) == 24); CTASSERT(sizeof(struct pkthdr) == 48); CTASSERT(sizeof(struct m_ext) == 28); #endif /* * Assert that the queue(3) macros produce code of the same size as an old * plain pointer does. */ #ifdef INVARIANTS static struct mbuf m_assertbuf; CTASSERT(sizeof(m_assertbuf.m_slist) == sizeof(m_assertbuf.m_next)); CTASSERT(sizeof(m_assertbuf.m_stailq) == sizeof(m_assertbuf.m_next)); CTASSERT(sizeof(m_assertbuf.m_slistpkt) == sizeof(m_assertbuf.m_nextpkt)); CTASSERT(sizeof(m_assertbuf.m_stailqpkt) == sizeof(m_assertbuf.m_nextpkt)); #endif /* * Attach the cluster from *m to *n, set up m_ext in *n * and bump the refcount of the cluster. */ void mb_dupcl(struct mbuf *n, struct mbuf *m) { volatile u_int *refcnt; KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m)); KASSERT(!(n->m_flags & M_EXT), ("%s: M_EXT set on %p", __func__, n)); n->m_ext = m->m_ext; n->m_flags |= M_EXT; n->m_flags |= m->m_flags & M_RDONLY; /* See if this is the mbuf that holds the embedded refcount. */ if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) { refcnt = n->m_ext.ext_cnt = &m->m_ext.ext_count; n->m_ext.ext_flags &= ~EXT_FLAG_EMBREF; } else { KASSERT(m->m_ext.ext_cnt != NULL, ("%s: no refcounting pointer on %p", __func__, m)); refcnt = m->m_ext.ext_cnt; } if (*refcnt == 1) *refcnt += 1; else atomic_add_int(refcnt, 1); } void m_demote_pkthdr(struct mbuf *m) { M_ASSERTPKTHDR(m); m_tag_delete_chain(m, NULL); m->m_flags &= ~M_PKTHDR; bzero(&m->m_pkthdr, sizeof(struct pkthdr)); } /* * Clean up mbuf (chain) from any tags and packet headers. * If "all" is set then the first mbuf in the chain will be * cleaned too. */ void m_demote(struct mbuf *m0, int all, int flags) { struct mbuf *m; for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) { KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt in m %p, m0 %p", __func__, m, m0)); if (m->m_flags & M_PKTHDR) m_demote_pkthdr(m); m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags); } } /* * Sanity checks on mbuf (chain) for use in KASSERT() and general * debugging. * Returns 0 or panics when bad and 1 on all tests passed. * Sanitize, 0 to run M_SANITY_ACTION, 1 to garble things so they * blow up later. */ int m_sanity(struct mbuf *m0, int sanitize) { struct mbuf *m; caddr_t a, b; int pktlen = 0; #ifdef INVARIANTS #define M_SANITY_ACTION(s) panic("mbuf %p: " s, m) #else #define M_SANITY_ACTION(s) printf("mbuf %p: " s, m) #endif for (m = m0; m != NULL; m = m->m_next) { /* * Basic pointer checks. If any of these fails then some * unrelated kernel memory before or after us is trashed. * No way to recover from that. */ a = M_START(m); b = a + M_SIZE(m); if ((caddr_t)m->m_data < a) M_SANITY_ACTION("m_data outside mbuf data range left"); if ((caddr_t)m->m_data > b) M_SANITY_ACTION("m_data outside mbuf data range right"); if ((caddr_t)m->m_data + m->m_len > b) M_SANITY_ACTION("m_data + m_len exeeds mbuf space"); /* m->m_nextpkt may only be set on first mbuf in chain. */ if (m != m0 && m->m_nextpkt != NULL) { if (sanitize) { m_freem(m->m_nextpkt); m->m_nextpkt = (struct mbuf *)0xDEADC0DE; } else M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf"); } /* packet length (not mbuf length!) calculation */ if (m0->m_flags & M_PKTHDR) pktlen += m->m_len; /* m_tags may only be attached to first mbuf in chain. */ if (m != m0 && m->m_flags & M_PKTHDR && !SLIST_EMPTY(&m->m_pkthdr.tags)) { if (sanitize) { m_tag_delete_chain(m, NULL); /* put in 0xDEADC0DE perhaps? */ } else M_SANITY_ACTION("m_tags on in-chain mbuf"); } /* M_PKTHDR may only be set on first mbuf in chain */ if (m != m0 && m->m_flags & M_PKTHDR) { if (sanitize) { bzero(&m->m_pkthdr, sizeof(m->m_pkthdr)); m->m_flags &= ~M_PKTHDR; /* put in 0xDEADCODE and leave hdr flag in */ } else M_SANITY_ACTION("M_PKTHDR on in-chain mbuf"); } } m = m0; if (pktlen && pktlen != m->m_pkthdr.len) { if (sanitize) m->m_pkthdr.len = 0; else M_SANITY_ACTION("m_pkthdr.len != mbuf chain length"); } return 1; #undef M_SANITY_ACTION } /* * Non-inlined part of m_init(). */ int m_pkthdr_init(struct mbuf *m, int how) { #ifdef MAC int error; #endif m->m_data = m->m_pktdat; bzero(&m->m_pkthdr, sizeof(m->m_pkthdr)); #ifdef MAC /* If the label init fails, fail the alloc */ error = mac_mbuf_init(m, how); if (error) return (error); #endif return (0); } /* * "Move" mbuf pkthdr from "from" to "to". * "from" must have M_PKTHDR set, and "to" must be empty. */ void m_move_pkthdr(struct mbuf *to, struct mbuf *from) { #if 0 /* see below for why these are not enabled */ M_ASSERTPKTHDR(to); /* Note: with MAC, this may not be a good assertion. */ KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_move_pkthdr: to has tags")); #endif #ifdef MAC /* * XXXMAC: It could be this should also occur for non-MAC? */ if (to->m_flags & M_PKTHDR) m_tag_delete_chain(to, NULL); #endif to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; /* especially tags */ SLIST_INIT(&from->m_pkthdr.tags); /* purge tags from src */ from->m_flags &= ~M_PKTHDR; } /* * Duplicate "from"'s mbuf pkthdr in "to". * "from" must have M_PKTHDR set, and "to" must be empty. * In particular, this does a deep copy of the packet tags. */ int m_dup_pkthdr(struct mbuf *to, const struct mbuf *from, int how) { #if 0 /* * The mbuf allocator only initializes the pkthdr * when the mbuf is allocated with m_gethdr(). Many users * (e.g. m_copy*, m_prepend) use m_get() and then * smash the pkthdr as needed causing these * assertions to trip. For now just disable them. */ M_ASSERTPKTHDR(to); /* Note: with MAC, this may not be a good assertion. */ KASSERT(SLIST_EMPTY(&to->m_pkthdr.tags), ("m_dup_pkthdr: to has tags")); #endif MBUF_CHECKSLEEP(how); #ifdef MAC if (to->m_flags & M_PKTHDR) m_tag_delete_chain(to, NULL); #endif to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); if ((to->m_flags & M_EXT) == 0) to->m_data = to->m_pktdat; to->m_pkthdr = from->m_pkthdr; SLIST_INIT(&to->m_pkthdr.tags); return (m_tag_copy_chain(to, from, how)); } /* * Lesser-used path for M_PREPEND: * allocate new mbuf to prepend to chain, * copy junk along. */ struct mbuf * m_prepend(struct mbuf *m, int len, int how) { struct mbuf *mn; if (m->m_flags & M_PKTHDR) mn = m_gethdr(how, m->m_type); else mn = m_get(how, m->m_type); if (mn == NULL) { m_freem(m); return (NULL); } if (m->m_flags & M_PKTHDR) m_move_pkthdr(mn, m); mn->m_next = m; m = mn; if (len < M_SIZE(m)) M_ALIGN(m, len); m->m_len = len; return (m); } /* * Make a copy of an mbuf chain starting "off0" bytes from the beginning, * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. * The wait parameter is a choice of M_WAITOK/M_NOWAIT from caller. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. */ struct mbuf * m_copym(struct mbuf *m, int off0, int len, int wait) { struct mbuf *n, **np; int off = off0; struct mbuf *top; int copyhdr = 0; KASSERT(off >= 0, ("m_copym, negative off %d", off)); KASSERT(len >= 0, ("m_copym, negative len %d", len)); MBUF_CHECKSLEEP(wait); if (off == 0 && m->m_flags & M_PKTHDR) copyhdr = 1; while (off > 0) { KASSERT(m != NULL, ("m_copym, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } np = ⊤ top = NULL; while (len > 0) { if (m == NULL) { KASSERT(len == M_COPYALL, ("m_copym, length > size of mbuf chain")); break; } if (copyhdr) n = m_gethdr(wait, m->m_type); else n = m_get(wait, m->m_type); *np = n; if (n == NULL) goto nospace; if (copyhdr) { if (!m_dup_pkthdr(n, m, wait)) goto nospace; if (len == M_COPYALL) n->m_pkthdr.len -= off0; else n->m_pkthdr.len = len; copyhdr = 0; } n->m_len = min(len, m->m_len - off); if (m->m_flags & M_EXT) { n->m_data = m->m_data + off; mb_dupcl(n, m); } else bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), (u_int)n->m_len); if (len != M_COPYALL) len -= n->m_len; off = 0; m = m->m_next; np = &n->m_next; } return (top); nospace: m_freem(top); return (NULL); } /* * Copy an entire packet, including header (which must be present). * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. * Note that the copy is read-only, because clusters are not copied, * only their reference counts are incremented. * Preserve alignment of the first mbuf so if the creator has left * some room at the beginning (e.g. for inserting protocol headers) * the copies still have the room available. */ struct mbuf * m_copypacket(struct mbuf *m, int how) { struct mbuf *top, *n, *o; MBUF_CHECKSLEEP(how); n = m_get(how, m->m_type); top = n; if (n == NULL) goto nospace; if (!m_dup_pkthdr(n, m, how)) goto nospace; n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; mb_dupcl(n, m); } else { n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat ); bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; while (m) { o = m_get(how, m->m_type); if (o == NULL) goto nospace; n->m_next = o; n = n->m_next; n->m_len = m->m_len; if (m->m_flags & M_EXT) { n->m_data = m->m_data; mb_dupcl(n, m); } else { bcopy(mtod(m, char *), mtod(n, char *), n->m_len); } m = m->m_next; } return top; nospace: m_freem(top); return (NULL); } /* * Copy data from an mbuf chain starting "off" bytes from the beginning, * continuing for "len" bytes, into the indicated buffer. */ void m_copydata(const struct mbuf *m, int off, int len, caddr_t cp) { u_int count; KASSERT(off >= 0, ("m_copydata, negative off %d", off)); KASSERT(len >= 0, ("m_copydata, negative len %d", len)); while (off > 0) { KASSERT(m != NULL, ("m_copydata, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { KASSERT(m != NULL, ("m_copydata, length > size of mbuf chain")); count = min(m->m_len - off, len); bcopy(mtod(m, caddr_t) + off, cp, count); len -= count; cp += count; off = 0; m = m->m_next; } } /* * Copy a packet header mbuf chain into a completely new chain, including * copying any mbuf clusters. Use this instead of m_copypacket() when * you need a writable copy of an mbuf chain. */ struct mbuf * m_dup(const struct mbuf *m, int how) { struct mbuf **p, *top = NULL; int remain, moff, nsize; MBUF_CHECKSLEEP(how); /* Sanity check */ if (m == NULL) return (NULL); M_ASSERTPKTHDR(m); /* While there's more data, get a new mbuf, tack it on, and fill it */ remain = m->m_pkthdr.len; moff = 0; p = ⊤ while (remain > 0 || top == NULL) { /* allow m->m_pkthdr.len == 0 */ struct mbuf *n; /* Get the next new mbuf */ if (remain >= MINCLSIZE) { n = m_getcl(how, m->m_type, 0); nsize = MCLBYTES; } else { n = m_get(how, m->m_type); nsize = MLEN; } if (n == NULL) goto nospace; if (top == NULL) { /* First one, must be PKTHDR */ if (!m_dup_pkthdr(n, m, how)) { m_free(n); goto nospace; } if ((n->m_flags & M_EXT) == 0) nsize = MHLEN; n->m_flags &= ~M_RDONLY; } n->m_len = 0; /* Link it into the new chain */ *p = n; p = &n->m_next; /* Copy data from original mbuf(s) into new mbuf */ while (n->m_len < nsize && m != NULL) { int chunk = min(nsize - n->m_len, m->m_len - moff); bcopy(m->m_data + moff, n->m_data + n->m_len, chunk); moff += chunk; n->m_len += chunk; remain -= chunk; if (moff == m->m_len) { m = m->m_next; moff = 0; } } /* Check correct total mbuf length */ KASSERT((remain > 0 && m != NULL) || (remain == 0 && m == NULL), ("%s: bogus m_pkthdr.len", __func__)); } return (top); nospace: m_freem(top); return (NULL); } /* * Concatenate mbuf chain n to m. * Both chains must be of the same type (e.g. MT_DATA). * Any m_pkthdr is not updated. */ void m_cat(struct mbuf *m, struct mbuf *n) { while (m->m_next) m = m->m_next; while (n) { if (!M_WRITABLE(m) || M_TRAILINGSPACE(m) < n->m_len) { /* just join the two chains */ m->m_next = n; return; } /* splat the data from one into the other */ bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (u_int)n->m_len); m->m_len += n->m_len; n = m_free(n); } } /* * Concatenate two pkthdr mbuf chains. */ void m_catpkt(struct mbuf *m, struct mbuf *n) { M_ASSERTPKTHDR(m); M_ASSERTPKTHDR(n); m->m_pkthdr.len += n->m_pkthdr.len; m_demote(n, 1, 0); m_cat(m, n); } void m_adj(struct mbuf *mp, int req_len) { int len = req_len; struct mbuf *m; int count; if ((m = mp) == NULL) return; if (len >= 0) { /* * Trim from head. */ while (m != NULL && len > 0) { if (m->m_len <= len) { len -= m->m_len; m->m_len = 0; m = m->m_next; } else { m->m_len -= len; m->m_data += len; len = 0; } } if (mp->m_flags & M_PKTHDR) mp->m_pkthdr.len -= (req_len - len); } else { /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ len = -len; count = 0; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len >= len) { m->m_len -= len; if (mp->m_flags & M_PKTHDR) mp->m_pkthdr.len -= len; return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. */ m = mp; if (m->m_flags & M_PKTHDR) m->m_pkthdr.len = count; for (; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (m->m_next != NULL) { m_freem(m->m_next); m->m_next = NULL; } break; } count -= m->m_len; } } } /* * Rearange an mbuf chain so that len bytes are contiguous * and in the data area of an mbuf (so that mtod will work * for a structure of size len). Returns the resulting * mbuf chain on success, frees it and returns null on failure. * If there is room, it will add up to max_protohdr-len extra bytes to the * contiguous region in an attempt to avoid being called next time. */ struct mbuf * m_pullup(struct mbuf *n, int len) { struct mbuf *m; int count; int space; /* * If first mbuf has no cluster, and has room for len bytes * without shifting current data, pullup into it, * otherwise allocate a new mbuf to prepend to the chain. */ if ((n->m_flags & M_EXT) == 0 && n->m_data + len < &n->m_dat[MLEN] && n->m_next) { if (n->m_len >= len) return (n); m = n; n = n->m_next; len -= m->m_len; } else { if (len > MHLEN) goto bad; m = m_get(M_NOWAIT, n->m_type); if (m == NULL) goto bad; if (n->m_flags & M_PKTHDR) m_move_pkthdr(m, n); } space = &m->m_dat[MLEN] - (m->m_data + m->m_len); do { count = min(min(max(len, max_protohdr), space), n->m_len); bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, (u_int)count); len -= count; m->m_len += count; n->m_len -= count; space -= count; if (n->m_len) n->m_data += count; else n = m_free(n); } while (len > 0 && n); if (len > 0) { (void) m_free(m); goto bad; } m->m_next = n; return (m); bad: m_freem(n); return (NULL); } /* * Like m_pullup(), except a new mbuf is always allocated, and we allow * the amount of empty space before the data in the new mbuf to be specified * (in the event that the caller expects to prepend later). */ struct mbuf * m_copyup(struct mbuf *n, int len, int dstoff) { struct mbuf *m; int count, space; if (len > (MHLEN - dstoff)) goto bad; m = m_get(M_NOWAIT, n->m_type); if (m == NULL) goto bad; if (n->m_flags & M_PKTHDR) m_move_pkthdr(m, n); m->m_data += dstoff; space = &m->m_dat[MLEN] - (m->m_data + m->m_len); do { count = min(min(max(len, max_protohdr), space), n->m_len); memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t), (unsigned)count); len -= count; m->m_len += count; n->m_len -= count; space -= count; if (n->m_len) n->m_data += count; else n = m_free(n); } while (len > 0 && n); if (len > 0) { (void) m_free(m); goto bad; } m->m_next = n; return (m); bad: m_freem(n); return (NULL); } /* * Partition an mbuf chain in two pieces, returning the tail -- * all but the first len0 bytes. In case of failure, it returns NULL and * attempts to restore the chain to its original state. * * Note that the resulting mbufs might be read-only, because the new * mbuf can end up sharing an mbuf cluster with the original mbuf if * the "breaking point" happens to lie within a cluster mbuf. Use the * M_WRITABLE() macro to check for this case. */ struct mbuf * m_split(struct mbuf *m0, int len0, int wait) { struct mbuf *m, *n; u_int len = len0, remain; MBUF_CHECKSLEEP(wait); for (m = m0; m && len > m->m_len; m = m->m_next) len -= m->m_len; if (m == NULL) return (NULL); remain = m->m_len - len; if (m0->m_flags & M_PKTHDR && remain == 0) { n = m_gethdr(wait, m0->m_type); if (n == NULL) return (NULL); n->m_next = m->m_next; m->m_next = NULL; n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; return (n); } else if (m0->m_flags & M_PKTHDR) { n = m_gethdr(wait, m0->m_type); if (n == NULL) return (NULL); n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; n->m_pkthdr.len = m0->m_pkthdr.len - len0; m0->m_pkthdr.len = len0; if (m->m_flags & M_EXT) goto extpacket; if (remain > MHLEN) { /* m can't be the lead packet */ M_ALIGN(n, 0); n->m_next = m_split(m, len, wait); if (n->m_next == NULL) { (void) m_free(n); return (NULL); } else { n->m_len = 0; return (n); } } else M_ALIGN(n, remain); } else if (remain == 0) { n = m->m_next; m->m_next = NULL; return (n); } else { n = m_get(wait, m->m_type); if (n == NULL) return (NULL); M_ALIGN(n, remain); } extpacket: if (m->m_flags & M_EXT) { n->m_data = m->m_data + len; mb_dupcl(n, m); } else { bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); } n->m_len = remain; m->m_len = len; n->m_next = m->m_next; m->m_next = NULL; return (n); } /* * Routine to copy from device local memory into mbufs. * Note that `off' argument is offset into first mbuf of target chain from * which to begin copying the data to. */ struct mbuf * m_devget(char *buf, int totlen, int off, struct ifnet *ifp, void (*copy)(char *from, caddr_t to, u_int len)) { struct mbuf *m; struct mbuf *top = NULL, **mp = ⊤ int len; if (off < 0 || off > MHLEN) return (NULL); while (totlen > 0) { if (top == NULL) { /* First one, must be PKTHDR */ if (totlen + off >= MINCLSIZE) { m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); len = MCLBYTES; } else { m = m_gethdr(M_NOWAIT, MT_DATA); len = MHLEN; /* Place initial small packet/header at end of mbuf */ if (m && totlen + off + max_linkhdr <= MHLEN) { m->m_data += max_linkhdr; len -= max_linkhdr; } } if (m == NULL) return NULL; m->m_pkthdr.rcvif = ifp; m->m_pkthdr.len = totlen; } else { if (totlen + off >= MINCLSIZE) { m = m_getcl(M_NOWAIT, MT_DATA, 0); len = MCLBYTES; } else { m = m_get(M_NOWAIT, MT_DATA); len = MLEN; } if (m == NULL) { m_freem(top); return NULL; } } if (off) { m->m_data += off; len -= off; off = 0; } m->m_len = len = min(totlen, len); if (copy) copy(buf, mtod(m, caddr_t), (u_int)len); else bcopy(buf, mtod(m, caddr_t), (u_int)len); buf += len; *mp = m; mp = &m->m_next; totlen -= len; } return (top); } /* * Copy data from a buffer back into the indicated mbuf chain, * starting "off" bytes from the beginning, extending the mbuf * chain if necessary. */ void m_copyback(struct mbuf *m0, int off, int len, c_caddr_t cp) { int mlen; struct mbuf *m = m0, *n; int totlen = 0; if (m0 == NULL) return; while (off > (mlen = m->m_len)) { off -= mlen; totlen += mlen; if (m->m_next == NULL) { n = m_get(M_NOWAIT, m->m_type); if (n == NULL) goto out; bzero(mtod(n, caddr_t), MLEN); n->m_len = min(MLEN, len + off); m->m_next = n; } m = m->m_next; } while (len > 0) { if (m->m_next == NULL && (len > m->m_len - off)) { m->m_len += min(len - (m->m_len - off), M_TRAILINGSPACE(m)); } mlen = min (m->m_len - off, len); bcopy(cp, off + mtod(m, caddr_t), (u_int)mlen); cp += mlen; len -= mlen; mlen += off; off = 0; totlen += mlen; if (len == 0) break; if (m->m_next == NULL) { n = m_get(M_NOWAIT, m->m_type); if (n == NULL) break; n->m_len = min(MLEN, len); m->m_next = n; } m = m->m_next; } out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) m->m_pkthdr.len = totlen; } /* * Append the specified data to the indicated mbuf chain, * Extend the mbuf chain if the new data does not fit in * existing space. * * Return 1 if able to complete the job; otherwise 0. */ int m_append(struct mbuf *m0, int len, c_caddr_t cp) { struct mbuf *m, *n; int remainder, space; for (m = m0; m->m_next != NULL; m = m->m_next) ; remainder = len; space = M_TRAILINGSPACE(m); if (space > 0) { /* * Copy into available space. */ if (space > remainder) space = remainder; bcopy(cp, mtod(m, caddr_t) + m->m_len, space); m->m_len += space; cp += space, remainder -= space; } while (remainder > 0) { /* * Allocate a new mbuf; could check space * and allocate a cluster instead. */ n = m_get(M_NOWAIT, m->m_type); if (n == NULL) break; n->m_len = min(MLEN, remainder); bcopy(cp, mtod(n, caddr_t), n->m_len); cp += n->m_len, remainder -= n->m_len; m->m_next = n; m = n; } if (m0->m_flags & M_PKTHDR) m0->m_pkthdr.len += len - remainder; return (remainder == 0); } /* * Apply function f to the data in an mbuf chain starting "off" bytes from * the beginning, continuing for "len" bytes. */ int m_apply(struct mbuf *m, int off, int len, int (*f)(void *, void *, u_int), void *arg) { u_int count; int rval; KASSERT(off >= 0, ("m_apply, negative off %d", off)); KASSERT(len >= 0, ("m_apply, negative len %d", len)); while (off > 0) { KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain")); if (off < m->m_len) break; off -= m->m_len; m = m->m_next; } while (len > 0) { KASSERT(m != NULL, ("m_apply, offset > size of mbuf chain")); count = min(m->m_len - off, len); rval = (*f)(arg, mtod(m, caddr_t) + off, count); if (rval) return (rval); len -= count; off = 0; m = m->m_next; } return (0); } /* * Return a pointer to mbuf/offset of location in mbuf chain. */ struct mbuf * m_getptr(struct mbuf *m, int loc, int *off) { while (loc >= 0) { /* Normal end of search. */ if (m->m_len > loc) { *off = loc; return (m); } else { loc -= m->m_len; if (m->m_next == NULL) { if (loc == 0) { /* Point at the end of valid data. */ *off = m->m_len; return (m); } return (NULL); } m = m->m_next; } } return (NULL); } void m_print(const struct mbuf *m, int maxlen) { int len; int pdata; const struct mbuf *m2; if (m == NULL) { printf("mbuf: %p\n", m); return; } if (m->m_flags & M_PKTHDR) len = m->m_pkthdr.len; else len = -1; m2 = m; while (m2 != NULL && (len == -1 || len)) { pdata = m2->m_len; if (maxlen != -1 && pdata > maxlen) pdata = maxlen; printf("mbuf: %p len: %d, next: %p, %b%s", m2, m2->m_len, m2->m_next, m2->m_flags, "\20\20freelist\17skipfw" "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly" "\3eor\2pkthdr\1ext", pdata ? "" : "\n"); if (pdata) printf(", %*D\n", pdata, (u_char *)m2->m_data, "-"); if (len != -1) len -= m2->m_len; m2 = m2->m_next; } if (len > 0) printf("%d bytes unaccounted for.\n", len); return; } u_int m_fixhdr(struct mbuf *m0) { u_int len; len = m_length(m0, NULL); m0->m_pkthdr.len = len; return (len); } u_int m_length(struct mbuf *m0, struct mbuf **last) { struct mbuf *m; u_int len; len = 0; for (m = m0; m != NULL; m = m->m_next) { len += m->m_len; if (m->m_next == NULL) break; } if (last != NULL) *last = m; return (len); } /* * Defragment a mbuf chain, returning the shortest possible * chain of mbufs and clusters. If allocation fails and * this cannot be completed, NULL will be returned, but * the passed in chain will be unchanged. Upon success, * the original chain will be freed, and the new chain * will be returned. * * If a non-packet header is passed in, the original * mbuf (chain?) will be returned unharmed. */ struct mbuf * m_defrag(struct mbuf *m0, int how) { struct mbuf *m_new = NULL, *m_final = NULL; int progress = 0, length; MBUF_CHECKSLEEP(how); if (!(m0->m_flags & M_PKTHDR)) return (m0); m_fixhdr(m0); /* Needed sanity check */ #ifdef MBUF_STRESS_TEST if (m_defragrandomfailures) { int temp = arc4random() & 0xff; if (temp == 0xba) goto nospace; } #endif if (m0->m_pkthdr.len > MHLEN) m_final = m_getcl(how, MT_DATA, M_PKTHDR); else m_final = m_gethdr(how, MT_DATA); if (m_final == NULL) goto nospace; if (m_dup_pkthdr(m_final, m0, how) == 0) goto nospace; m_new = m_final; while (progress < m0->m_pkthdr.len) { length = m0->m_pkthdr.len - progress; if (length > MCLBYTES) length = MCLBYTES; if (m_new == NULL) { if (length > MLEN) m_new = m_getcl(how, MT_DATA, 0); else m_new = m_get(how, MT_DATA); if (m_new == NULL) goto nospace; } m_copydata(m0, progress, length, mtod(m_new, caddr_t)); progress += length; m_new->m_len = length; if (m_new != m_final) m_cat(m_final, m_new); m_new = NULL; } #ifdef MBUF_STRESS_TEST if (m0->m_next == NULL) m_defraguseless++; #endif m_freem(m0); m0 = m_final; #ifdef MBUF_STRESS_TEST m_defragpackets++; m_defragbytes += m0->m_pkthdr.len; #endif return (m0); nospace: #ifdef MBUF_STRESS_TEST m_defragfailure++; #endif if (m_final) m_freem(m_final); return (NULL); } /* * Defragment an mbuf chain, returning at most maxfrags separate * mbufs+clusters. If this is not possible NULL is returned and - * the original mbuf chain is left in it's present (potentially + * the original mbuf chain is left in its present (potentially * modified) state. We use two techniques: collapsing consecutive * mbufs and replacing consecutive mbufs by a cluster. * * NB: this should really be named m_defrag but that name is taken */ struct mbuf * m_collapse(struct mbuf *m0, int how, int maxfrags) { struct mbuf *m, *n, *n2, **prev; u_int curfrags; /* * Calculate the current number of frags. */ curfrags = 0; for (m = m0; m != NULL; m = m->m_next) curfrags++; /* * First, try to collapse mbufs. Note that we always collapse * towards the front so we don't need to deal with moving the * pkthdr. This may be suboptimal if the first mbuf has much * less data than the following. */ m = m0; again: for (;;) { n = m->m_next; if (n == NULL) break; if (M_WRITABLE(m) && n->m_len < M_TRAILINGSPACE(m)) { bcopy(mtod(n, void *), mtod(m, char *) + m->m_len, n->m_len); m->m_len += n->m_len; m->m_next = n->m_next; m_free(n); if (--curfrags <= maxfrags) return m0; } else m = n; } KASSERT(maxfrags > 1, ("maxfrags %u, but normal collapse failed", maxfrags)); /* * Collapse consecutive mbufs to a cluster. */ prev = &m0->m_next; /* NB: not the first mbuf */ while ((n = *prev) != NULL) { if ((n2 = n->m_next) != NULL && n->m_len + n2->m_len < MCLBYTES) { m = m_getcl(how, MT_DATA, 0); if (m == NULL) goto bad; bcopy(mtod(n, void *), mtod(m, void *), n->m_len); bcopy(mtod(n2, void *), mtod(m, char *) + n->m_len, n2->m_len); m->m_len = n->m_len + n2->m_len; m->m_next = n2->m_next; *prev = m; m_free(n); m_free(n2); if (--curfrags <= maxfrags) /* +1 cl -2 mbufs */ return m0; /* * Still not there, try the normal collapse * again before we allocate another cluster. */ goto again; } prev = &n->m_next; } /* * No place where we can collapse to a cluster; punt. * This can occur if, for example, you request 2 frags * but the packet requires that both be clusters (we * never reallocate the first mbuf to avoid moving the * packet header). */ bad: return NULL; } #ifdef MBUF_STRESS_TEST /* * Fragment an mbuf chain. There's no reason you'd ever want to do * this in normal usage, but it's great for stress testing various * mbuf consumers. * * If fragmentation is not possible, the original chain will be * returned. * * Possible length values: * 0 no fragmentation will occur * > 0 each fragment will be of the specified length * -1 each fragment will be the same random value in length * -2 each fragment's length will be entirely random * (Random values range from 1 to 256) */ struct mbuf * m_fragment(struct mbuf *m0, int how, int length) { struct mbuf *m_new = NULL, *m_final = NULL; int progress = 0; if (!(m0->m_flags & M_PKTHDR)) return (m0); if ((length == 0) || (length < -2)) return (m0); m_fixhdr(m0); /* Needed sanity check */ m_final = m_getcl(how, MT_DATA, M_PKTHDR); if (m_final == NULL) goto nospace; if (m_dup_pkthdr(m_final, m0, how) == 0) goto nospace; m_new = m_final; if (length == -1) length = 1 + (arc4random() & 255); while (progress < m0->m_pkthdr.len) { int fraglen; if (length > 0) fraglen = length; else fraglen = 1 + (arc4random() & 255); if (fraglen > m0->m_pkthdr.len - progress) fraglen = m0->m_pkthdr.len - progress; if (fraglen > MCLBYTES) fraglen = MCLBYTES; if (m_new == NULL) { m_new = m_getcl(how, MT_DATA, 0); if (m_new == NULL) goto nospace; } m_copydata(m0, progress, fraglen, mtod(m_new, caddr_t)); progress += fraglen; m_new->m_len = fraglen; if (m_new != m_final) m_cat(m_final, m_new); m_new = NULL; } m_freem(m0); m0 = m_final; return (m0); nospace: if (m_final) m_freem(m_final); /* Return the original chain on failure */ return (m0); } #endif /* * Copy the contents of uio into a properly sized mbuf chain. */ struct mbuf * m_uiotombuf(struct uio *uio, int how, int len, int align, int flags) { struct mbuf *m, *mb; int error, length; ssize_t total; int progress = 0; /* * len can be zero or an arbitrary large value bound by * the total data supplied by the uio. */ if (len > 0) total = min(uio->uio_resid, len); else total = uio->uio_resid; /* * The smallest unit returned by m_getm2() is a single mbuf * with pkthdr. We can't align past it. */ if (align >= MHLEN) return (NULL); /* * Give us the full allocation or nothing. * If len is zero return the smallest empty mbuf. */ m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags); if (m == NULL) return (NULL); m->m_data += align; /* Fill all mbufs with uio data and update header information. */ for (mb = m; mb != NULL; mb = mb->m_next) { length = min(M_TRAILINGSPACE(mb), total - progress); error = uiomove(mtod(mb, void *), length, uio); if (error) { m_freem(m); return (NULL); } mb->m_len = length; progress += length; if (flags & M_PKTHDR) m->m_pkthdr.len += length; } KASSERT(progress == total, ("%s: progress != total", __func__)); return (m); } /* * Copy an mbuf chain into a uio limited by len if set. */ int m_mbuftouio(struct uio *uio, struct mbuf *m, int len) { int error, length, total; int progress = 0; if (len > 0) total = min(uio->uio_resid, len); else total = uio->uio_resid; /* Fill the uio with data from the mbufs. */ for (; m != NULL; m = m->m_next) { length = min(m->m_len, total - progress); error = uiomove(mtod(m, void *), length, uio); if (error) return (error); progress += length; } return (0); } /* * Create a writable copy of the mbuf chain. While doing this * we compact the chain with a goal of producing a chain with * at most two mbufs. The second mbuf in this chain is likely * to be a cluster. The primary purpose of this work is to create * a writable packet for encryption, compression, etc. The * secondary goal is to linearize the data so the data can be * passed to crypto hardware in the most efficient manner possible. */ struct mbuf * m_unshare(struct mbuf *m0, int how) { struct mbuf *m, *mprev; struct mbuf *n, *mfirst, *mlast; int len, off; mprev = NULL; for (m = m0; m != NULL; m = mprev->m_next) { /* * Regular mbufs are ignored unless there's a cluster * in front of it that we can use to coalesce. We do * the latter mainly so later clusters can be coalesced * also w/o having to handle them specially (i.e. convert * mbuf+cluster -> cluster). This optimization is heavily * influenced by the assumption that we're running over * Ethernet where MCLBYTES is large enough that the max * packet size will permit lots of coalescing into a * single cluster. This in turn permits efficient * crypto operations, especially when using hardware. */ if ((m->m_flags & M_EXT) == 0) { if (mprev && (mprev->m_flags & M_EXT) && m->m_len <= M_TRAILINGSPACE(mprev)) { /* XXX: this ignores mbuf types */ memcpy(mtod(mprev, caddr_t) + mprev->m_len, mtod(m, caddr_t), m->m_len); mprev->m_len += m->m_len; mprev->m_next = m->m_next; /* unlink from chain */ m_free(m); /* reclaim mbuf */ #if 0 newipsecstat.ips_mbcoalesced++; #endif } else { mprev = m; } continue; } /* * Writable mbufs are left alone (for now). */ if (M_WRITABLE(m)) { mprev = m; continue; } /* * Not writable, replace with a copy or coalesce with * the previous mbuf if possible (since we have to copy * it anyway, we try to reduce the number of mbufs and * clusters so that future work is easier). */ KASSERT(m->m_flags & M_EXT, ("m_flags 0x%x", m->m_flags)); /* NB: we only coalesce into a cluster or larger */ if (mprev != NULL && (mprev->m_flags & M_EXT) && m->m_len <= M_TRAILINGSPACE(mprev)) { /* XXX: this ignores mbuf types */ memcpy(mtod(mprev, caddr_t) + mprev->m_len, mtod(m, caddr_t), m->m_len); mprev->m_len += m->m_len; mprev->m_next = m->m_next; /* unlink from chain */ m_free(m); /* reclaim mbuf */ #if 0 newipsecstat.ips_clcoalesced++; #endif continue; } /* * Allocate new space to hold the copy and copy the data. * We deal with jumbo mbufs (i.e. m_len > MCLBYTES) by * splitting them into clusters. We could just malloc a * buffer and make it external but too many device drivers * don't know how to break up the non-contiguous memory when * doing DMA. */ n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS); if (n == NULL) { m_freem(m0); return (NULL); } if (m->m_flags & M_PKTHDR) { KASSERT(mprev == NULL, ("%s: m0 %p, m %p has M_PKTHDR", __func__, m0, m)); m_move_pkthdr(n, m); } len = m->m_len; off = 0; mfirst = n; mlast = NULL; for (;;) { int cc = min(len, MCLBYTES); memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, cc); n->m_len = cc; if (mlast != NULL) mlast->m_next = n; mlast = n; #if 0 newipsecstat.ips_clcopied++; #endif len -= cc; if (len <= 0) break; off += cc; n = m_getcl(how, m->m_type, m->m_flags & M_COPYFLAGS); if (n == NULL) { m_freem(mfirst); m_freem(m0); return (NULL); } } n->m_next = m->m_next; if (mprev == NULL) m0 = mfirst; /* new head of chain */ else mprev->m_next = mfirst; /* replace old mbuf */ m_free(m); /* release old mbuf */ mprev = mfirst; } return (m0); } #ifdef MBUF_PROFILING #define MP_BUCKETS 32 /* don't just change this as things may overflow.*/ struct mbufprofile { uintmax_t wasted[MP_BUCKETS]; uintmax_t used[MP_BUCKETS]; uintmax_t segments[MP_BUCKETS]; } mbprof; #define MP_MAXDIGITS 21 /* strlen("16,000,000,000,000,000,000") == 21 */ #define MP_NUMLINES 6 #define MP_NUMSPERLINE 16 #define MP_EXTRABYTES 64 /* > strlen("used:\nwasted:\nsegments:\n") */ /* work out max space needed and add a bit of spare space too */ #define MP_MAXLINE ((MP_MAXDIGITS+1) * MP_NUMSPERLINE) #define MP_BUFSIZE ((MP_MAXLINE * MP_NUMLINES) + 1 + MP_EXTRABYTES) char mbprofbuf[MP_BUFSIZE]; void m_profile(struct mbuf *m) { int segments = 0; int used = 0; int wasted = 0; while (m) { segments++; used += m->m_len; if (m->m_flags & M_EXT) { wasted += MHLEN - sizeof(m->m_ext) + m->m_ext.ext_size - m->m_len; } else { if (m->m_flags & M_PKTHDR) wasted += MHLEN - m->m_len; else wasted += MLEN - m->m_len; } m = m->m_next; } /* be paranoid.. it helps */ if (segments > MP_BUCKETS - 1) segments = MP_BUCKETS - 1; if (used > 100000) used = 100000; if (wasted > 100000) wasted = 100000; /* store in the appropriate bucket */ /* don't bother locking. if it's slightly off, so what? */ mbprof.segments[segments]++; mbprof.used[fls(used)]++; mbprof.wasted[fls(wasted)]++; } static void mbprof_textify(void) { int offset; char *c; uint64_t *p; p = &mbprof.wasted[0]; c = mbprofbuf; offset = snprintf(c, MP_MAXLINE + 10, "wasted:\n" "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #ifdef BIG_ARRAY p = &mbprof.wasted[16]; c += offset; offset = snprintf(c, MP_MAXLINE, "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #endif p = &mbprof.used[0]; c += offset; offset = snprintf(c, MP_MAXLINE + 10, "used:\n" "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #ifdef BIG_ARRAY p = &mbprof.used[16]; c += offset; offset = snprintf(c, MP_MAXLINE, "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #endif p = &mbprof.segments[0]; c += offset; offset = snprintf(c, MP_MAXLINE + 10, "segments:\n" "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %ju\n", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #ifdef BIG_ARRAY p = &mbprof.segments[16]; c += offset; offset = snprintf(c, MP_MAXLINE, "%ju %ju %ju %ju %ju %ju %ju %ju " "%ju %ju %ju %ju %ju %ju %ju %jju", p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]); #endif } static int mbprof_handler(SYSCTL_HANDLER_ARGS) { int error; mbprof_textify(); error = SYSCTL_OUT(req, mbprofbuf, strlen(mbprofbuf) + 1); return (error); } static int mbprof_clr_handler(SYSCTL_HANDLER_ARGS) { int clear, error; clear = 0; error = sysctl_handle_int(oidp, &clear, 0, req); if (error || !req->newptr) return (error); if (clear) { bzero(&mbprof, sizeof(mbprof)); } return (error); } SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofile, CTLTYPE_STRING|CTLFLAG_RD, NULL, 0, mbprof_handler, "A", "mbuf profiling statistics"); SYSCTL_PROC(_kern_ipc, OID_AUTO, mbufprofileclr, CTLTYPE_INT|CTLFLAG_RW, NULL, 0, mbprof_clr_handler, "I", "clear mbuf profiling statistics"); #endif Index: head/sys/net80211/ieee80211_node.h =================================================================== --- head/sys/net80211/ieee80211_node.h (revision 308456) +++ head/sys/net80211/ieee80211_node.h (revision 308457) @@ -1,471 +1,471 @@ /*- * Copyright (c) 2001 Atsushi Onoe * Copyright (c) 2002-2009 Sam Leffler, Errno Consulting * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _NET80211_IEEE80211_NODE_H_ #define _NET80211_IEEE80211_NODE_H_ #include /* for ieee80211_nodestats */ #include /* for aggregation state */ /* * Each ieee80211com instance has a single timer that fires every * IEEE80211_INACT_WAIT seconds to handle "inactivity processing". * This is used to do node inactivity processing when operating * as an AP, adhoc or mesh mode. For inactivity processing each node - * has a timeout set in it's ni_inact field that is decremented + * has a timeout set in its ni_inact field that is decremented * on each timeout and the node is reclaimed when the counter goes * to zero. We use different inactivity timeout values depending * on whether the node is associated and authorized (either by * 802.1x or open/shared key authentication) or associated but yet * to be authorized. The latter timeout is shorter to more aggressively * reclaim nodes that leave part way through the 802.1x exchange. */ #define IEEE80211_INACT_WAIT 15 /* inactivity interval (secs) */ #define IEEE80211_INACT_INIT (30/IEEE80211_INACT_WAIT) /* initial */ #define IEEE80211_INACT_AUTH (180/IEEE80211_INACT_WAIT) /* associated but not authorized */ #define IEEE80211_INACT_RUN (300/IEEE80211_INACT_WAIT) /* authorized */ #define IEEE80211_INACT_PROBE (30/IEEE80211_INACT_WAIT) /* probe */ #define IEEE80211_INACT_SCAN (300/IEEE80211_INACT_WAIT) /* scanned */ #define IEEE80211_TRANS_WAIT 2 /* mgt frame tx timer (secs) */ /* threshold for aging overlapping non-ERP bss */ #define IEEE80211_NONERP_PRESENT_AGE msecs_to_ticks(60*1000) #define IEEE80211_NODE_HASHSIZE 32 /* NB: hash size must be pow2 */ /* simple hash is enough for variation of macaddr */ #define IEEE80211_NODE_HASH(ic, addr) \ (((const uint8_t *)(addr))[IEEE80211_ADDR_LEN - 1] % \ IEEE80211_NODE_HASHSIZE) struct ieee80211_node_table; struct ieee80211com; struct ieee80211vap; struct ieee80211_scanparams; /* * Information element ``blob''. We use this structure * to capture management frame payloads that need to be * retained. Information elements within the payload that * we need to consult have references recorded. */ struct ieee80211_ies { /* the following are either NULL or point within data */ uint8_t *wpa_ie; /* captured WPA ie */ uint8_t *rsn_ie; /* captured RSN ie */ uint8_t *wme_ie; /* captured WME ie */ uint8_t *ath_ie; /* captured Atheros ie */ uint8_t *htcap_ie; /* captured HTCAP ie */ uint8_t *htinfo_ie; /* captured HTINFO ie */ uint8_t *tdma_ie; /* captured TDMA ie */ uint8_t *meshid_ie; /* captured MESH ID ie */ uint8_t *spare[4]; /* NB: these must be the last members of this structure */ uint8_t *data; /* frame data > 802.11 header */ int len; /* data size in bytes */ }; /* * 802.11s (Mesh) Peer Link FSM state. */ enum ieee80211_mesh_mlstate { IEEE80211_NODE_MESH_IDLE = 0, IEEE80211_NODE_MESH_OPENSNT = 1, /* open frame sent */ IEEE80211_NODE_MESH_OPENRCV = 2, /* open frame received */ IEEE80211_NODE_MESH_CONFIRMRCV = 3, /* confirm frame received */ IEEE80211_NODE_MESH_ESTABLISHED = 4, /* link established */ IEEE80211_NODE_MESH_HOLDING = 5, /* link closing */ }; #define IEEE80211_MESH_MLSTATE_BITS \ "\20\1IDLE\2OPENSNT\2OPENRCV\3CONFIRMRCV\4ESTABLISHED\5HOLDING" /* * Node specific information. Note that drivers are expected * to derive from this structure to add device-specific per-node * state. This is done by overriding the ic_node_* methods in * the ieee80211com structure. */ struct ieee80211_node { struct ieee80211vap *ni_vap; /* associated vap */ struct ieee80211com *ni_ic; /* copy from vap to save deref*/ struct ieee80211_node_table *ni_table; /* NB: may be NULL */ TAILQ_ENTRY(ieee80211_node) ni_list; /* list of all nodes */ LIST_ENTRY(ieee80211_node) ni_hash; /* hash collision list */ u_int ni_refcnt; /* count of held references */ u_int ni_flags; #define IEEE80211_NODE_AUTH 0x000001 /* authorized for data */ #define IEEE80211_NODE_QOS 0x000002 /* QoS enabled */ #define IEEE80211_NODE_ERP 0x000004 /* ERP enabled */ /* NB: this must have the same value as IEEE80211_FC1_PWR_MGT */ #define IEEE80211_NODE_PWR_MGT 0x000010 /* power save mode enabled */ #define IEEE80211_NODE_AREF 0x000020 /* authentication ref held */ #define IEEE80211_NODE_HT 0x000040 /* HT enabled */ #define IEEE80211_NODE_HTCOMPAT 0x000080 /* HT setup w/ vendor OUI's */ #define IEEE80211_NODE_WPS 0x000100 /* WPS association */ #define IEEE80211_NODE_TSN 0x000200 /* TSN association */ #define IEEE80211_NODE_AMPDU_RX 0x000400 /* AMPDU rx enabled */ #define IEEE80211_NODE_AMPDU_TX 0x000800 /* AMPDU tx enabled */ #define IEEE80211_NODE_MIMO_PS 0x001000 /* MIMO power save enabled */ #define IEEE80211_NODE_MIMO_RTS 0x002000 /* send RTS in MIMO PS */ #define IEEE80211_NODE_RIFS 0x004000 /* RIFS enabled */ #define IEEE80211_NODE_SGI20 0x008000 /* Short GI in HT20 enabled */ #define IEEE80211_NODE_SGI40 0x010000 /* Short GI in HT40 enabled */ #define IEEE80211_NODE_ASSOCID 0x020000 /* xmit requires associd */ #define IEEE80211_NODE_AMSDU_RX 0x040000 /* AMSDU rx enabled */ #define IEEE80211_NODE_AMSDU_TX 0x080000 /* AMSDU tx enabled */ uint16_t ni_associd; /* association ID */ uint16_t ni_vlan; /* vlan tag */ uint16_t ni_txpower; /* current transmit power */ uint8_t ni_authmode; /* authentication algorithm */ uint8_t ni_ath_flags; /* Atheros feature flags */ /* NB: These must have the same values as IEEE80211_ATHC_* */ #define IEEE80211_NODE_TURBOP 0x0001 /* Turbo prime enable */ #define IEEE80211_NODE_COMP 0x0002 /* Compresssion enable */ #define IEEE80211_NODE_FF 0x0004 /* Fast Frame capable */ #define IEEE80211_NODE_XR 0x0008 /* Atheros WME enable */ #define IEEE80211_NODE_AR 0x0010 /* AR capable */ #define IEEE80211_NODE_BOOST 0x0080 /* Dynamic Turbo boosted */ uint16_t ni_ath_defkeyix;/* Atheros def key index */ const struct ieee80211_txparam *ni_txparms; uint32_t ni_jointime; /* time of join (secs) */ uint32_t *ni_challenge; /* shared-key challenge */ struct ieee80211_ies ni_ies; /* captured ie's */ /* tx seq per-tid */ ieee80211_seq ni_txseqs[IEEE80211_TID_SIZE]; /* rx seq previous per-tid*/ ieee80211_seq ni_rxseqs[IEEE80211_TID_SIZE]; uint32_t ni_rxfragstamp; /* time stamp of last rx frag */ struct mbuf *ni_rxfrag[3]; /* rx frag reassembly */ struct ieee80211_key ni_ucastkey; /* unicast key */ /* hardware */ uint32_t ni_avgrssi; /* recv ssi state */ int8_t ni_noise; /* noise floor */ /* mimo statistics */ uint32_t ni_mimo_rssi_ctl[IEEE80211_MAX_CHAINS]; uint32_t ni_mimo_rssi_ext[IEEE80211_MAX_CHAINS]; uint8_t ni_mimo_noise_ctl[IEEE80211_MAX_CHAINS]; uint8_t ni_mimo_noise_ext[IEEE80211_MAX_CHAINS]; uint8_t ni_mimo_chains; /* header */ uint8_t ni_macaddr[IEEE80211_ADDR_LEN]; uint8_t ni_bssid[IEEE80211_ADDR_LEN]; /* beacon, probe response */ union { uint8_t data[8]; u_int64_t tsf; } ni_tstamp; /* from last rcv'd beacon */ uint16_t ni_intval; /* beacon interval */ uint16_t ni_capinfo; /* capabilities */ uint8_t ni_esslen; uint8_t ni_essid[IEEE80211_NWID_LEN]; struct ieee80211_rateset ni_rates; /* negotiated rate set */ struct ieee80211_channel *ni_chan; uint16_t ni_fhdwell; /* FH only */ uint8_t ni_fhindex; /* FH only */ uint16_t ni_erp; /* ERP from beacon/probe resp */ uint16_t ni_timoff; /* byte offset to TIM ie */ uint8_t ni_dtim_period; /* DTIM period */ uint8_t ni_dtim_count; /* DTIM count for last bcn */ /* 11s state */ uint8_t ni_meshidlen; uint8_t ni_meshid[IEEE80211_MESHID_LEN]; enum ieee80211_mesh_mlstate ni_mlstate; /* peering management state */ uint16_t ni_mllid; /* link local ID */ uint16_t ni_mlpid; /* link peer ID */ struct callout ni_mltimer; /* link mesh timer */ uint8_t ni_mlrcnt; /* link mesh retry counter */ uint8_t ni_mltval; /* link mesh timer value */ struct callout ni_mlhtimer; /* link mesh backoff timer */ uint8_t ni_mlhcnt; /* link mesh holding counter */ /* 11n state */ uint16_t ni_htcap; /* HT capabilities */ uint8_t ni_htparam; /* HT params */ uint8_t ni_htctlchan; /* HT control channel */ uint8_t ni_ht2ndchan; /* HT 2nd channel */ uint8_t ni_htopmode; /* HT operating mode */ uint8_t ni_htstbc; /* HT */ uint8_t ni_chw; /* negotiated channel width */ struct ieee80211_htrateset ni_htrates; /* negotiated ht rate set */ struct ieee80211_tx_ampdu ni_tx_ampdu[WME_NUM_TID]; struct ieee80211_rx_ampdu ni_rx_ampdu[WME_NUM_TID]; /* fast-frames state */ struct mbuf * ni_tx_superg[WME_NUM_TID]; /* others */ short ni_inact; /* inactivity mark count */ short ni_inact_reload;/* inactivity reload value */ int ni_txrate; /* legacy rate/MCS */ struct ieee80211_psq ni_psq; /* power save queue */ struct ieee80211_nodestats ni_stats; /* per-node statistics */ struct ieee80211vap *ni_wdsvap; /* associated WDS vap */ void *ni_rctls; /* private ratectl state */ uint64_t ni_spare[3]; }; MALLOC_DECLARE(M_80211_NODE); MALLOC_DECLARE(M_80211_NODE_IE); #define IEEE80211_NODE_ATH (IEEE80211_NODE_FF | IEEE80211_NODE_TURBOP) #define IEEE80211_NODE_AMPDU \ (IEEE80211_NODE_AMPDU_RX | IEEE80211_NODE_AMPDU_TX) #define IEEE80211_NODE_AMSDU \ (IEEE80211_NODE_AMSDU_RX | IEEE80211_NODE_AMSDU_TX) #define IEEE80211_NODE_HT_ALL \ (IEEE80211_NODE_HT | IEEE80211_NODE_HTCOMPAT | \ IEEE80211_NODE_AMPDU | IEEE80211_NODE_AMSDU | \ IEEE80211_NODE_MIMO_PS | IEEE80211_NODE_MIMO_RTS | \ IEEE80211_NODE_RIFS | IEEE80211_NODE_SGI20 | IEEE80211_NODE_SGI40) #define IEEE80211_NODE_BITS \ "\20\1AUTH\2QOS\3ERP\5PWR_MGT\6AREF\7HT\10HTCOMPAT\11WPS\12TSN" \ "\13AMPDU_RX\14AMPDU_TX\15MIMO_PS\16MIMO_RTS\17RIFS\20SGI20\21SGI40" \ "\22ASSOCID" #define IEEE80211_NODE_AID(ni) IEEE80211_AID(ni->ni_associd) #define IEEE80211_NODE_STAT(ni,stat) (ni->ni_stats.ns_##stat++) #define IEEE80211_NODE_STAT_ADD(ni,stat,v) (ni->ni_stats.ns_##stat += v) #define IEEE80211_NODE_STAT_SET(ni,stat,v) (ni->ni_stats.ns_##stat = v) /* * Filtered rssi calculation support. The receive rssi is maintained * as an average over the last 10 frames received using a low pass filter * (all frames for now, possibly need to be more selective). Calculations * are designed such that a good compiler can optimize them. The avg * rssi state should be initialized to IEEE80211_RSSI_DUMMY_MARKER and * each sample incorporated with IEEE80211_RSSI_LPF. Use IEEE80211_RSSI_GET * to extract the current value. * * Note that we assume rssi data are in the range [-127..127] and we * discard values <-20. This is consistent with assumptions throughout * net80211 that signal strength data are in .5 dBm units relative to * the current noise floor (linear, not log). */ #define IEEE80211_RSSI_LPF_LEN 10 #define IEEE80211_RSSI_DUMMY_MARKER 127 /* NB: pow2 to optimize out * and / */ #define IEEE80211_RSSI_EP_MULTIPLIER (1<<7) #define IEEE80211_RSSI_IN(x) ((x) * IEEE80211_RSSI_EP_MULTIPLIER) #define _IEEE80211_RSSI_LPF(x, y, len) \ (((x) != IEEE80211_RSSI_DUMMY_MARKER) ? (((x) * ((len) - 1) + (y)) / (len)) : (y)) #define IEEE80211_RSSI_LPF(x, y) do { \ if ((y) >= -20) { \ x = _IEEE80211_RSSI_LPF((x), IEEE80211_RSSI_IN((y)), \ IEEE80211_RSSI_LPF_LEN); \ } \ } while (0) #define IEEE80211_RSSI_EP_RND(x, mul) \ ((((x) % (mul)) >= ((mul)/2)) ? ((x) + ((mul) - 1)) / (mul) : (x)/(mul)) #define IEEE80211_RSSI_GET(x) \ IEEE80211_RSSI_EP_RND(x, IEEE80211_RSSI_EP_MULTIPLIER) static __inline struct ieee80211_node * ieee80211_ref_node(struct ieee80211_node *ni) { ieee80211_node_incref(ni); return ni; } static __inline void ieee80211_unref_node(struct ieee80211_node **ni) { ieee80211_node_decref(*ni); *ni = NULL; /* guard against use */ } void ieee80211_node_attach(struct ieee80211com *); void ieee80211_node_lateattach(struct ieee80211com *); void ieee80211_node_detach(struct ieee80211com *); void ieee80211_node_vattach(struct ieee80211vap *); void ieee80211_node_latevattach(struct ieee80211vap *); void ieee80211_node_vdetach(struct ieee80211vap *); static __inline int ieee80211_node_is_authorized(const struct ieee80211_node *ni) { return (ni->ni_flags & IEEE80211_NODE_AUTH); } void ieee80211_node_authorize(struct ieee80211_node *); void ieee80211_node_unauthorize(struct ieee80211_node *); void ieee80211_node_setuptxparms(struct ieee80211_node *); void ieee80211_node_set_chan(struct ieee80211_node *, struct ieee80211_channel *); void ieee80211_create_ibss(struct ieee80211vap*, struct ieee80211_channel *); void ieee80211_reset_bss(struct ieee80211vap *); void ieee80211_sync_curchan(struct ieee80211com *); void ieee80211_setupcurchan(struct ieee80211com *, struct ieee80211_channel *); void ieee80211_setcurchan(struct ieee80211com *, struct ieee80211_channel *); void ieee80211_update_chw(struct ieee80211com *); int ieee80211_ibss_merge_check(struct ieee80211_node *); int ieee80211_ibss_node_check_new(struct ieee80211_node *ni, const struct ieee80211_scanparams *); int ieee80211_ibss_merge(struct ieee80211_node *); struct ieee80211_scan_entry; int ieee80211_sta_join(struct ieee80211vap *, struct ieee80211_channel *, const struct ieee80211_scan_entry *); void ieee80211_sta_leave(struct ieee80211_node *); void ieee80211_node_deauth(struct ieee80211_node *, int); int ieee80211_ies_init(struct ieee80211_ies *, const uint8_t *, int); void ieee80211_ies_cleanup(struct ieee80211_ies *); void ieee80211_ies_expand(struct ieee80211_ies *); #define ieee80211_ies_setie(_ies, _ie, _off) do { \ (_ies)._ie = (_ies).data + (_off); \ } while (0) /* * Table of ieee80211_node instances. Each ieee80211com * has one that holds association stations (when operating * as an ap) or neighbors (in ibss mode). * * XXX embed this in ieee80211com instead of indirect? */ struct ieee80211_node_table { struct ieee80211com *nt_ic; /* back reference */ ieee80211_node_lock_t nt_nodelock; /* on node table */ TAILQ_HEAD(, ieee80211_node) nt_node; /* information of all nodes */ LIST_HEAD(, ieee80211_node) nt_hash[IEEE80211_NODE_HASHSIZE]; int nt_count; /* number of nodes */ struct ieee80211_node **nt_keyixmap; /* key ix -> node map */ int nt_keyixmax; /* keyixmap size */ const char *nt_name; /* table name for debug msgs */ int nt_inact_init; /* initial node inact setting */ }; struct ieee80211_node *ieee80211_alloc_node(struct ieee80211_node_table *, struct ieee80211vap *, const uint8_t macaddr[IEEE80211_ADDR_LEN]); struct ieee80211_node *ieee80211_tmp_node(struct ieee80211vap *, const uint8_t macaddr[IEEE80211_ADDR_LEN]); struct ieee80211_node *ieee80211_dup_bss(struct ieee80211vap *, const uint8_t macaddr[IEEE80211_ADDR_LEN]); struct ieee80211_node *ieee80211_node_create_wds(struct ieee80211vap *, const uint8_t bssid[IEEE80211_ADDR_LEN], struct ieee80211_channel *); #ifdef IEEE80211_DEBUG_REFCNT void ieee80211_free_node_debug(struct ieee80211_node *, const char *func, int line); struct ieee80211_node *ieee80211_find_node_locked_debug( struct ieee80211_node_table *, const uint8_t macaddr[IEEE80211_ADDR_LEN], const char *func, int line); struct ieee80211_node *ieee80211_find_node_debug(struct ieee80211_node_table *, const uint8_t macaddr[IEEE80211_ADDR_LEN], const char *func, int line); struct ieee80211_node *ieee80211_find_vap_node_locked_debug( struct ieee80211_node_table *, const struct ieee80211vap *vap, const uint8_t macaddr[IEEE80211_ADDR_LEN], const char *func, int line); struct ieee80211_node *ieee80211_find_vap_node_debug( struct ieee80211_node_table *, const struct ieee80211vap *vap, const uint8_t macaddr[IEEE80211_ADDR_LEN], const char *func, int line); struct ieee80211_node * ieee80211_find_rxnode_debug(struct ieee80211com *, const struct ieee80211_frame_min *, const char *func, int line); struct ieee80211_node * ieee80211_find_rxnode_withkey_debug( struct ieee80211com *, const struct ieee80211_frame_min *, uint16_t keyix, const char *func, int line); struct ieee80211_node *ieee80211_find_txnode_debug(struct ieee80211vap *, const uint8_t *, const char *func, int line); #define ieee80211_free_node(ni) \ ieee80211_free_node_debug(ni, __func__, __LINE__) #define ieee80211_find_node_locked(nt, mac) \ ieee80211_find_node_locked_debug(nt, mac, __func__, __LINE__) #define ieee80211_find_node(nt, mac) \ ieee80211_find_node_debug(nt, mac, __func__, __LINE__) #define ieee80211_find_vap_node_locked(nt, vap, mac) \ ieee80211_find_vap_node_locked_debug(nt, vap, mac, __func__, __LINE__) #define ieee80211_find_vap_node(nt, vap, mac) \ ieee80211_find_vap_node_debug(nt, vap, mac, __func__, __LINE__) #define ieee80211_find_rxnode(ic, wh) \ ieee80211_find_rxnode_debug(ic, wh, __func__, __LINE__) #define ieee80211_find_rxnode_withkey(ic, wh, keyix) \ ieee80211_find_rxnode_withkey_debug(ic, wh, keyix, __func__, __LINE__) #define ieee80211_find_txnode(vap, mac) \ ieee80211_find_txnode_debug(vap, mac, __func__, __LINE__) #else void ieee80211_free_node(struct ieee80211_node *); struct ieee80211_node *ieee80211_find_node_locked(struct ieee80211_node_table *, const uint8_t macaddr[IEEE80211_ADDR_LEN]); struct ieee80211_node *ieee80211_find_node(struct ieee80211_node_table *, const uint8_t macaddr[IEEE80211_ADDR_LEN]); struct ieee80211_node *ieee80211_find_vap_node_locked( struct ieee80211_node_table *, const struct ieee80211vap *, const uint8_t macaddr[IEEE80211_ADDR_LEN]); struct ieee80211_node *ieee80211_find_vap_node( struct ieee80211_node_table *, const struct ieee80211vap *, const uint8_t macaddr[IEEE80211_ADDR_LEN]); struct ieee80211_node * ieee80211_find_rxnode(struct ieee80211com *, const struct ieee80211_frame_min *); struct ieee80211_node * ieee80211_find_rxnode_withkey(struct ieee80211com *, const struct ieee80211_frame_min *, uint16_t keyix); struct ieee80211_node *ieee80211_find_txnode(struct ieee80211vap *, const uint8_t macaddr[IEEE80211_ADDR_LEN]); #endif int ieee80211_node_delucastkey(struct ieee80211_node *); void ieee80211_node_timeout(void *arg); typedef void ieee80211_iter_func(void *, struct ieee80211_node *); int ieee80211_iterate_nodes_vap(struct ieee80211_node_table *, struct ieee80211vap *, ieee80211_iter_func *, void *); void ieee80211_iterate_nodes(struct ieee80211_node_table *, ieee80211_iter_func *, void *); void ieee80211_notify_erp(struct ieee80211com *); void ieee80211_dump_node(struct ieee80211_node_table *, struct ieee80211_node *); void ieee80211_dump_nodes(struct ieee80211_node_table *); struct ieee80211_node *ieee80211_fakeup_adhoc_node(struct ieee80211vap *, const uint8_t macaddr[IEEE80211_ADDR_LEN]); struct ieee80211_scanparams; void ieee80211_init_neighbor(struct ieee80211_node *, const struct ieee80211_frame *, const struct ieee80211_scanparams *); struct ieee80211_node *ieee80211_add_neighbor(struct ieee80211vap *, const struct ieee80211_frame *, const struct ieee80211_scanparams *); void ieee80211_node_join(struct ieee80211_node *,int); void ieee80211_node_leave(struct ieee80211_node *); int8_t ieee80211_getrssi(struct ieee80211vap *); void ieee80211_getsignal(struct ieee80211vap *, int8_t *, int8_t *); #endif /* _NET80211_IEEE80211_NODE_H_ */ Index: head/sys/netgraph/ng_base.c =================================================================== --- head/sys/netgraph/ng_base.c (revision 308456) +++ head/sys/netgraph/ng_base.c (revision 308457) @@ -1,3846 +1,3846 @@ /*- * Copyright (c) 1996-1999 Whistle Communications, Inc. * All rights reserved. * * Subject to the following obligations and disclaimer of warranty, use and * redistribution of this software, in source or object code forms, with or * without modifications are expressly permitted by Whistle Communications; * provided, however, that: * 1. Any and all reproductions of the source or object code must include the * copyright notice above and the following disclaimer of warranties; and * 2. No rights are granted, in any manner or form, to use Whistle * Communications, Inc. trademarks, including the mark "WHISTLE * COMMUNICATIONS" on advertising, endorsements, or otherwise except as * such appears in the above copyright notice or in the software. * * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. * * Authors: Julian Elischer * Archie Cobbs * * $FreeBSD$ * $Whistle: ng_base.c,v 1.39 1999/01/28 23:54:53 julian Exp $ */ /* * This file implements the base netgraph code. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(netgraph, NG_ABI_VERSION); /* Mutex to protect topology events. */ static struct rwlock ng_topo_lock; #define TOPOLOGY_RLOCK() rw_rlock(&ng_topo_lock) #define TOPOLOGY_RUNLOCK() rw_runlock(&ng_topo_lock) #define TOPOLOGY_WLOCK() rw_wlock(&ng_topo_lock) #define TOPOLOGY_WUNLOCK() rw_wunlock(&ng_topo_lock) #define TOPOLOGY_NOTOWNED() rw_assert(&ng_topo_lock, RA_UNLOCKED) #ifdef NETGRAPH_DEBUG static struct mtx ng_nodelist_mtx; /* protects global node/hook lists */ static struct mtx ngq_mtx; /* protects the queue item list */ static SLIST_HEAD(, ng_node) ng_allnodes; static LIST_HEAD(, ng_node) ng_freenodes; /* in debug, we never free() them */ static SLIST_HEAD(, ng_hook) ng_allhooks; static LIST_HEAD(, ng_hook) ng_freehooks; /* in debug, we never free() them */ static void ng_dumpitems(void); static void ng_dumpnodes(void); static void ng_dumphooks(void); #endif /* NETGRAPH_DEBUG */ /* * DEAD versions of the structures. * In order to avoid races, it is sometimes necessary to point * at SOMETHING even though theoretically, the current entity is * INVALID. Use these to avoid these races. */ struct ng_type ng_deadtype = { NG_ABI_VERSION, "dead", NULL, /* modevent */ NULL, /* constructor */ NULL, /* rcvmsg */ NULL, /* shutdown */ NULL, /* newhook */ NULL, /* findhook */ NULL, /* connect */ NULL, /* rcvdata */ NULL, /* disconnect */ NULL, /* cmdlist */ }; struct ng_node ng_deadnode = { "dead", &ng_deadtype, NGF_INVALID, 0, /* numhooks */ NULL, /* private */ 0, /* ID */ LIST_HEAD_INITIALIZER(ng_deadnode.nd_hooks), {}, /* all_nodes list entry */ {}, /* id hashtable list entry */ { 0, 0, {}, /* should never use! (should hang) */ {}, /* workqueue entry */ STAILQ_HEAD_INITIALIZER(ng_deadnode.nd_input_queue.queue), }, 1, /* refs */ NULL, /* vnet */ #ifdef NETGRAPH_DEBUG ND_MAGIC, __FILE__, __LINE__, {NULL} #endif /* NETGRAPH_DEBUG */ }; struct ng_hook ng_deadhook = { "dead", NULL, /* private */ HK_INVALID | HK_DEAD, 0, /* undefined data link type */ &ng_deadhook, /* Peer is self */ &ng_deadnode, /* attached to deadnode */ {}, /* hooks list */ NULL, /* override rcvmsg() */ NULL, /* override rcvdata() */ 1, /* refs always >= 1 */ #ifdef NETGRAPH_DEBUG HK_MAGIC, __FILE__, __LINE__, {NULL} #endif /* NETGRAPH_DEBUG */ }; /* * END DEAD STRUCTURES */ /* List nodes with unallocated work */ static STAILQ_HEAD(, ng_node) ng_worklist = STAILQ_HEAD_INITIALIZER(ng_worklist); static struct mtx ng_worklist_mtx; /* MUST LOCK NODE FIRST */ /* List of installed types */ static LIST_HEAD(, ng_type) ng_typelist; static struct rwlock ng_typelist_lock; #define TYPELIST_RLOCK() rw_rlock(&ng_typelist_lock) #define TYPELIST_RUNLOCK() rw_runlock(&ng_typelist_lock) #define TYPELIST_WLOCK() rw_wlock(&ng_typelist_lock) #define TYPELIST_WUNLOCK() rw_wunlock(&ng_typelist_lock) /* Hash related definitions. */ LIST_HEAD(nodehash, ng_node); static VNET_DEFINE(struct nodehash *, ng_ID_hash); static VNET_DEFINE(u_long, ng_ID_hmask); static VNET_DEFINE(u_long, ng_nodes); static VNET_DEFINE(struct nodehash *, ng_name_hash); static VNET_DEFINE(u_long, ng_name_hmask); static VNET_DEFINE(u_long, ng_named_nodes); #define V_ng_ID_hash VNET(ng_ID_hash) #define V_ng_ID_hmask VNET(ng_ID_hmask) #define V_ng_nodes VNET(ng_nodes) #define V_ng_name_hash VNET(ng_name_hash) #define V_ng_name_hmask VNET(ng_name_hmask) #define V_ng_named_nodes VNET(ng_named_nodes) static struct rwlock ng_idhash_lock; #define IDHASH_RLOCK() rw_rlock(&ng_idhash_lock) #define IDHASH_RUNLOCK() rw_runlock(&ng_idhash_lock) #define IDHASH_WLOCK() rw_wlock(&ng_idhash_lock) #define IDHASH_WUNLOCK() rw_wunlock(&ng_idhash_lock) /* Method to find a node.. used twice so do it here */ #define NG_IDHASH_FN(ID) ((ID) % (V_ng_ID_hmask + 1)) #define NG_IDHASH_FIND(ID, node) \ do { \ rw_assert(&ng_idhash_lock, RA_LOCKED); \ LIST_FOREACH(node, &V_ng_ID_hash[NG_IDHASH_FN(ID)], \ nd_idnodes) { \ if (NG_NODE_IS_VALID(node) \ && (NG_NODE_ID(node) == ID)) { \ break; \ } \ } \ } while (0) static struct rwlock ng_namehash_lock; #define NAMEHASH_RLOCK() rw_rlock(&ng_namehash_lock) #define NAMEHASH_RUNLOCK() rw_runlock(&ng_namehash_lock) #define NAMEHASH_WLOCK() rw_wlock(&ng_namehash_lock) #define NAMEHASH_WUNLOCK() rw_wunlock(&ng_namehash_lock) /* Internal functions */ static int ng_add_hook(node_p node, const char *name, hook_p * hookp); static int ng_generic_msg(node_p here, item_p item, hook_p lasthook); static ng_ID_t ng_decodeidname(const char *name); static int ngb_mod_event(module_t mod, int event, void *data); static void ng_worklist_add(node_p node); static void ngthread(void *); static int ng_apply_item(node_p node, item_p item, int rw); static void ng_flush_input_queue(node_p node); static node_p ng_ID2noderef(ng_ID_t ID); static int ng_con_nodes(item_p item, node_p node, const char *name, node_p node2, const char *name2); static int ng_con_part2(node_p node, item_p item, hook_p hook); static int ng_con_part3(node_p node, item_p item, hook_p hook); static int ng_mkpeer(node_p node, const char *name, const char *name2, char *type); static void ng_name_rehash(void); static void ng_ID_rehash(void); /* Imported, these used to be externally visible, some may go back. */ void ng_destroy_hook(hook_p hook); int ng_path2noderef(node_p here, const char *path, node_p *dest, hook_p *lasthook); int ng_make_node(const char *type, node_p *nodepp); int ng_path_parse(char *addr, char **node, char **path, char **hook); void ng_rmnode(node_p node, hook_p dummy1, void *dummy2, int dummy3); void ng_unname(node_p node); /* Our own netgraph malloc type */ MALLOC_DEFINE(M_NETGRAPH, "netgraph", "netgraph structures and ctrl messages"); MALLOC_DEFINE(M_NETGRAPH_MSG, "netgraph_msg", "netgraph name storage"); static MALLOC_DEFINE(M_NETGRAPH_HOOK, "netgraph_hook", "netgraph hook structures"); static MALLOC_DEFINE(M_NETGRAPH_NODE, "netgraph_node", "netgraph node structures"); static MALLOC_DEFINE(M_NETGRAPH_ITEM, "netgraph_item", "netgraph item structures"); /* Should not be visible outside this file */ #define _NG_ALLOC_HOOK(hook) \ hook = malloc(sizeof(*hook), M_NETGRAPH_HOOK, M_NOWAIT | M_ZERO) #define _NG_ALLOC_NODE(node) \ node = malloc(sizeof(*node), M_NETGRAPH_NODE, M_NOWAIT | M_ZERO) #define NG_QUEUE_LOCK_INIT(n) \ mtx_init(&(n)->q_mtx, "ng_node", NULL, MTX_DEF) #define NG_QUEUE_LOCK(n) \ mtx_lock(&(n)->q_mtx) #define NG_QUEUE_UNLOCK(n) \ mtx_unlock(&(n)->q_mtx) #define NG_WORKLIST_LOCK_INIT() \ mtx_init(&ng_worklist_mtx, "ng_worklist", NULL, MTX_DEF) #define NG_WORKLIST_LOCK() \ mtx_lock(&ng_worklist_mtx) #define NG_WORKLIST_UNLOCK() \ mtx_unlock(&ng_worklist_mtx) #define NG_WORKLIST_SLEEP() \ mtx_sleep(&ng_worklist, &ng_worklist_mtx, PI_NET, "sleep", 0) #define NG_WORKLIST_WAKEUP() \ wakeup_one(&ng_worklist) #ifdef NETGRAPH_DEBUG /*----------------------------------------------*/ /* * In debug mode: * In an attempt to help track reference count screwups * we do not free objects back to the malloc system, but keep them * in a local cache where we can examine them and keep information safely * after they have been freed. * We use this scheme for nodes and hooks, and to some extent for items. */ static __inline hook_p ng_alloc_hook(void) { hook_p hook; SLIST_ENTRY(ng_hook) temp; mtx_lock(&ng_nodelist_mtx); hook = LIST_FIRST(&ng_freehooks); if (hook) { LIST_REMOVE(hook, hk_hooks); bcopy(&hook->hk_all, &temp, sizeof(temp)); bzero(hook, sizeof(struct ng_hook)); bcopy(&temp, &hook->hk_all, sizeof(temp)); mtx_unlock(&ng_nodelist_mtx); hook->hk_magic = HK_MAGIC; } else { mtx_unlock(&ng_nodelist_mtx); _NG_ALLOC_HOOK(hook); if (hook) { hook->hk_magic = HK_MAGIC; mtx_lock(&ng_nodelist_mtx); SLIST_INSERT_HEAD(&ng_allhooks, hook, hk_all); mtx_unlock(&ng_nodelist_mtx); } } return (hook); } static __inline node_p ng_alloc_node(void) { node_p node; SLIST_ENTRY(ng_node) temp; mtx_lock(&ng_nodelist_mtx); node = LIST_FIRST(&ng_freenodes); if (node) { LIST_REMOVE(node, nd_nodes); bcopy(&node->nd_all, &temp, sizeof(temp)); bzero(node, sizeof(struct ng_node)); bcopy(&temp, &node->nd_all, sizeof(temp)); mtx_unlock(&ng_nodelist_mtx); node->nd_magic = ND_MAGIC; } else { mtx_unlock(&ng_nodelist_mtx); _NG_ALLOC_NODE(node); if (node) { node->nd_magic = ND_MAGIC; mtx_lock(&ng_nodelist_mtx); SLIST_INSERT_HEAD(&ng_allnodes, node, nd_all); mtx_unlock(&ng_nodelist_mtx); } } return (node); } #define NG_ALLOC_HOOK(hook) do { (hook) = ng_alloc_hook(); } while (0) #define NG_ALLOC_NODE(node) do { (node) = ng_alloc_node(); } while (0) #define NG_FREE_HOOK(hook) \ do { \ mtx_lock(&ng_nodelist_mtx); \ LIST_INSERT_HEAD(&ng_freehooks, hook, hk_hooks); \ hook->hk_magic = 0; \ mtx_unlock(&ng_nodelist_mtx); \ } while (0) #define NG_FREE_NODE(node) \ do { \ mtx_lock(&ng_nodelist_mtx); \ LIST_INSERT_HEAD(&ng_freenodes, node, nd_nodes); \ node->nd_magic = 0; \ mtx_unlock(&ng_nodelist_mtx); \ } while (0) #else /* NETGRAPH_DEBUG */ /*----------------------------------------------*/ #define NG_ALLOC_HOOK(hook) _NG_ALLOC_HOOK(hook) #define NG_ALLOC_NODE(node) _NG_ALLOC_NODE(node) #define NG_FREE_HOOK(hook) do { free((hook), M_NETGRAPH_HOOK); } while (0) #define NG_FREE_NODE(node) do { free((node), M_NETGRAPH_NODE); } while (0) #endif /* NETGRAPH_DEBUG */ /*----------------------------------------------*/ /* Set this to kdb_enter("X") to catch all errors as they occur */ #ifndef TRAP_ERROR #define TRAP_ERROR() #endif static VNET_DEFINE(ng_ID_t, nextID) = 1; #define V_nextID VNET(nextID) #ifdef INVARIANTS #define CHECK_DATA_MBUF(m) do { \ struct mbuf *n; \ int total; \ \ M_ASSERTPKTHDR(m); \ for (total = 0, n = (m); n != NULL; n = n->m_next) { \ total += n->m_len; \ if (n->m_nextpkt != NULL) \ panic("%s: m_nextpkt", __func__); \ } \ \ if ((m)->m_pkthdr.len != total) { \ panic("%s: %d != %d", \ __func__, (m)->m_pkthdr.len, total); \ } \ } while (0) #else #define CHECK_DATA_MBUF(m) #endif #define ERROUT(x) do { error = (x); goto done; } while (0) /************************************************************************ Parse type definitions for generic messages ************************************************************************/ /* Handy structure parse type defining macro */ #define DEFINE_PARSE_STRUCT_TYPE(lo, up, args) \ static const struct ng_parse_struct_field \ ng_ ## lo ## _type_fields[] = NG_GENERIC_ ## up ## _INFO args; \ static const struct ng_parse_type ng_generic_ ## lo ## _type = { \ &ng_parse_struct_type, \ &ng_ ## lo ## _type_fields \ } DEFINE_PARSE_STRUCT_TYPE(mkpeer, MKPEER, ()); DEFINE_PARSE_STRUCT_TYPE(connect, CONNECT, ()); DEFINE_PARSE_STRUCT_TYPE(name, NAME, ()); DEFINE_PARSE_STRUCT_TYPE(rmhook, RMHOOK, ()); DEFINE_PARSE_STRUCT_TYPE(nodeinfo, NODEINFO, ()); DEFINE_PARSE_STRUCT_TYPE(typeinfo, TYPEINFO, ()); DEFINE_PARSE_STRUCT_TYPE(linkinfo, LINKINFO, (&ng_generic_nodeinfo_type)); /* Get length of an array when the length is stored as a 32 bit value immediately preceding the array -- as with struct namelist and struct typelist. */ static int ng_generic_list_getLength(const struct ng_parse_type *type, const u_char *start, const u_char *buf) { return *((const u_int32_t *)(buf - 4)); } /* Get length of the array of struct linkinfo inside a struct hooklist */ static int ng_generic_linkinfo_getLength(const struct ng_parse_type *type, const u_char *start, const u_char *buf) { const struct hooklist *hl = (const struct hooklist *)start; return hl->nodeinfo.hooks; } /* Array type for a variable length array of struct namelist */ static const struct ng_parse_array_info ng_nodeinfoarray_type_info = { &ng_generic_nodeinfo_type, &ng_generic_list_getLength }; static const struct ng_parse_type ng_generic_nodeinfoarray_type = { &ng_parse_array_type, &ng_nodeinfoarray_type_info }; /* Array type for a variable length array of struct typelist */ static const struct ng_parse_array_info ng_typeinfoarray_type_info = { &ng_generic_typeinfo_type, &ng_generic_list_getLength }; static const struct ng_parse_type ng_generic_typeinfoarray_type = { &ng_parse_array_type, &ng_typeinfoarray_type_info }; /* Array type for array of struct linkinfo in struct hooklist */ static const struct ng_parse_array_info ng_generic_linkinfo_array_type_info = { &ng_generic_linkinfo_type, &ng_generic_linkinfo_getLength }; static const struct ng_parse_type ng_generic_linkinfo_array_type = { &ng_parse_array_type, &ng_generic_linkinfo_array_type_info }; DEFINE_PARSE_STRUCT_TYPE(typelist, TYPELIST, (&ng_generic_typeinfoarray_type)); DEFINE_PARSE_STRUCT_TYPE(hooklist, HOOKLIST, (&ng_generic_nodeinfo_type, &ng_generic_linkinfo_array_type)); DEFINE_PARSE_STRUCT_TYPE(listnodes, LISTNODES, (&ng_generic_nodeinfoarray_type)); /* List of commands and how to convert arguments to/from ASCII */ static const struct ng_cmdlist ng_generic_cmds[] = { { NGM_GENERIC_COOKIE, NGM_SHUTDOWN, "shutdown", NULL, NULL }, { NGM_GENERIC_COOKIE, NGM_MKPEER, "mkpeer", &ng_generic_mkpeer_type, NULL }, { NGM_GENERIC_COOKIE, NGM_CONNECT, "connect", &ng_generic_connect_type, NULL }, { NGM_GENERIC_COOKIE, NGM_NAME, "name", &ng_generic_name_type, NULL }, { NGM_GENERIC_COOKIE, NGM_RMHOOK, "rmhook", &ng_generic_rmhook_type, NULL }, { NGM_GENERIC_COOKIE, NGM_NODEINFO, "nodeinfo", NULL, &ng_generic_nodeinfo_type }, { NGM_GENERIC_COOKIE, NGM_LISTHOOKS, "listhooks", NULL, &ng_generic_hooklist_type }, { NGM_GENERIC_COOKIE, NGM_LISTNAMES, "listnames", NULL, &ng_generic_listnodes_type /* same as NGM_LISTNODES */ }, { NGM_GENERIC_COOKIE, NGM_LISTNODES, "listnodes", NULL, &ng_generic_listnodes_type }, { NGM_GENERIC_COOKIE, NGM_LISTTYPES, "listtypes", NULL, &ng_generic_typelist_type }, { NGM_GENERIC_COOKIE, NGM_TEXT_CONFIG, "textconfig", NULL, &ng_parse_string_type }, { NGM_GENERIC_COOKIE, NGM_TEXT_STATUS, "textstatus", NULL, &ng_parse_string_type }, { NGM_GENERIC_COOKIE, NGM_ASCII2BINARY, "ascii2binary", &ng_parse_ng_mesg_type, &ng_parse_ng_mesg_type }, { NGM_GENERIC_COOKIE, NGM_BINARY2ASCII, "binary2ascii", &ng_parse_ng_mesg_type, &ng_parse_ng_mesg_type }, { 0 } }; /************************************************************************ Node routines ************************************************************************/ /* * Instantiate a node of the requested type */ int ng_make_node(const char *typename, node_p *nodepp) { struct ng_type *type; int error; /* Check that the type makes sense */ if (typename == NULL) { TRAP_ERROR(); return (EINVAL); } /* Locate the node type. If we fail we return. Do not try to load * module. */ if ((type = ng_findtype(typename)) == NULL) return (ENXIO); /* * If we have a constructor, then make the node and * call the constructor to do type specific initialisation. */ if (type->constructor != NULL) { if ((error = ng_make_node_common(type, nodepp)) == 0) { if ((error = ((*type->constructor)(*nodepp))) != 0) { NG_NODE_UNREF(*nodepp); } } } else { /* * Node has no constructor. We cannot ask for one * to be made. It must be brought into existence by * some external agency. The external agency should * call ng_make_node_common() directly to get the * netgraph part initialised. */ TRAP_ERROR(); error = EINVAL; } return (error); } /* * Generic node creation. Called by node initialisation for externally * instantiated nodes (e.g. hardware, sockets, etc ). * The returned node has a reference count of 1. */ int ng_make_node_common(struct ng_type *type, node_p *nodepp) { node_p node; /* Require the node type to have been already installed */ if (ng_findtype(type->name) == NULL) { TRAP_ERROR(); return (EINVAL); } /* Make a node and try attach it to the type */ NG_ALLOC_NODE(node); if (node == NULL) { TRAP_ERROR(); return (ENOMEM); } node->nd_type = type; #ifdef VIMAGE node->nd_vnet = curvnet; #endif NG_NODE_REF(node); /* note reference */ type->refs++; NG_QUEUE_LOCK_INIT(&node->nd_input_queue); STAILQ_INIT(&node->nd_input_queue.queue); node->nd_input_queue.q_flags = 0; /* Initialize hook list for new node */ LIST_INIT(&node->nd_hooks); /* Get an ID and put us in the hash chain. */ IDHASH_WLOCK(); for (;;) { /* wrap protection, even if silly */ node_p node2 = NULL; node->nd_ID = V_nextID++; /* 137/sec for 1 year before wrap */ /* Is there a problem with the new number? */ NG_IDHASH_FIND(node->nd_ID, node2); /* already taken? */ if ((node->nd_ID != 0) && (node2 == NULL)) { break; } } V_ng_nodes++; if (V_ng_nodes * 2 > V_ng_ID_hmask) ng_ID_rehash(); LIST_INSERT_HEAD(&V_ng_ID_hash[NG_IDHASH_FN(node->nd_ID)], node, nd_idnodes); IDHASH_WUNLOCK(); /* Done */ *nodepp = node; return (0); } /* * Forceably start the shutdown process on a node. Either call * its shutdown method, or do the default shutdown if there is * no type-specific method. * * We can only be called from a shutdown message, so we know we have * a writer lock, and therefore exclusive access. It also means * that we should not be on the work queue, but we check anyhow. * * Persistent node types must have a type-specific method which * allocates a new node in which case, this one is irretrievably going away, * or cleans up anything it needs, and just makes the node valid again, * in which case we allow the node to survive. * * XXX We need to think of how to tell a persistent node that we * REALLY need to go away because the hardware has gone or we * are rebooting.... etc. */ void ng_rmnode(node_p node, hook_p dummy1, void *dummy2, int dummy3) { hook_p hook; /* Check if it's already shutting down */ if ((node->nd_flags & NGF_CLOSING) != 0) return; if (node == &ng_deadnode) { printf ("shutdown called on deadnode\n"); return; } /* Add an extra reference so it doesn't go away during this */ NG_NODE_REF(node); /* * Mark it invalid so any newcomers know not to try use it * Also add our own mark so we can't recurse * note that NGF_INVALID does not do this as it's also set during * creation */ node->nd_flags |= NGF_INVALID|NGF_CLOSING; /* If node has its pre-shutdown method, then call it first*/ if (node->nd_type && node->nd_type->close) (*node->nd_type->close)(node); /* Notify all remaining connected nodes to disconnect */ while ((hook = LIST_FIRST(&node->nd_hooks)) != NULL) ng_destroy_hook(hook); /* * Drain the input queue forceably. * it has no hooks so what's it going to do, bleed on someone? * Theoretically we came here from a queue entry that was added * Just before the queue was closed, so it should be empty anyway. * Also removes us from worklist if needed. */ ng_flush_input_queue(node); /* Ask the type if it has anything to do in this case */ if (node->nd_type && node->nd_type->shutdown) { (*node->nd_type->shutdown)(node); if (NG_NODE_IS_VALID(node)) { /* * Well, blow me down if the node code hasn't declared * that it doesn't want to die. * Presumably it is a persistent node. * If we REALLY want it to go away, * e.g. hardware going away, * Our caller should set NGF_REALLY_DIE in nd_flags. */ node->nd_flags &= ~(NGF_INVALID|NGF_CLOSING); NG_NODE_UNREF(node); /* Assume they still have theirs */ return; } } else { /* do the default thing */ NG_NODE_UNREF(node); } ng_unname(node); /* basically a NOP these days */ /* * Remove extra reference, possibly the last * Possible other holders of references may include * timeout callouts, but theoretically the node's supposed to * have cancelled them. Possibly hardware dependencies may * force a driver to 'linger' with a reference. */ NG_NODE_UNREF(node); } /* * Remove a reference to the node, possibly the last. * deadnode always acts as it it were the last. */ void ng_unref_node(node_p node) { if (node == &ng_deadnode) return; CURVNET_SET(node->nd_vnet); if (refcount_release(&node->nd_refs)) { /* we were the last */ node->nd_type->refs--; /* XXX maybe should get types lock? */ NAMEHASH_WLOCK(); if (NG_NODE_HAS_NAME(node)) { V_ng_named_nodes--; LIST_REMOVE(node, nd_nodes); } NAMEHASH_WUNLOCK(); IDHASH_WLOCK(); V_ng_nodes--; LIST_REMOVE(node, nd_idnodes); IDHASH_WUNLOCK(); mtx_destroy(&node->nd_input_queue.q_mtx); NG_FREE_NODE(node); } CURVNET_RESTORE(); } /************************************************************************ Node ID handling ************************************************************************/ static node_p ng_ID2noderef(ng_ID_t ID) { node_p node; IDHASH_RLOCK(); NG_IDHASH_FIND(ID, node); if (node) NG_NODE_REF(node); IDHASH_RUNLOCK(); return(node); } ng_ID_t ng_node2ID(node_p node) { return (node ? NG_NODE_ID(node) : 0); } /************************************************************************ Node name handling ************************************************************************/ /* * Assign a node a name. */ int ng_name_node(node_p node, const char *name) { uint32_t hash; node_p node2; int i; /* Check the name is valid */ for (i = 0; i < NG_NODESIZ; i++) { if (name[i] == '\0' || name[i] == '.' || name[i] == ':') break; } if (i == 0 || name[i] != '\0') { TRAP_ERROR(); return (EINVAL); } if (ng_decodeidname(name) != 0) { /* valid IDs not allowed here */ TRAP_ERROR(); return (EINVAL); } NAMEHASH_WLOCK(); if (V_ng_named_nodes * 2 > V_ng_name_hmask) ng_name_rehash(); hash = hash32_str(name, HASHINIT) & V_ng_name_hmask; /* Check the name isn't already being used. */ LIST_FOREACH(node2, &V_ng_name_hash[hash], nd_nodes) if (NG_NODE_IS_VALID(node2) && (strcmp(NG_NODE_NAME(node2), name) == 0)) { NAMEHASH_WUNLOCK(); return (EADDRINUSE); } if (NG_NODE_HAS_NAME(node)) LIST_REMOVE(node, nd_nodes); else V_ng_named_nodes++; /* Copy it. */ strlcpy(NG_NODE_NAME(node), name, NG_NODESIZ); /* Update name hash. */ LIST_INSERT_HEAD(&V_ng_name_hash[hash], node, nd_nodes); NAMEHASH_WUNLOCK(); return (0); } /* * Find a node by absolute name. The name should NOT end with ':' * The name "." means "this node" and "[xxx]" means "the node * with ID (ie, at address) xxx". * * Returns the node if found, else NULL. * Eventually should add something faster than a sequential search. * Note it acquires a reference on the node so you can be sure it's still * there. */ node_p ng_name2noderef(node_p here, const char *name) { node_p node; ng_ID_t temp; int hash; /* "." means "this node" */ if (strcmp(name, ".") == 0) { NG_NODE_REF(here); return(here); } /* Check for name-by-ID */ if ((temp = ng_decodeidname(name)) != 0) { return (ng_ID2noderef(temp)); } /* Find node by name. */ hash = hash32_str(name, HASHINIT) & V_ng_name_hmask; NAMEHASH_RLOCK(); LIST_FOREACH(node, &V_ng_name_hash[hash], nd_nodes) if (NG_NODE_IS_VALID(node) && (strcmp(NG_NODE_NAME(node), name) == 0)) { NG_NODE_REF(node); break; } NAMEHASH_RUNLOCK(); return (node); } /* * Decode an ID name, eg. "[f03034de]". Returns 0 if the * string is not valid, otherwise returns the value. */ static ng_ID_t ng_decodeidname(const char *name) { const int len = strlen(name); char *eptr; u_long val; /* Check for proper length, brackets, no leading junk */ if ((len < 3) || (name[0] != '[') || (name[len - 1] != ']') || (!isxdigit(name[1]))) return ((ng_ID_t)0); /* Decode number */ val = strtoul(name + 1, &eptr, 16); if ((eptr - name != len - 1) || (val == ULONG_MAX) || (val == 0)) return ((ng_ID_t)0); return ((ng_ID_t)val); } /* * Remove a name from a node. This should only be called * when shutting down and removing the node. */ void ng_unname(node_p node) { } /* * Allocate a bigger name hash. */ static void ng_name_rehash() { struct nodehash *new; uint32_t hash; u_long hmask; node_p node, node2; int i; new = hashinit_flags((V_ng_name_hmask + 1) * 2, M_NETGRAPH_NODE, &hmask, HASH_NOWAIT); if (new == NULL) return; for (i = 0; i <= V_ng_name_hmask; i++) LIST_FOREACH_SAFE(node, &V_ng_name_hash[i], nd_nodes, node2) { #ifdef INVARIANTS LIST_REMOVE(node, nd_nodes); #endif hash = hash32_str(NG_NODE_NAME(node), HASHINIT) & hmask; LIST_INSERT_HEAD(&new[hash], node, nd_nodes); } hashdestroy(V_ng_name_hash, M_NETGRAPH_NODE, V_ng_name_hmask); V_ng_name_hash = new; V_ng_name_hmask = hmask; } /* * Allocate a bigger ID hash. */ static void ng_ID_rehash() { struct nodehash *new; uint32_t hash; u_long hmask; node_p node, node2; int i; new = hashinit_flags((V_ng_ID_hmask + 1) * 2, M_NETGRAPH_NODE, &hmask, HASH_NOWAIT); if (new == NULL) return; for (i = 0; i <= V_ng_ID_hmask; i++) LIST_FOREACH_SAFE(node, &V_ng_ID_hash[i], nd_idnodes, node2) { #ifdef INVARIANTS LIST_REMOVE(node, nd_idnodes); #endif hash = (node->nd_ID % (hmask + 1)); LIST_INSERT_HEAD(&new[hash], node, nd_idnodes); } hashdestroy(V_ng_ID_hash, M_NETGRAPH_NODE, V_ng_name_hmask); V_ng_ID_hash = new; V_ng_ID_hmask = hmask; } /************************************************************************ Hook routines Names are not optional. Hooks are always connected, except for a brief moment within these routines. On invalidation or during creation they are connected to the 'dead' hook. ************************************************************************/ /* * Remove a hook reference */ void ng_unref_hook(hook_p hook) { if (hook == &ng_deadhook) return; if (refcount_release(&hook->hk_refs)) { /* we were the last */ if (_NG_HOOK_NODE(hook)) /* it'll probably be ng_deadnode */ _NG_NODE_UNREF((_NG_HOOK_NODE(hook))); NG_FREE_HOOK(hook); } } /* * Add an unconnected hook to a node. Only used internally. * Assumes node is locked. (XXX not yet true ) */ static int ng_add_hook(node_p node, const char *name, hook_p *hookp) { hook_p hook; int error = 0; /* Check that the given name is good */ if (name == NULL) { TRAP_ERROR(); return (EINVAL); } if (ng_findhook(node, name) != NULL) { TRAP_ERROR(); return (EEXIST); } /* Allocate the hook and link it up */ NG_ALLOC_HOOK(hook); if (hook == NULL) { TRAP_ERROR(); return (ENOMEM); } hook->hk_refs = 1; /* add a reference for us to return */ hook->hk_flags = HK_INVALID; hook->hk_peer = &ng_deadhook; /* start off this way */ hook->hk_node = node; NG_NODE_REF(node); /* each hook counts as a reference */ /* Set hook name */ strlcpy(NG_HOOK_NAME(hook), name, NG_HOOKSIZ); /* * Check if the node type code has something to say about it * If it fails, the unref of the hook will also unref the node. */ if (node->nd_type->newhook != NULL) { if ((error = (*node->nd_type->newhook)(node, hook, name))) { NG_HOOK_UNREF(hook); /* this frees the hook */ return (error); } } /* * The 'type' agrees so far, so go ahead and link it in. * We'll ask again later when we actually connect the hooks. */ LIST_INSERT_HEAD(&node->nd_hooks, hook, hk_hooks); node->nd_numhooks++; NG_HOOK_REF(hook); /* one for the node */ if (hookp) *hookp = hook; return (0); } /* * Find a hook * * Node types may supply their own optimized routines for finding * hooks. If none is supplied, we just do a linear search. * XXX Possibly we should add a reference to the hook? */ hook_p ng_findhook(node_p node, const char *name) { hook_p hook; if (node->nd_type->findhook != NULL) return (*node->nd_type->findhook)(node, name); LIST_FOREACH(hook, &node->nd_hooks, hk_hooks) { if (NG_HOOK_IS_VALID(hook) && (strcmp(NG_HOOK_NAME(hook), name) == 0)) return (hook); } return (NULL); } /* * Destroy a hook * * As hooks are always attached, this really destroys two hooks. * The one given, and the one attached to it. Disconnect the hooks * from each other first. We reconnect the peer hook to the 'dead' * hook so that it can still exist after we depart. We then * send the peer its own destroy message. This ensures that we only * interact with the peer's structures when it is locked processing that * message. We hold a reference to the peer hook so we are guaranteed that * the peer hook and node are still going to exist until * we are finished there as the hook holds a ref on the node. * We run this same code again on the peer hook, but that time it is already * attached to the 'dead' hook. * * This routine is called at all stages of hook creation * on error detection and must be able to handle any such stage. */ void ng_destroy_hook(hook_p hook) { hook_p peer; node_p node; if (hook == &ng_deadhook) { /* better safe than sorry */ printf("ng_destroy_hook called on deadhook\n"); return; } /* * Protect divorce process with mutex, to avoid races on * simultaneous disconnect. */ TOPOLOGY_WLOCK(); hook->hk_flags |= HK_INVALID; peer = NG_HOOK_PEER(hook); node = NG_HOOK_NODE(hook); if (peer && (peer != &ng_deadhook)) { /* * Set the peer to point to ng_deadhook * from this moment on we are effectively independent it. - * send it an rmhook message of it's own. + * send it an rmhook message of its own. */ peer->hk_peer = &ng_deadhook; /* They no longer know us */ hook->hk_peer = &ng_deadhook; /* Nor us, them */ if (NG_HOOK_NODE(peer) == &ng_deadnode) { /* * If it's already divorced from a node, * just free it. */ TOPOLOGY_WUNLOCK(); } else { TOPOLOGY_WUNLOCK(); ng_rmhook_self(peer); /* Send it a surprise */ } NG_HOOK_UNREF(peer); /* account for peer link */ NG_HOOK_UNREF(hook); /* account for peer link */ } else TOPOLOGY_WUNLOCK(); TOPOLOGY_NOTOWNED(); /* * Remove the hook from the node's list to avoid possible recursion * in case the disconnection results in node shutdown. */ if (node == &ng_deadnode) { /* happens if called from ng_con_nodes() */ return; } LIST_REMOVE(hook, hk_hooks); node->nd_numhooks--; if (node->nd_type->disconnect) { /* * The type handler may elect to destroy the node so don't * trust its existence after this point. (except * that we still hold a reference on it. (which we * inherrited from the hook we are destroying) */ (*node->nd_type->disconnect) (hook); } /* * Note that because we will point to ng_deadnode, the original node * is not decremented automatically so we do that manually. */ _NG_HOOK_NODE(hook) = &ng_deadnode; NG_NODE_UNREF(node); /* We no longer point to it so adjust count */ NG_HOOK_UNREF(hook); /* Account for linkage (in list) to node */ } /* * Take two hooks on a node and merge the connection so that the given node * is effectively bypassed. */ int ng_bypass(hook_p hook1, hook_p hook2) { if (hook1->hk_node != hook2->hk_node) { TRAP_ERROR(); return (EINVAL); } TOPOLOGY_WLOCK(); if (NG_HOOK_NOT_VALID(hook1) || NG_HOOK_NOT_VALID(hook2)) { TOPOLOGY_WUNLOCK(); return (EINVAL); } hook1->hk_peer->hk_peer = hook2->hk_peer; hook2->hk_peer->hk_peer = hook1->hk_peer; hook1->hk_peer = &ng_deadhook; hook2->hk_peer = &ng_deadhook; TOPOLOGY_WUNLOCK(); NG_HOOK_UNREF(hook1); NG_HOOK_UNREF(hook2); /* XXX If we ever cache methods on hooks update them as well */ ng_destroy_hook(hook1); ng_destroy_hook(hook2); return (0); } /* * Install a new netgraph type */ int ng_newtype(struct ng_type *tp) { const size_t namelen = strlen(tp->name); /* Check version and type name fields */ if ((tp->version != NG_ABI_VERSION) || (namelen == 0) || (namelen >= NG_TYPESIZ)) { TRAP_ERROR(); if (tp->version != NG_ABI_VERSION) { printf("Netgraph: Node type rejected. ABI mismatch. " "Suggest recompile\n"); } return (EINVAL); } /* Check for name collision */ if (ng_findtype(tp->name) != NULL) { TRAP_ERROR(); return (EEXIST); } /* Link in new type */ TYPELIST_WLOCK(); LIST_INSERT_HEAD(&ng_typelist, tp, types); tp->refs = 1; /* first ref is linked list */ TYPELIST_WUNLOCK(); return (0); } /* * unlink a netgraph type * If no examples exist */ int ng_rmtype(struct ng_type *tp) { /* Check for name collision */ if (tp->refs != 1) { TRAP_ERROR(); return (EBUSY); } /* Unlink type */ TYPELIST_WLOCK(); LIST_REMOVE(tp, types); TYPELIST_WUNLOCK(); return (0); } /* * Look for a type of the name given */ struct ng_type * ng_findtype(const char *typename) { struct ng_type *type; TYPELIST_RLOCK(); LIST_FOREACH(type, &ng_typelist, types) { if (strcmp(type->name, typename) == 0) break; } TYPELIST_RUNLOCK(); return (type); } /************************************************************************ Composite routines ************************************************************************/ /* * Connect two nodes using the specified hooks, using queued functions. */ static int ng_con_part3(node_p node, item_p item, hook_p hook) { int error = 0; /* * When we run, we know that the node 'node' is locked for us. * Our caller has a reference on the hook. * Our caller has a reference on the node. * (In this case our caller is ng_apply_item() ). * The peer hook has a reference on the hook. * We are all set up except for the final call to the node, and * the clearing of the INVALID flag. */ if (NG_HOOK_NODE(hook) == &ng_deadnode) { /* * The node must have been freed again since we last visited * here. ng_destry_hook() has this effect but nothing else does. * We should just release our references and * free anything we can think of. * Since we know it's been destroyed, and it's our caller * that holds the references, just return. */ ERROUT(ENOENT); } if (hook->hk_node->nd_type->connect) { if ((error = (*hook->hk_node->nd_type->connect) (hook))) { ng_destroy_hook(hook); /* also zaps peer */ printf("failed in ng_con_part3()\n"); ERROUT(error); } } /* * XXX this is wrong for SMP. Possibly we need * to separate out 'create' and 'invalid' flags. * should only set flags on hooks we have locked under our node. */ hook->hk_flags &= ~HK_INVALID; done: NG_FREE_ITEM(item); return (error); } static int ng_con_part2(node_p node, item_p item, hook_p hook) { hook_p peer; int error = 0; /* * When we run, we know that the node 'node' is locked for us. * Our caller has a reference on the hook. * Our caller has a reference on the node. * (In this case our caller is ng_apply_item() ). * The peer hook has a reference on the hook. * our node pointer points to the 'dead' node. * First check the hook name is unique. * Should not happen because we checked before queueing this. */ if (ng_findhook(node, NG_HOOK_NAME(hook)) != NULL) { TRAP_ERROR(); ng_destroy_hook(hook); /* should destroy peer too */ printf("failed in ng_con_part2()\n"); ERROUT(EEXIST); } /* * Check if the node type code has something to say about it * If it fails, the unref of the hook will also unref the attached node, * however since that node is 'ng_deadnode' this will do nothing. * The peer hook will also be destroyed. */ if (node->nd_type->newhook != NULL) { if ((error = (*node->nd_type->newhook)(node, hook, hook->hk_name))) { ng_destroy_hook(hook); /* should destroy peer too */ printf("failed in ng_con_part2()\n"); ERROUT(error); } } /* * The 'type' agrees so far, so go ahead and link it in. * We'll ask again later when we actually connect the hooks. */ hook->hk_node = node; /* just overwrite ng_deadnode */ NG_NODE_REF(node); /* each hook counts as a reference */ LIST_INSERT_HEAD(&node->nd_hooks, hook, hk_hooks); node->nd_numhooks++; NG_HOOK_REF(hook); /* one for the node */ /* * We now have a symmetrical situation, where both hooks have been * linked to their nodes, the newhook methods have been called * And the references are all correct. The hooks are still marked * as invalid, as we have not called the 'connect' methods * yet. * We can call the local one immediately as we have the * node locked, but we need to queue the remote one. */ if (hook->hk_node->nd_type->connect) { if ((error = (*hook->hk_node->nd_type->connect) (hook))) { ng_destroy_hook(hook); /* also zaps peer */ printf("failed in ng_con_part2(A)\n"); ERROUT(error); } } /* * Acquire topo mutex to avoid race with ng_destroy_hook(). */ TOPOLOGY_RLOCK(); peer = hook->hk_peer; if (peer == &ng_deadhook) { TOPOLOGY_RUNLOCK(); printf("failed in ng_con_part2(B)\n"); ng_destroy_hook(hook); ERROUT(ENOENT); } TOPOLOGY_RUNLOCK(); if ((error = ng_send_fn2(peer->hk_node, peer, item, &ng_con_part3, NULL, 0, NG_REUSE_ITEM))) { printf("failed in ng_con_part2(C)\n"); ng_destroy_hook(hook); /* also zaps peer */ return (error); /* item was consumed. */ } hook->hk_flags &= ~HK_INVALID; /* need both to be able to work */ return (0); /* item was consumed. */ done: NG_FREE_ITEM(item); return (error); } /* * Connect this node with another node. We assume that this node is * currently locked, as we are only called from an NGM_CONNECT message. */ static int ng_con_nodes(item_p item, node_p node, const char *name, node_p node2, const char *name2) { int error; hook_p hook; hook_p hook2; if (ng_findhook(node2, name2) != NULL) { return(EEXIST); } if ((error = ng_add_hook(node, name, &hook))) /* gives us a ref */ return (error); /* Allocate the other hook and link it up */ NG_ALLOC_HOOK(hook2); if (hook2 == NULL) { TRAP_ERROR(); ng_destroy_hook(hook); /* XXX check ref counts so far */ NG_HOOK_UNREF(hook); /* including our ref */ return (ENOMEM); } hook2->hk_refs = 1; /* start with a reference for us. */ hook2->hk_flags = HK_INVALID; hook2->hk_peer = hook; /* Link the two together */ hook->hk_peer = hook2; NG_HOOK_REF(hook); /* Add a ref for the peer to each*/ NG_HOOK_REF(hook2); hook2->hk_node = &ng_deadnode; strlcpy(NG_HOOK_NAME(hook2), name2, NG_HOOKSIZ); /* * Queue the function above. * Procesing continues in that function in the lock context of * the other node. */ if ((error = ng_send_fn2(node2, hook2, item, &ng_con_part2, NULL, 0, NG_NOFLAGS))) { printf("failed in ng_con_nodes(): %d\n", error); ng_destroy_hook(hook); /* also zaps peer */ } NG_HOOK_UNREF(hook); /* Let each hook go if it wants to */ NG_HOOK_UNREF(hook2); return (error); } /* * Make a peer and connect. * We assume that the local node is locked. * The new node probably doesn't need a lock until * it has a hook, because it cannot really have any work until then, * but we should think about it a bit more. * * The problem may come if the other node also fires up * some hardware or a timer or some other source of activation, * also it may already get a command msg via it's ID. * * We could use the same method as ng_con_nodes() but we'd have * to add ability to remove the node when failing. (Not hard, just * make arg1 point to the node to remove). * Unless of course we just ignore failure to connect and leave * an unconnected node? */ static int ng_mkpeer(node_p node, const char *name, const char *name2, char *type) { node_p node2; hook_p hook1, hook2; int error; if ((error = ng_make_node(type, &node2))) { return (error); } if ((error = ng_add_hook(node, name, &hook1))) { /* gives us a ref */ ng_rmnode(node2, NULL, NULL, 0); return (error); } if ((error = ng_add_hook(node2, name2, &hook2))) { ng_rmnode(node2, NULL, NULL, 0); ng_destroy_hook(hook1); NG_HOOK_UNREF(hook1); return (error); } /* * Actually link the two hooks together. */ hook1->hk_peer = hook2; hook2->hk_peer = hook1; /* Each hook is referenced by the other */ NG_HOOK_REF(hook1); NG_HOOK_REF(hook2); /* Give each node the opportunity to veto the pending connection */ if (hook1->hk_node->nd_type->connect) { error = (*hook1->hk_node->nd_type->connect) (hook1); } if ((error == 0) && hook2->hk_node->nd_type->connect) { error = (*hook2->hk_node->nd_type->connect) (hook2); } /* * drop the references we were holding on the two hooks. */ if (error) { ng_destroy_hook(hook2); /* also zaps hook1 */ ng_rmnode(node2, NULL, NULL, 0); } else { /* As a last act, allow the hooks to be used */ hook1->hk_flags &= ~HK_INVALID; hook2->hk_flags &= ~HK_INVALID; } NG_HOOK_UNREF(hook1); NG_HOOK_UNREF(hook2); return (error); } /************************************************************************ Utility routines to send self messages ************************************************************************/ /* Shut this node down as soon as everyone is clear of it */ /* Should add arg "immediately" to jump the queue */ int ng_rmnode_self(node_p node) { int error; if (node == &ng_deadnode) return (0); node->nd_flags |= NGF_INVALID; if (node->nd_flags & NGF_CLOSING) return (0); error = ng_send_fn(node, NULL, &ng_rmnode, NULL, 0); return (error); } static void ng_rmhook_part2(node_p node, hook_p hook, void *arg1, int arg2) { ng_destroy_hook(hook); return ; } int ng_rmhook_self(hook_p hook) { int error; node_p node = NG_HOOK_NODE(hook); if (node == &ng_deadnode) return (0); error = ng_send_fn(node, hook, &ng_rmhook_part2, NULL, 0); return (error); } /*********************************************************************** * Parse and verify a string of the form: * * Such a string can refer to a specific node or a specific hook * on a specific node, depending on how you look at it. In the * latter case, the PATH component must not end in a dot. * * Both and are optional. The is a string * of hook names separated by dots. This breaks out the original * string, setting *nodep to "NODE" (or NULL if none) and *pathp * to "PATH" (or NULL if degenerate). Also, *hookp will point to * the final hook component of , if any, otherwise NULL. * * This returns -1 if the path is malformed. The char ** are optional. ***********************************************************************/ int ng_path_parse(char *addr, char **nodep, char **pathp, char **hookp) { char *node, *path, *hook; int k; /* * Extract absolute NODE, if any */ for (path = addr; *path && *path != ':'; path++); if (*path) { node = addr; /* Here's the NODE */ *path++ = '\0'; /* Here's the PATH */ /* Node name must not be empty */ if (!*node) return -1; /* A name of "." is OK; otherwise '.' not allowed */ if (strcmp(node, ".") != 0) { for (k = 0; node[k]; k++) if (node[k] == '.') return -1; } } else { node = NULL; /* No absolute NODE */ path = addr; /* Here's the PATH */ } /* Snoop for illegal characters in PATH */ for (k = 0; path[k]; k++) if (path[k] == ':') return -1; /* Check for no repeated dots in PATH */ for (k = 0; path[k]; k++) if (path[k] == '.' && path[k + 1] == '.') return -1; /* Remove extra (degenerate) dots from beginning or end of PATH */ if (path[0] == '.') path++; if (*path && path[strlen(path) - 1] == '.') path[strlen(path) - 1] = 0; /* If PATH has a dot, then we're not talking about a hook */ if (*path) { for (hook = path, k = 0; path[k]; k++) if (path[k] == '.') { hook = NULL; break; } } else path = hook = NULL; /* Done */ if (nodep) *nodep = node; if (pathp) *pathp = path; if (hookp) *hookp = hook; return (0); } /* * Given a path, which may be absolute or relative, and a starting node, * return the destination node. */ int ng_path2noderef(node_p here, const char *address, node_p *destp, hook_p *lasthook) { char fullpath[NG_PATHSIZ]; char *nodename, *path; node_p node, oldnode; /* Initialize */ if (destp == NULL) { TRAP_ERROR(); return EINVAL; } *destp = NULL; /* Make a writable copy of address for ng_path_parse() */ strncpy(fullpath, address, sizeof(fullpath) - 1); fullpath[sizeof(fullpath) - 1] = '\0'; /* Parse out node and sequence of hooks */ if (ng_path_parse(fullpath, &nodename, &path, NULL) < 0) { TRAP_ERROR(); return EINVAL; } /* * For an absolute address, jump to the starting node. * Note that this holds a reference on the node for us. * Don't forget to drop the reference if we don't need it. */ if (nodename) { node = ng_name2noderef(here, nodename); if (node == NULL) { TRAP_ERROR(); return (ENOENT); } } else { if (here == NULL) { TRAP_ERROR(); return (EINVAL); } node = here; NG_NODE_REF(node); } if (path == NULL) { if (lasthook != NULL) *lasthook = NULL; *destp = node; return (0); } /* * Now follow the sequence of hooks * * XXXGL: The path may demolish as we go the sequence, but if * we hold the topology mutex at critical places, then, I hope, * we would always have valid pointers in hand, although the * path behind us may no longer exist. */ for (;;) { hook_p hook; char *segment; /* * Break out the next path segment. Replace the dot we just * found with a NUL; "path" points to the next segment (or the * NUL at the end). */ for (segment = path; *path != '\0'; path++) { if (*path == '.') { *path++ = '\0'; break; } } /* We have a segment, so look for a hook by that name */ hook = ng_findhook(node, segment); TOPOLOGY_WLOCK(); /* Can't get there from here... */ if (hook == NULL || NG_HOOK_PEER(hook) == NULL || NG_HOOK_NOT_VALID(hook) || NG_HOOK_NOT_VALID(NG_HOOK_PEER(hook))) { TRAP_ERROR(); NG_NODE_UNREF(node); TOPOLOGY_WUNLOCK(); return (ENOENT); } /* * Hop on over to the next node * XXX * Big race conditions here as hooks and nodes go away * *** Idea.. store an ng_ID_t in each hook and use that * instead of the direct hook in this crawl? */ oldnode = node; if ((node = NG_PEER_NODE(hook))) NG_NODE_REF(node); /* XXX RACE */ NG_NODE_UNREF(oldnode); /* XXX another race */ if (NG_NODE_NOT_VALID(node)) { NG_NODE_UNREF(node); /* XXX more races */ TOPOLOGY_WUNLOCK(); TRAP_ERROR(); return (ENXIO); } if (*path == '\0') { if (lasthook != NULL) { if (hook != NULL) { *lasthook = NG_HOOK_PEER(hook); NG_HOOK_REF(*lasthook); } else *lasthook = NULL; } TOPOLOGY_WUNLOCK(); *destp = node; return (0); } TOPOLOGY_WUNLOCK(); } } /***************************************************************\ * Input queue handling. * All activities are submitted to the node via the input queue * which implements a multiple-reader/single-writer gate. * Items which cannot be handled immediately are queued. * * read-write queue locking inline functions * \***************************************************************/ static __inline void ng_queue_rw(node_p node, item_p item, int rw); static __inline item_p ng_dequeue(node_p node, int *rw); static __inline item_p ng_acquire_read(node_p node, item_p item); static __inline item_p ng_acquire_write(node_p node, item_p item); static __inline void ng_leave_read(node_p node); static __inline void ng_leave_write(node_p node); /* * Definition of the bits fields in the ng_queue flag word. * Defined here rather than in netgraph.h because no-one should fiddle * with them. * * The ordering here may be important! don't shuffle these. */ /*- Safety Barrier--------+ (adjustable to suit taste) (not used yet) | V +-------+-------+-------+-------+-------+-------+-------+-------+ | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |A|c|t|i|v|e| |R|e|a|d|e|r| |C|o|u|n|t| | | | | | | | | |P|A| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |O|W| +-------+-------+-------+-------+-------+-------+-------+-------+ \___________________________ ____________________________/ | | V | | [active reader count] | | | | Operation Pending -------------------------------+ | | Active Writer ---------------------------------------+ Node queue has such semantics: - All flags modifications are atomic. - Reader count can be incremented only if there is no writer or pending flags. As soon as this can't be done with single operation, it is implemented with spin loop and atomic_cmpset(). - Writer flag can be set only if there is no any bits set. It is implemented with atomic_cmpset(). - Pending flag can be set any time, but to avoid collision on queue processing all queue fields are protected by the mutex. - Queue processing thread reads queue holding the mutex, but releases it while processing. When queue is empty pending flag is removed. */ #define WRITER_ACTIVE 0x00000001 #define OP_PENDING 0x00000002 #define READER_INCREMENT 0x00000004 #define READER_MASK 0xfffffffc /* Not valid if WRITER_ACTIVE is set */ #define SAFETY_BARRIER 0x00100000 /* 128K items queued should be enough */ /* Defines of more elaborate states on the queue */ /* Mask of bits a new read cares about */ #define NGQ_RMASK (WRITER_ACTIVE|OP_PENDING) /* Mask of bits a new write cares about */ #define NGQ_WMASK (NGQ_RMASK|READER_MASK) /* Test to decide if there is something on the queue. */ #define QUEUE_ACTIVE(QP) ((QP)->q_flags & OP_PENDING) /* How to decide what the next queued item is. */ #define HEAD_IS_READER(QP) NGI_QUEUED_READER(STAILQ_FIRST(&(QP)->queue)) #define HEAD_IS_WRITER(QP) NGI_QUEUED_WRITER(STAILQ_FIRST(&(QP)->queue)) /* notused */ /* Read the status to decide if the next item on the queue can now run. */ #define QUEUED_READER_CAN_PROCEED(QP) \ (((QP)->q_flags & (NGQ_RMASK & ~OP_PENDING)) == 0) #define QUEUED_WRITER_CAN_PROCEED(QP) \ (((QP)->q_flags & (NGQ_WMASK & ~OP_PENDING)) == 0) /* Is there a chance of getting ANY work off the queue? */ #define NEXT_QUEUED_ITEM_CAN_PROCEED(QP) \ ((HEAD_IS_READER(QP)) ? QUEUED_READER_CAN_PROCEED(QP) : \ QUEUED_WRITER_CAN_PROCEED(QP)) #define NGQRW_R 0 #define NGQRW_W 1 #define NGQ2_WORKQ 0x00000001 /* * Taking into account the current state of the queue and node, possibly take * the next entry off the queue and return it. Return NULL if there was * nothing we could return, either because there really was nothing there, or * because the node was in a state where it cannot yet process the next item * on the queue. */ static __inline item_p ng_dequeue(node_p node, int *rw) { item_p item; struct ng_queue *ngq = &node->nd_input_queue; /* This MUST be called with the mutex held. */ mtx_assert(&ngq->q_mtx, MA_OWNED); /* If there is nothing queued, then just return. */ if (!QUEUE_ACTIVE(ngq)) { CTR4(KTR_NET, "%20s: node [%x] (%p) queue empty; " "queue flags 0x%lx", __func__, node->nd_ID, node, ngq->q_flags); return (NULL); } /* * From here, we can assume there is a head item. * We need to find out what it is and if it can be dequeued, given * the current state of the node. */ if (HEAD_IS_READER(ngq)) { while (1) { long t = ngq->q_flags; if (t & WRITER_ACTIVE) { /* There is writer, reader can't proceed. */ CTR4(KTR_NET, "%20s: node [%x] (%p) queued " "reader can't proceed; queue flags 0x%lx", __func__, node->nd_ID, node, t); return (NULL); } if (atomic_cmpset_acq_int(&ngq->q_flags, t, t + READER_INCREMENT)) break; cpu_spinwait(); } /* We have got reader lock for the node. */ *rw = NGQRW_R; } else if (atomic_cmpset_acq_int(&ngq->q_flags, OP_PENDING, OP_PENDING + WRITER_ACTIVE)) { /* We have got writer lock for the node. */ *rw = NGQRW_W; } else { /* There is somebody other, writer can't proceed. */ CTR4(KTR_NET, "%20s: node [%x] (%p) queued writer can't " "proceed; queue flags 0x%lx", __func__, node->nd_ID, node, ngq->q_flags); return (NULL); } /* * Now we dequeue the request (whatever it may be) and correct the * pending flags and the next and last pointers. */ item = STAILQ_FIRST(&ngq->queue); STAILQ_REMOVE_HEAD(&ngq->queue, el_next); if (STAILQ_EMPTY(&ngq->queue)) atomic_clear_int(&ngq->q_flags, OP_PENDING); CTR6(KTR_NET, "%20s: node [%x] (%p) returning item %p as %s; queue " "flags 0x%lx", __func__, node->nd_ID, node, item, *rw ? "WRITER" : "READER", ngq->q_flags); return (item); } /* * Queue a packet to be picked up later by someone else. * If the queue could be run now, add node to the queue handler's worklist. */ static __inline void ng_queue_rw(node_p node, item_p item, int rw) { struct ng_queue *ngq = &node->nd_input_queue; if (rw == NGQRW_W) NGI_SET_WRITER(item); else NGI_SET_READER(item); item->depth = 1; NG_QUEUE_LOCK(ngq); /* Set OP_PENDING flag and enqueue the item. */ atomic_set_int(&ngq->q_flags, OP_PENDING); STAILQ_INSERT_TAIL(&ngq->queue, item, el_next); CTR5(KTR_NET, "%20s: node [%x] (%p) queued item %p as %s", __func__, node->nd_ID, node, item, rw ? "WRITER" : "READER" ); /* * We can take the worklist lock with the node locked * BUT NOT THE REVERSE! */ if (NEXT_QUEUED_ITEM_CAN_PROCEED(ngq)) ng_worklist_add(node); NG_QUEUE_UNLOCK(ngq); } /* Acquire reader lock on node. If node is busy, queue the packet. */ static __inline item_p ng_acquire_read(node_p node, item_p item) { KASSERT(node != &ng_deadnode, ("%s: working on deadnode", __func__)); /* Reader needs node without writer and pending items. */ for (;;) { long t = node->nd_input_queue.q_flags; if (t & NGQ_RMASK) break; /* Node is not ready for reader. */ if (atomic_cmpset_acq_int(&node->nd_input_queue.q_flags, t, t + READER_INCREMENT)) { /* Successfully grabbed node */ CTR4(KTR_NET, "%20s: node [%x] (%p) acquired item %p", __func__, node->nd_ID, node, item); return (item); } cpu_spinwait(); } /* Queue the request for later. */ ng_queue_rw(node, item, NGQRW_R); return (NULL); } /* Acquire writer lock on node. If node is busy, queue the packet. */ static __inline item_p ng_acquire_write(node_p node, item_p item) { KASSERT(node != &ng_deadnode, ("%s: working on deadnode", __func__)); /* Writer needs completely idle node. */ if (atomic_cmpset_acq_int(&node->nd_input_queue.q_flags, 0, WRITER_ACTIVE)) { /* Successfully grabbed node */ CTR4(KTR_NET, "%20s: node [%x] (%p) acquired item %p", __func__, node->nd_ID, node, item); return (item); } /* Queue the request for later. */ ng_queue_rw(node, item, NGQRW_W); return (NULL); } #if 0 static __inline item_p ng_upgrade_write(node_p node, item_p item) { struct ng_queue *ngq = &node->nd_input_queue; KASSERT(node != &ng_deadnode, ("%s: working on deadnode", __func__)); NGI_SET_WRITER(item); NG_QUEUE_LOCK(ngq); /* * There will never be no readers as we are there ourselves. * Set the WRITER_ACTIVE flags ASAP to block out fast track readers. * The caller we are running from will call ng_leave_read() * soon, so we must account for that. We must leave again with the * READER lock. If we find other readers, then * queue the request for later. However "later" may be rignt now * if there are no readers. We don't really care if there are queued * items as we will bypass them anyhow. */ atomic_add_int(&ngq->q_flags, WRITER_ACTIVE - READER_INCREMENT); if ((ngq->q_flags & (NGQ_WMASK & ~OP_PENDING)) == WRITER_ACTIVE) { NG_QUEUE_UNLOCK(ngq); /* It's just us, act on the item. */ /* will NOT drop writer lock when done */ ng_apply_item(node, item, 0); /* * Having acted on the item, atomically * downgrade back to READER and finish up. */ atomic_add_int(&ngq->q_flags, READER_INCREMENT - WRITER_ACTIVE); /* Our caller will call ng_leave_read() */ return; } /* * It's not just us active, so queue us AT THE HEAD. * "Why?" I hear you ask. * Put us at the head of the queue as we've already been * through it once. If there is nothing else waiting, * set the correct flags. */ if (STAILQ_EMPTY(&ngq->queue)) { /* We've gone from, 0 to 1 item in the queue */ atomic_set_int(&ngq->q_flags, OP_PENDING); CTR3(KTR_NET, "%20s: node [%x] (%p) set OP_PENDING", __func__, node->nd_ID, node); }; STAILQ_INSERT_HEAD(&ngq->queue, item, el_next); CTR4(KTR_NET, "%20s: node [%x] (%p) requeued item %p as WRITER", __func__, node->nd_ID, node, item ); /* Reverse what we did above. That downgrades us back to reader */ atomic_add_int(&ngq->q_flags, READER_INCREMENT - WRITER_ACTIVE); if (QUEUE_ACTIVE(ngq) && NEXT_QUEUED_ITEM_CAN_PROCEED(ngq)) ng_worklist_add(node); NG_QUEUE_UNLOCK(ngq); return; } #endif /* Release reader lock. */ static __inline void ng_leave_read(node_p node) { atomic_subtract_rel_int(&node->nd_input_queue.q_flags, READER_INCREMENT); } /* Release writer lock. */ static __inline void ng_leave_write(node_p node) { atomic_clear_rel_int(&node->nd_input_queue.q_flags, WRITER_ACTIVE); } /* Purge node queue. Called on node shutdown. */ static void ng_flush_input_queue(node_p node) { struct ng_queue *ngq = &node->nd_input_queue; item_p item; NG_QUEUE_LOCK(ngq); while ((item = STAILQ_FIRST(&ngq->queue)) != NULL) { STAILQ_REMOVE_HEAD(&ngq->queue, el_next); if (STAILQ_EMPTY(&ngq->queue)) atomic_clear_int(&ngq->q_flags, OP_PENDING); NG_QUEUE_UNLOCK(ngq); /* If the item is supplying a callback, call it with an error */ if (item->apply != NULL) { if (item->depth == 1) item->apply->error = ENOENT; if (refcount_release(&item->apply->refs)) { (*item->apply->apply)(item->apply->context, item->apply->error); } } NG_FREE_ITEM(item); NG_QUEUE_LOCK(ngq); } NG_QUEUE_UNLOCK(ngq); } /*********************************************************************** * Externally visible method for sending or queueing messages or data. ***********************************************************************/ /* * The module code should have filled out the item correctly by this stage: * Common: * reference to destination node. * Reference to destination rcv hook if relevant. * apply pointer must be or NULL or reference valid struct ng_apply_info. * Data: * pointer to mbuf * Control_Message: * pointer to msg. * ID of original sender node. (return address) * Function: * Function pointer * void * argument * integer argument * * The nodes have several routines and macros to help with this task: */ int ng_snd_item(item_p item, int flags) { hook_p hook; node_p node; int queue, rw; struct ng_queue *ngq; int error = 0; /* We are sending item, so it must be present! */ KASSERT(item != NULL, ("ng_snd_item: item is NULL")); #ifdef NETGRAPH_DEBUG _ngi_check(item, __FILE__, __LINE__); #endif /* Item was sent once more, postpone apply() call. */ if (item->apply) refcount_acquire(&item->apply->refs); node = NGI_NODE(item); /* Node is never optional. */ KASSERT(node != NULL, ("ng_snd_item: node is NULL")); hook = NGI_HOOK(item); /* Valid hook and mbuf are mandatory for data. */ if ((item->el_flags & NGQF_TYPE) == NGQF_DATA) { KASSERT(hook != NULL, ("ng_snd_item: hook for data is NULL")); if (NGI_M(item) == NULL) ERROUT(EINVAL); CHECK_DATA_MBUF(NGI_M(item)); } /* * If the item or the node specifies single threading, force * writer semantics. Similarly, the node may say one hook always * produces writers. These are overrides. */ if (((item->el_flags & NGQF_RW) == NGQF_WRITER) || (node->nd_flags & NGF_FORCE_WRITER) || (hook && (hook->hk_flags & HK_FORCE_WRITER))) { rw = NGQRW_W; } else { rw = NGQRW_R; } /* * If sender or receiver requests queued delivery, or call graph * loops back from outbound to inbound path, or stack usage * level is dangerous - enqueue message. */ if ((flags & NG_QUEUE) || (hook && (hook->hk_flags & HK_QUEUE))) { queue = 1; } else if (hook && (hook->hk_flags & HK_TO_INBOUND) && curthread->td_ng_outbound) { queue = 1; } else { queue = 0; #ifdef GET_STACK_USAGE /* * Most of netgraph nodes have small stack consumption and * for them 25% of free stack space is more than enough. * Nodes/hooks with higher stack usage should be marked as * HI_STACK. For them 50% of stack will be guaranteed then. * XXX: Values 25% and 50% are completely empirical. */ size_t st, su, sl; GET_STACK_USAGE(st, su); sl = st - su; if ((sl * 4 < st) || ((sl * 2 < st) && ((node->nd_flags & NGF_HI_STACK) || (hook && (hook->hk_flags & HK_HI_STACK))))) queue = 1; #endif } if (queue) { /* Put it on the queue for that node*/ ng_queue_rw(node, item, rw); return ((flags & NG_PROGRESS) ? EINPROGRESS : 0); } /* * We already decided how we will be queueud or treated. * Try get the appropriate operating permission. */ if (rw == NGQRW_R) item = ng_acquire_read(node, item); else item = ng_acquire_write(node, item); /* Item was queued while trying to get permission. */ if (item == NULL) return ((flags & NG_PROGRESS) ? EINPROGRESS : 0); NGI_GET_NODE(item, node); /* zaps stored node */ item->depth++; error = ng_apply_item(node, item, rw); /* drops r/w lock when done */ /* If something is waiting on queue and ready, schedule it. */ ngq = &node->nd_input_queue; if (QUEUE_ACTIVE(ngq)) { NG_QUEUE_LOCK(ngq); if (QUEUE_ACTIVE(ngq) && NEXT_QUEUED_ITEM_CAN_PROCEED(ngq)) ng_worklist_add(node); NG_QUEUE_UNLOCK(ngq); } /* * Node may go away as soon as we remove the reference. * Whatever we do, DO NOT access the node again! */ NG_NODE_UNREF(node); return (error); done: /* If was not sent, apply callback here. */ if (item->apply != NULL) { if (item->depth == 0 && error != 0) item->apply->error = error; if (refcount_release(&item->apply->refs)) { (*item->apply->apply)(item->apply->context, item->apply->error); } } NG_FREE_ITEM(item); return (error); } /* * We have an item that was possibly queued somewhere. * It should contain all the information needed * to run it on the appropriate node/hook. * If there is apply pointer and we own the last reference, call apply(). */ static int ng_apply_item(node_p node, item_p item, int rw) { hook_p hook; ng_rcvdata_t *rcvdata; ng_rcvmsg_t *rcvmsg; struct ng_apply_info *apply; int error = 0, depth; /* Node and item are never optional. */ KASSERT(node != NULL, ("ng_apply_item: node is NULL")); KASSERT(item != NULL, ("ng_apply_item: item is NULL")); NGI_GET_HOOK(item, hook); /* clears stored hook */ #ifdef NETGRAPH_DEBUG _ngi_check(item, __FILE__, __LINE__); #endif apply = item->apply; depth = item->depth; switch (item->el_flags & NGQF_TYPE) { case NGQF_DATA: /* * Check things are still ok as when we were queued. */ KASSERT(hook != NULL, ("ng_apply_item: hook for data is NULL")); if (NG_HOOK_NOT_VALID(hook) || NG_NODE_NOT_VALID(node)) { error = EIO; NG_FREE_ITEM(item); break; } /* * If no receive method, just silently drop it. * Give preference to the hook over-ride method. */ if ((!(rcvdata = hook->hk_rcvdata)) && (!(rcvdata = NG_HOOK_NODE(hook)->nd_type->rcvdata))) { error = 0; NG_FREE_ITEM(item); break; } error = (*rcvdata)(hook, item); break; case NGQF_MESG: if (hook && NG_HOOK_NOT_VALID(hook)) { /* * The hook has been zapped then we can't use it. * Immediately drop its reference. * The message may not need it. */ NG_HOOK_UNREF(hook); hook = NULL; } /* * Similarly, if the node is a zombie there is * nothing we can do with it, drop everything. */ if (NG_NODE_NOT_VALID(node)) { TRAP_ERROR(); error = EINVAL; NG_FREE_ITEM(item); break; } /* * Call the appropriate message handler for the object. * It is up to the message handler to free the message. * If it's a generic message, handle it generically, * otherwise call the type's message handler (if it exists). * XXX (race). Remember that a queued message may * reference a node or hook that has just been * invalidated. It will exist as the queue code * is holding a reference, but.. */ if ((NGI_MSG(item)->header.typecookie == NGM_GENERIC_COOKIE) && ((NGI_MSG(item)->header.flags & NGF_RESP) == 0)) { error = ng_generic_msg(node, item, hook); break; } if (((!hook) || (!(rcvmsg = hook->hk_rcvmsg))) && (!(rcvmsg = node->nd_type->rcvmsg))) { TRAP_ERROR(); error = 0; NG_FREE_ITEM(item); break; } error = (*rcvmsg)(node, item, hook); break; case NGQF_FN: case NGQF_FN2: /* * In the case of the shutdown message we allow it to hit * even if the node is invalid. */ if (NG_NODE_NOT_VALID(node) && NGI_FN(item) != &ng_rmnode) { TRAP_ERROR(); error = EINVAL; NG_FREE_ITEM(item); break; } /* Same is about some internal functions and invalid hook. */ if (hook && NG_HOOK_NOT_VALID(hook) && NGI_FN2(item) != &ng_con_part2 && NGI_FN2(item) != &ng_con_part3 && NGI_FN(item) != &ng_rmhook_part2) { TRAP_ERROR(); error = EINVAL; NG_FREE_ITEM(item); break; } if ((item->el_flags & NGQF_TYPE) == NGQF_FN) { (*NGI_FN(item))(node, hook, NGI_ARG1(item), NGI_ARG2(item)); NG_FREE_ITEM(item); } else /* it is NGQF_FN2 */ error = (*NGI_FN2(item))(node, item, hook); break; } /* * We held references on some of the resources * that we took from the item. Now that we have * finished doing everything, drop those references. */ if (hook) NG_HOOK_UNREF(hook); if (rw == NGQRW_R) ng_leave_read(node); else ng_leave_write(node); /* Apply callback. */ if (apply != NULL) { if (depth == 1 && error != 0) apply->error = error; if (refcount_release(&apply->refs)) (*apply->apply)(apply->context, apply->error); } return (error); } /*********************************************************************** * Implement the 'generic' control messages ***********************************************************************/ static int ng_generic_msg(node_p here, item_p item, hook_p lasthook) { int error = 0; struct ng_mesg *msg; struct ng_mesg *resp = NULL; NGI_GET_MSG(item, msg); if (msg->header.typecookie != NGM_GENERIC_COOKIE) { TRAP_ERROR(); error = EINVAL; goto out; } switch (msg->header.cmd) { case NGM_SHUTDOWN: ng_rmnode(here, NULL, NULL, 0); break; case NGM_MKPEER: { struct ngm_mkpeer *const mkp = (struct ngm_mkpeer *) msg->data; if (msg->header.arglen != sizeof(*mkp)) { TRAP_ERROR(); error = EINVAL; break; } mkp->type[sizeof(mkp->type) - 1] = '\0'; mkp->ourhook[sizeof(mkp->ourhook) - 1] = '\0'; mkp->peerhook[sizeof(mkp->peerhook) - 1] = '\0'; error = ng_mkpeer(here, mkp->ourhook, mkp->peerhook, mkp->type); break; } case NGM_CONNECT: { struct ngm_connect *const con = (struct ngm_connect *) msg->data; node_p node2; if (msg->header.arglen != sizeof(*con)) { TRAP_ERROR(); error = EINVAL; break; } con->path[sizeof(con->path) - 1] = '\0'; con->ourhook[sizeof(con->ourhook) - 1] = '\0'; con->peerhook[sizeof(con->peerhook) - 1] = '\0'; /* Don't forget we get a reference.. */ error = ng_path2noderef(here, con->path, &node2, NULL); if (error) break; error = ng_con_nodes(item, here, con->ourhook, node2, con->peerhook); NG_NODE_UNREF(node2); break; } case NGM_NAME: { struct ngm_name *const nam = (struct ngm_name *) msg->data; if (msg->header.arglen != sizeof(*nam)) { TRAP_ERROR(); error = EINVAL; break; } nam->name[sizeof(nam->name) - 1] = '\0'; error = ng_name_node(here, nam->name); break; } case NGM_RMHOOK: { struct ngm_rmhook *const rmh = (struct ngm_rmhook *) msg->data; hook_p hook; if (msg->header.arglen != sizeof(*rmh)) { TRAP_ERROR(); error = EINVAL; break; } rmh->ourhook[sizeof(rmh->ourhook) - 1] = '\0'; if ((hook = ng_findhook(here, rmh->ourhook)) != NULL) ng_destroy_hook(hook); break; } case NGM_NODEINFO: { struct nodeinfo *ni; NG_MKRESPONSE(resp, msg, sizeof(*ni), M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } /* Fill in node info */ ni = (struct nodeinfo *) resp->data; if (NG_NODE_HAS_NAME(here)) strcpy(ni->name, NG_NODE_NAME(here)); strcpy(ni->type, here->nd_type->name); ni->id = ng_node2ID(here); ni->hooks = here->nd_numhooks; break; } case NGM_LISTHOOKS: { const int nhooks = here->nd_numhooks; struct hooklist *hl; struct nodeinfo *ni; hook_p hook; /* Get response struct */ NG_MKRESPONSE(resp, msg, sizeof(*hl) + (nhooks * sizeof(struct linkinfo)), M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } hl = (struct hooklist *) resp->data; ni = &hl->nodeinfo; /* Fill in node info */ if (NG_NODE_HAS_NAME(here)) strcpy(ni->name, NG_NODE_NAME(here)); strcpy(ni->type, here->nd_type->name); ni->id = ng_node2ID(here); /* Cycle through the linked list of hooks */ ni->hooks = 0; LIST_FOREACH(hook, &here->nd_hooks, hk_hooks) { struct linkinfo *const link = &hl->link[ni->hooks]; if (ni->hooks >= nhooks) { log(LOG_ERR, "%s: number of %s changed\n", __func__, "hooks"); break; } if (NG_HOOK_NOT_VALID(hook)) continue; strcpy(link->ourhook, NG_HOOK_NAME(hook)); strcpy(link->peerhook, NG_PEER_HOOK_NAME(hook)); if (NG_PEER_NODE_NAME(hook)[0] != '\0') strcpy(link->nodeinfo.name, NG_PEER_NODE_NAME(hook)); strcpy(link->nodeinfo.type, NG_PEER_NODE(hook)->nd_type->name); link->nodeinfo.id = ng_node2ID(NG_PEER_NODE(hook)); link->nodeinfo.hooks = NG_PEER_NODE(hook)->nd_numhooks; ni->hooks++; } break; } case NGM_LISTNODES: { struct namelist *nl; node_p node; int i; IDHASH_RLOCK(); /* Get response struct. */ NG_MKRESPONSE(resp, msg, sizeof(*nl) + (V_ng_nodes * sizeof(struct nodeinfo)), M_NOWAIT | M_ZERO); if (resp == NULL) { IDHASH_RUNLOCK(); error = ENOMEM; break; } nl = (struct namelist *) resp->data; /* Cycle through the lists of nodes. */ nl->numnames = 0; for (i = 0; i <= V_ng_ID_hmask; i++) { LIST_FOREACH(node, &V_ng_ID_hash[i], nd_idnodes) { struct nodeinfo *const np = &nl->nodeinfo[nl->numnames]; if (NG_NODE_NOT_VALID(node)) continue; if (NG_NODE_HAS_NAME(node)) strcpy(np->name, NG_NODE_NAME(node)); strcpy(np->type, node->nd_type->name); np->id = ng_node2ID(node); np->hooks = node->nd_numhooks; KASSERT(nl->numnames < V_ng_nodes, ("%s: no space", __func__)); nl->numnames++; } } IDHASH_RUNLOCK(); break; } case NGM_LISTNAMES: { struct namelist *nl; node_p node; int i; NAMEHASH_RLOCK(); /* Get response struct. */ NG_MKRESPONSE(resp, msg, sizeof(*nl) + (V_ng_named_nodes * sizeof(struct nodeinfo)), M_NOWAIT); if (resp == NULL) { NAMEHASH_RUNLOCK(); error = ENOMEM; break; } nl = (struct namelist *) resp->data; /* Cycle through the lists of nodes. */ nl->numnames = 0; for (i = 0; i <= V_ng_name_hmask; i++) { LIST_FOREACH(node, &V_ng_name_hash[i], nd_nodes) { struct nodeinfo *const np = &nl->nodeinfo[nl->numnames]; if (NG_NODE_NOT_VALID(node)) continue; strcpy(np->name, NG_NODE_NAME(node)); strcpy(np->type, node->nd_type->name); np->id = ng_node2ID(node); np->hooks = node->nd_numhooks; KASSERT(nl->numnames < V_ng_named_nodes, ("%s: no space", __func__)); nl->numnames++; } } NAMEHASH_RUNLOCK(); break; } case NGM_LISTTYPES: { struct typelist *tl; struct ng_type *type; int num = 0; TYPELIST_RLOCK(); /* Count number of types */ LIST_FOREACH(type, &ng_typelist, types) num++; /* Get response struct */ NG_MKRESPONSE(resp, msg, sizeof(*tl) + (num * sizeof(struct typeinfo)), M_NOWAIT); if (resp == NULL) { TYPELIST_RUNLOCK(); error = ENOMEM; break; } tl = (struct typelist *) resp->data; /* Cycle through the linked list of types */ tl->numtypes = 0; LIST_FOREACH(type, &ng_typelist, types) { struct typeinfo *const tp = &tl->typeinfo[tl->numtypes]; strcpy(tp->type_name, type->name); tp->numnodes = type->refs - 1; /* don't count list */ KASSERT(tl->numtypes < num, ("%s: no space", __func__)); tl->numtypes++; } TYPELIST_RUNLOCK(); break; } case NGM_BINARY2ASCII: { int bufSize = 20 * 1024; /* XXX hard coded constant */ const struct ng_parse_type *argstype; const struct ng_cmdlist *c; struct ng_mesg *binary, *ascii; /* Data area must contain a valid netgraph message */ binary = (struct ng_mesg *)msg->data; if (msg->header.arglen < sizeof(struct ng_mesg) || (msg->header.arglen - sizeof(struct ng_mesg) < binary->header.arglen)) { TRAP_ERROR(); error = EINVAL; break; } /* Get a response message with lots of room */ NG_MKRESPONSE(resp, msg, sizeof(*ascii) + bufSize, M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } ascii = (struct ng_mesg *)resp->data; /* Copy binary message header to response message payload */ bcopy(binary, ascii, sizeof(*binary)); /* Find command by matching typecookie and command number */ for (c = here->nd_type->cmdlist; c != NULL && c->name != NULL; c++) { if (binary->header.typecookie == c->cookie && binary->header.cmd == c->cmd) break; } if (c == NULL || c->name == NULL) { for (c = ng_generic_cmds; c->name != NULL; c++) { if (binary->header.typecookie == c->cookie && binary->header.cmd == c->cmd) break; } if (c->name == NULL) { NG_FREE_MSG(resp); error = ENOSYS; break; } } /* Convert command name to ASCII */ snprintf(ascii->header.cmdstr, sizeof(ascii->header.cmdstr), "%s", c->name); /* Convert command arguments to ASCII */ argstype = (binary->header.flags & NGF_RESP) ? c->respType : c->mesgType; if (argstype == NULL) { *ascii->data = '\0'; } else { if ((error = ng_unparse(argstype, (u_char *)binary->data, ascii->data, bufSize)) != 0) { NG_FREE_MSG(resp); break; } } /* Return the result as struct ng_mesg plus ASCII string */ bufSize = strlen(ascii->data) + 1; ascii->header.arglen = bufSize; resp->header.arglen = sizeof(*ascii) + bufSize; break; } case NGM_ASCII2BINARY: { int bufSize = 20 * 1024; /* XXX hard coded constant */ const struct ng_cmdlist *c; const struct ng_parse_type *argstype; struct ng_mesg *ascii, *binary; int off = 0; /* Data area must contain at least a struct ng_mesg + '\0' */ ascii = (struct ng_mesg *)msg->data; if ((msg->header.arglen < sizeof(*ascii) + 1) || (ascii->header.arglen < 1) || (msg->header.arglen < sizeof(*ascii) + ascii->header.arglen)) { TRAP_ERROR(); error = EINVAL; break; } ascii->data[ascii->header.arglen - 1] = '\0'; /* Get a response message with lots of room */ NG_MKRESPONSE(resp, msg, sizeof(*binary) + bufSize, M_NOWAIT); if (resp == NULL) { error = ENOMEM; break; } binary = (struct ng_mesg *)resp->data; /* Copy ASCII message header to response message payload */ bcopy(ascii, binary, sizeof(*ascii)); /* Find command by matching ASCII command string */ for (c = here->nd_type->cmdlist; c != NULL && c->name != NULL; c++) { if (strcmp(ascii->header.cmdstr, c->name) == 0) break; } if (c == NULL || c->name == NULL) { for (c = ng_generic_cmds; c->name != NULL; c++) { if (strcmp(ascii->header.cmdstr, c->name) == 0) break; } if (c->name == NULL) { NG_FREE_MSG(resp); error = ENOSYS; break; } } /* Convert command name to binary */ binary->header.cmd = c->cmd; binary->header.typecookie = c->cookie; /* Convert command arguments to binary */ argstype = (binary->header.flags & NGF_RESP) ? c->respType : c->mesgType; if (argstype == NULL) { bufSize = 0; } else { if ((error = ng_parse(argstype, ascii->data, &off, (u_char *)binary->data, &bufSize)) != 0) { NG_FREE_MSG(resp); break; } } /* Return the result */ binary->header.arglen = bufSize; resp->header.arglen = sizeof(*binary) + bufSize; break; } case NGM_TEXT_CONFIG: case NGM_TEXT_STATUS: /* * This one is tricky as it passes the command down to the * actual node, even though it is a generic type command. * This means we must assume that the item/msg is already freed * when control passes back to us. */ if (here->nd_type->rcvmsg != NULL) { NGI_MSG(item) = msg; /* put it back as we found it */ return((*here->nd_type->rcvmsg)(here, item, lasthook)); } /* Fall through if rcvmsg not supported */ default: TRAP_ERROR(); error = EINVAL; } /* * Sometimes a generic message may be statically allocated * to avoid problems with allocating when in tight memory situations. * Don't free it if it is so. * I break them apart here, because erros may cause a free if the item * in which case we'd be doing it twice. * they are kept together above, to simplify freeing. */ out: NG_RESPOND_MSG(error, here, item, resp); NG_FREE_MSG(msg); return (error); } /************************************************************************ Queue element get/free routines ************************************************************************/ uma_zone_t ng_qzone; uma_zone_t ng_qdzone; static int numthreads = 0; /* number of queue threads */ static int maxalloc = 4096;/* limit the damage of a leak */ static int maxdata = 4096; /* limit the damage of a DoS */ SYSCTL_INT(_net_graph, OID_AUTO, threads, CTLFLAG_RDTUN, &numthreads, 0, "Number of queue processing threads"); SYSCTL_INT(_net_graph, OID_AUTO, maxalloc, CTLFLAG_RDTUN, &maxalloc, 0, "Maximum number of non-data queue items to allocate"); SYSCTL_INT(_net_graph, OID_AUTO, maxdata, CTLFLAG_RDTUN, &maxdata, 0, "Maximum number of data queue items to allocate"); #ifdef NETGRAPH_DEBUG static TAILQ_HEAD(, ng_item) ng_itemlist = TAILQ_HEAD_INITIALIZER(ng_itemlist); static int allocated; /* number of items malloc'd */ #endif /* * Get a queue entry. * This is usually called when a packet first enters netgraph. * By definition, this is usually from an interrupt, or from a user. * Users are not so important, but try be quick for the times that it's * an interrupt. */ static __inline item_p ng_alloc_item(int type, int flags) { item_p item; KASSERT(((type & ~NGQF_TYPE) == 0), ("%s: incorrect item type: %d", __func__, type)); item = uma_zalloc((type == NGQF_DATA) ? ng_qdzone : ng_qzone, ((flags & NG_WAITOK) ? M_WAITOK : M_NOWAIT) | M_ZERO); if (item) { item->el_flags = type; #ifdef NETGRAPH_DEBUG mtx_lock(&ngq_mtx); TAILQ_INSERT_TAIL(&ng_itemlist, item, all); allocated++; mtx_unlock(&ngq_mtx); #endif } return (item); } /* * Release a queue entry */ void ng_free_item(item_p item) { /* - * The item may hold resources on it's own. We need to free + * The item may hold resources on its own. We need to free * these before we can free the item. What they are depends upon * what kind of item it is. it is important that nodes zero * out pointers to resources that they remove from the item * or we release them again here. */ switch (item->el_flags & NGQF_TYPE) { case NGQF_DATA: /* If we have an mbuf still attached.. */ NG_FREE_M(_NGI_M(item)); break; case NGQF_MESG: _NGI_RETADDR(item) = 0; NG_FREE_MSG(_NGI_MSG(item)); break; case NGQF_FN: case NGQF_FN2: /* nothing to free really, */ _NGI_FN(item) = NULL; _NGI_ARG1(item) = NULL; _NGI_ARG2(item) = 0; break; } /* If we still have a node or hook referenced... */ _NGI_CLR_NODE(item); _NGI_CLR_HOOK(item); #ifdef NETGRAPH_DEBUG mtx_lock(&ngq_mtx); TAILQ_REMOVE(&ng_itemlist, item, all); allocated--; mtx_unlock(&ngq_mtx); #endif uma_zfree(((item->el_flags & NGQF_TYPE) == NGQF_DATA) ? ng_qdzone : ng_qzone, item); } /* * Change type of the queue entry. * Possibly reallocates it from another UMA zone. */ static __inline item_p ng_realloc_item(item_p pitem, int type, int flags) { item_p item; int from, to; KASSERT((pitem != NULL), ("%s: can't reallocate NULL", __func__)); KASSERT(((type & ~NGQF_TYPE) == 0), ("%s: incorrect item type: %d", __func__, type)); from = ((pitem->el_flags & NGQF_TYPE) == NGQF_DATA); to = (type == NGQF_DATA); if (from != to) { /* If reallocation is required do it and copy item. */ if ((item = ng_alloc_item(type, flags)) == NULL) { ng_free_item(pitem); return (NULL); } *item = *pitem; ng_free_item(pitem); } else item = pitem; item->el_flags = (item->el_flags & ~NGQF_TYPE) | type; return (item); } /************************************************************************ Module routines ************************************************************************/ /* * Handle the loading/unloading of a netgraph node type module */ int ng_mod_event(module_t mod, int event, void *data) { struct ng_type *const type = data; int error = 0; switch (event) { case MOD_LOAD: /* Register new netgraph node type */ if ((error = ng_newtype(type)) != 0) break; /* Call type specific code */ if (type->mod_event != NULL) if ((error = (*type->mod_event)(mod, event, data))) { TYPELIST_WLOCK(); type->refs--; /* undo it */ LIST_REMOVE(type, types); TYPELIST_WUNLOCK(); } break; case MOD_UNLOAD: if (type->refs > 1) { /* make sure no nodes exist! */ error = EBUSY; } else { if (type->refs == 0) /* failed load, nothing to undo */ break; if (type->mod_event != NULL) { /* check with type */ error = (*type->mod_event)(mod, event, data); if (error != 0) /* type refuses.. */ break; } TYPELIST_WLOCK(); LIST_REMOVE(type, types); TYPELIST_WUNLOCK(); } break; default: if (type->mod_event != NULL) error = (*type->mod_event)(mod, event, data); else error = EOPNOTSUPP; /* XXX ? */ break; } return (error); } static void vnet_netgraph_init(const void *unused __unused) { /* We start with small hashes, but they can grow. */ V_ng_ID_hash = hashinit(16, M_NETGRAPH_NODE, &V_ng_ID_hmask); V_ng_name_hash = hashinit(16, M_NETGRAPH_NODE, &V_ng_name_hmask); } VNET_SYSINIT(vnet_netgraph_init, SI_SUB_NETGRAPH, SI_ORDER_FIRST, vnet_netgraph_init, NULL); #ifdef VIMAGE static void vnet_netgraph_uninit(const void *unused __unused) { node_p node = NULL, last_killed = NULL; int i; do { /* Find a node to kill */ IDHASH_RLOCK(); for (i = 0; i <= V_ng_ID_hmask; i++) { LIST_FOREACH(node, &V_ng_ID_hash[i], nd_idnodes) { if (node != &ng_deadnode) { NG_NODE_REF(node); break; } } if (node != NULL) break; } IDHASH_RUNLOCK(); /* Attempt to kill it only if it is a regular node */ if (node != NULL) { if (node == last_killed) { /* This should never happen */ printf("ng node %s needs NGF_REALLY_DIE\n", node->nd_name); if (node->nd_flags & NGF_REALLY_DIE) panic("ng node %s won't die", node->nd_name); node->nd_flags |= NGF_REALLY_DIE; } ng_rmnode(node, NULL, NULL, 0); NG_NODE_UNREF(node); last_killed = node; } } while (node != NULL); hashdestroy(V_ng_name_hash, M_NETGRAPH_NODE, V_ng_name_hmask); hashdestroy(V_ng_ID_hash, M_NETGRAPH_NODE, V_ng_ID_hmask); } VNET_SYSUNINIT(vnet_netgraph_uninit, SI_SUB_NETGRAPH, SI_ORDER_FIRST, vnet_netgraph_uninit, NULL); #endif /* VIMAGE */ /* * Handle loading and unloading for this code. * The only thing we need to link into is the NETISR strucure. */ static int ngb_mod_event(module_t mod, int event, void *data) { struct proc *p; struct thread *td; int i, error = 0; switch (event) { case MOD_LOAD: /* Initialize everything. */ NG_WORKLIST_LOCK_INIT(); rw_init(&ng_typelist_lock, "netgraph types"); rw_init(&ng_idhash_lock, "netgraph idhash"); rw_init(&ng_namehash_lock, "netgraph namehash"); rw_init(&ng_topo_lock, "netgraph topology mutex"); #ifdef NETGRAPH_DEBUG mtx_init(&ng_nodelist_mtx, "netgraph nodelist mutex", NULL, MTX_DEF); mtx_init(&ngq_mtx, "netgraph item list mutex", NULL, MTX_DEF); #endif ng_qzone = uma_zcreate("NetGraph items", sizeof(struct ng_item), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); uma_zone_set_max(ng_qzone, maxalloc); ng_qdzone = uma_zcreate("NetGraph data items", sizeof(struct ng_item), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); uma_zone_set_max(ng_qdzone, maxdata); /* Autoconfigure number of threads. */ if (numthreads <= 0) numthreads = mp_ncpus; /* Create threads. */ p = NULL; /* start with no process */ for (i = 0; i < numthreads; i++) { if (kproc_kthread_add(ngthread, NULL, &p, &td, RFHIGHPID, 0, "ng_queue", "ng_queue%d", i)) { numthreads = i; break; } } break; case MOD_UNLOAD: /* You can't unload it because an interface may be using it. */ error = EBUSY; break; default: error = EOPNOTSUPP; break; } return (error); } static moduledata_t netgraph_mod = { "netgraph", ngb_mod_event, (NULL) }; DECLARE_MODULE(netgraph, netgraph_mod, SI_SUB_NETGRAPH, SI_ORDER_FIRST); SYSCTL_NODE(_net, OID_AUTO, graph, CTLFLAG_RW, 0, "netgraph Family"); SYSCTL_INT(_net_graph, OID_AUTO, abi_version, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, NG_ABI_VERSION,""); SYSCTL_INT(_net_graph, OID_AUTO, msg_version, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, NG_VERSION, ""); #ifdef NETGRAPH_DEBUG void dumphook (hook_p hook, char *file, int line) { printf("hook: name %s, %d refs, Last touched:\n", _NG_HOOK_NAME(hook), hook->hk_refs); printf(" Last active @ %s, line %d\n", hook->lastfile, hook->lastline); if (line) { printf(" problem discovered at file %s, line %d\n", file, line); #ifdef KDB kdb_backtrace(); #endif } } void dumpnode(node_p node, char *file, int line) { printf("node: ID [%x]: type '%s', %d hooks, flags 0x%x, %d refs, %s:\n", _NG_NODE_ID(node), node->nd_type->name, node->nd_numhooks, node->nd_flags, node->nd_refs, node->nd_name); printf(" Last active @ %s, line %d\n", node->lastfile, node->lastline); if (line) { printf(" problem discovered at file %s, line %d\n", file, line); #ifdef KDB kdb_backtrace(); #endif } } void dumpitem(item_p item, char *file, int line) { printf(" ACTIVE item, last used at %s, line %d", item->lastfile, item->lastline); switch(item->el_flags & NGQF_TYPE) { case NGQF_DATA: printf(" - [data]\n"); break; case NGQF_MESG: printf(" - retaddr[%d]:\n", _NGI_RETADDR(item)); break; case NGQF_FN: printf(" - fn@%p (%p, %p, %p, %d (%x))\n", _NGI_FN(item), _NGI_NODE(item), _NGI_HOOK(item), item->body.fn.fn_arg1, item->body.fn.fn_arg2, item->body.fn.fn_arg2); break; case NGQF_FN2: printf(" - fn2@%p (%p, %p, %p, %d (%x))\n", _NGI_FN2(item), _NGI_NODE(item), _NGI_HOOK(item), item->body.fn.fn_arg1, item->body.fn.fn_arg2, item->body.fn.fn_arg2); break; } if (line) { printf(" problem discovered at file %s, line %d\n", file, line); if (_NGI_NODE(item)) { printf("node %p ([%x])\n", _NGI_NODE(item), ng_node2ID(_NGI_NODE(item))); } } } static void ng_dumpitems(void) { item_p item; int i = 1; TAILQ_FOREACH(item, &ng_itemlist, all) { printf("[%d] ", i++); dumpitem(item, NULL, 0); } } static void ng_dumpnodes(void) { node_p node; int i = 1; mtx_lock(&ng_nodelist_mtx); SLIST_FOREACH(node, &ng_allnodes, nd_all) { printf("[%d] ", i++); dumpnode(node, NULL, 0); } mtx_unlock(&ng_nodelist_mtx); } static void ng_dumphooks(void) { hook_p hook; int i = 1; mtx_lock(&ng_nodelist_mtx); SLIST_FOREACH(hook, &ng_allhooks, hk_all) { printf("[%d] ", i++); dumphook(hook, NULL, 0); } mtx_unlock(&ng_nodelist_mtx); } static int sysctl_debug_ng_dump_items(SYSCTL_HANDLER_ARGS) { int error; int val; int i; val = allocated; i = 1; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == 42) { ng_dumpitems(); ng_dumpnodes(); ng_dumphooks(); } return (0); } SYSCTL_PROC(_debug, OID_AUTO, ng_dump_items, CTLTYPE_INT | CTLFLAG_RW, 0, sizeof(int), sysctl_debug_ng_dump_items, "I", "Number of allocated items"); #endif /* NETGRAPH_DEBUG */ /*********************************************************************** * Worklist routines **********************************************************************/ /* * Pick a node off the list of nodes with work, * try get an item to process off it. Remove the node from the list. */ static void ngthread(void *arg) { for (;;) { node_p node; /* Get node from the worklist. */ NG_WORKLIST_LOCK(); while ((node = STAILQ_FIRST(&ng_worklist)) == NULL) NG_WORKLIST_SLEEP(); STAILQ_REMOVE_HEAD(&ng_worklist, nd_input_queue.q_work); NG_WORKLIST_UNLOCK(); CURVNET_SET(node->nd_vnet); CTR3(KTR_NET, "%20s: node [%x] (%p) taken off worklist", __func__, node->nd_ID, node); /* * We have the node. We also take over the reference * that the list had on it. * Now process as much as you can, until it won't * let you have another item off the queue. * All this time, keep the reference * that lets us be sure that the node still exists. * Let the reference go at the last minute. */ for (;;) { item_p item; int rw; NG_QUEUE_LOCK(&node->nd_input_queue); item = ng_dequeue(node, &rw); if (item == NULL) { node->nd_input_queue.q_flags2 &= ~NGQ2_WORKQ; NG_QUEUE_UNLOCK(&node->nd_input_queue); break; /* go look for another node */ } else { NG_QUEUE_UNLOCK(&node->nd_input_queue); NGI_GET_NODE(item, node); /* zaps stored node */ ng_apply_item(node, item, rw); NG_NODE_UNREF(node); } } NG_NODE_UNREF(node); CURVNET_RESTORE(); } } /* * XXX * It's posible that a debugging NG_NODE_REF may need * to be outside the mutex zone */ static void ng_worklist_add(node_p node) { mtx_assert(&node->nd_input_queue.q_mtx, MA_OWNED); if ((node->nd_input_queue.q_flags2 & NGQ2_WORKQ) == 0) { /* * If we are not already on the work queue, * then put us on. */ node->nd_input_queue.q_flags2 |= NGQ2_WORKQ; NG_NODE_REF(node); /* XXX safe in mutex? */ NG_WORKLIST_LOCK(); STAILQ_INSERT_TAIL(&ng_worklist, node, nd_input_queue.q_work); NG_WORKLIST_UNLOCK(); CTR3(KTR_NET, "%20s: node [%x] (%p) put on worklist", __func__, node->nd_ID, node); NG_WORKLIST_WAKEUP(); } else { CTR3(KTR_NET, "%20s: node [%x] (%p) already on worklist", __func__, node->nd_ID, node); } } /*********************************************************************** * Externally useable functions to set up a queue item ready for sending ***********************************************************************/ #ifdef NETGRAPH_DEBUG #define ITEM_DEBUG_CHECKS \ do { \ if (NGI_NODE(item) ) { \ printf("item already has node"); \ kdb_enter(KDB_WHY_NETGRAPH, "has node"); \ NGI_CLR_NODE(item); \ } \ if (NGI_HOOK(item) ) { \ printf("item already has hook"); \ kdb_enter(KDB_WHY_NETGRAPH, "has hook"); \ NGI_CLR_HOOK(item); \ } \ } while (0) #else #define ITEM_DEBUG_CHECKS #endif /* * Put mbuf into the item. * Hook and node references will be removed when the item is dequeued. * (or equivalent) * (XXX) Unsafe because no reference held by peer on remote node. * remote node might go away in this timescale. * We know the hooks can't go away because that would require getting * a writer item on both nodes and we must have at least a reader * here to be able to do this. * Note that the hook loaded is the REMOTE hook. * * This is possibly in the critical path for new data. */ item_p ng_package_data(struct mbuf *m, int flags) { item_p item; if ((item = ng_alloc_item(NGQF_DATA, flags)) == NULL) { NG_FREE_M(m); return (NULL); } ITEM_DEBUG_CHECKS; item->el_flags |= NGQF_READER; NGI_M(item) = m; return (item); } /* * Allocate a queue item and put items into it.. * Evaluate the address as this will be needed to queue it and * to work out what some of the fields should be. * Hook and node references will be removed when the item is dequeued. * (or equivalent) */ item_p ng_package_msg(struct ng_mesg *msg, int flags) { item_p item; if ((item = ng_alloc_item(NGQF_MESG, flags)) == NULL) { NG_FREE_MSG(msg); return (NULL); } ITEM_DEBUG_CHECKS; /* Messages items count as writers unless explicitly exempted. */ if (msg->header.cmd & NGM_READONLY) item->el_flags |= NGQF_READER; else item->el_flags |= NGQF_WRITER; /* * Set the current lasthook into the queue item */ NGI_MSG(item) = msg; NGI_RETADDR(item) = 0; return (item); } #define SET_RETADDR(item, here, retaddr) \ do { /* Data or fn items don't have retaddrs */ \ if ((item->el_flags & NGQF_TYPE) == NGQF_MESG) { \ if (retaddr) { \ NGI_RETADDR(item) = retaddr; \ } else { \ /* \ * The old return address should be ok. \ * If there isn't one, use the address \ * here. \ */ \ if (NGI_RETADDR(item) == 0) { \ NGI_RETADDR(item) \ = ng_node2ID(here); \ } \ } \ } \ } while (0) int ng_address_hook(node_p here, item_p item, hook_p hook, ng_ID_t retaddr) { hook_p peer; node_p peernode; ITEM_DEBUG_CHECKS; /* * Quick sanity check.. - * Since a hook holds a reference on it's node, once we know + * Since a hook holds a reference on its node, once we know * that the peer is still connected (even if invalid,) we know * that the peer node is present, though maybe invalid. */ TOPOLOGY_RLOCK(); if ((hook == NULL) || NG_HOOK_NOT_VALID(hook) || NG_HOOK_NOT_VALID(peer = NG_HOOK_PEER(hook)) || NG_NODE_NOT_VALID(peernode = NG_PEER_NODE(hook))) { NG_FREE_ITEM(item); TRAP_ERROR(); TOPOLOGY_RUNLOCK(); return (ENETDOWN); } /* * Transfer our interest to the other (peer) end. */ NG_HOOK_REF(peer); NG_NODE_REF(peernode); NGI_SET_HOOK(item, peer); NGI_SET_NODE(item, peernode); SET_RETADDR(item, here, retaddr); TOPOLOGY_RUNLOCK(); return (0); } int ng_address_path(node_p here, item_p item, const char *address, ng_ID_t retaddr) { node_p dest = NULL; hook_p hook = NULL; int error; ITEM_DEBUG_CHECKS; /* * Note that ng_path2noderef increments the reference count * on the node for us if it finds one. So we don't have to. */ error = ng_path2noderef(here, address, &dest, &hook); if (error) { NG_FREE_ITEM(item); return (error); } NGI_SET_NODE(item, dest); if (hook) NGI_SET_HOOK(item, hook); SET_RETADDR(item, here, retaddr); return (0); } int ng_address_ID(node_p here, item_p item, ng_ID_t ID, ng_ID_t retaddr) { node_p dest; ITEM_DEBUG_CHECKS; /* * Find the target node. */ dest = ng_ID2noderef(ID); /* GETS REFERENCE! */ if (dest == NULL) { NG_FREE_ITEM(item); TRAP_ERROR(); return(EINVAL); } /* Fill out the contents */ NGI_SET_NODE(item, dest); NGI_CLR_HOOK(item); SET_RETADDR(item, here, retaddr); return (0); } /* * special case to send a message to self (e.g. destroy node) * Possibly indicate an arrival hook too. * Useful for removing that hook :-) */ item_p ng_package_msg_self(node_p here, hook_p hook, struct ng_mesg *msg) { item_p item; /* * Find the target node. * If there is a HOOK argument, then use that in preference * to the address. */ if ((item = ng_alloc_item(NGQF_MESG, NG_NOFLAGS)) == NULL) { NG_FREE_MSG(msg); return (NULL); } /* Fill out the contents */ item->el_flags |= NGQF_WRITER; NG_NODE_REF(here); NGI_SET_NODE(item, here); if (hook) { NG_HOOK_REF(hook); NGI_SET_HOOK(item, hook); } NGI_MSG(item) = msg; NGI_RETADDR(item) = ng_node2ID(here); return (item); } /* * Send ng_item_fn function call to the specified node. */ int ng_send_fn(node_p node, hook_p hook, ng_item_fn *fn, void * arg1, int arg2) { return ng_send_fn1(node, hook, fn, arg1, arg2, NG_NOFLAGS); } int ng_send_fn1(node_p node, hook_p hook, ng_item_fn *fn, void * arg1, int arg2, int flags) { item_p item; if ((item = ng_alloc_item(NGQF_FN, flags)) == NULL) { return (ENOMEM); } item->el_flags |= NGQF_WRITER; NG_NODE_REF(node); /* and one for the item */ NGI_SET_NODE(item, node); if (hook) { NG_HOOK_REF(hook); NGI_SET_HOOK(item, hook); } NGI_FN(item) = fn; NGI_ARG1(item) = arg1; NGI_ARG2(item) = arg2; return(ng_snd_item(item, flags)); } /* * Send ng_item_fn2 function call to the specified node. * * If an optional pitem parameter is supplied, its apply * callback will be copied to the new item. If also NG_REUSE_ITEM * flag is set, no new item will be allocated, but pitem will * be used. */ int ng_send_fn2(node_p node, hook_p hook, item_p pitem, ng_item_fn2 *fn, void *arg1, int arg2, int flags) { item_p item; KASSERT((pitem != NULL || (flags & NG_REUSE_ITEM) == 0), ("%s: NG_REUSE_ITEM but no pitem", __func__)); /* * Allocate a new item if no supplied or * if we can't use supplied one. */ if (pitem == NULL || (flags & NG_REUSE_ITEM) == 0) { if ((item = ng_alloc_item(NGQF_FN2, flags)) == NULL) return (ENOMEM); if (pitem != NULL) item->apply = pitem->apply; } else { if ((item = ng_realloc_item(pitem, NGQF_FN2, flags)) == NULL) return (ENOMEM); } item->el_flags = (item->el_flags & ~NGQF_RW) | NGQF_WRITER; NG_NODE_REF(node); /* and one for the item */ NGI_SET_NODE(item, node); if (hook) { NG_HOOK_REF(hook); NGI_SET_HOOK(item, hook); } NGI_FN2(item) = fn; NGI_ARG1(item) = arg1; NGI_ARG2(item) = arg2; return(ng_snd_item(item, flags)); } /* * Official timeout routines for Netgraph nodes. */ static void ng_callout_trampoline(void *arg) { item_p item = arg; CURVNET_SET(NGI_NODE(item)->nd_vnet); ng_snd_item(item, 0); CURVNET_RESTORE(); } int ng_callout(struct callout *c, node_p node, hook_p hook, int ticks, ng_item_fn *fn, void * arg1, int arg2) { item_p item, oitem; if ((item = ng_alloc_item(NGQF_FN, NG_NOFLAGS)) == NULL) return (ENOMEM); item->el_flags |= NGQF_WRITER; NG_NODE_REF(node); /* and one for the item */ NGI_SET_NODE(item, node); if (hook) { NG_HOOK_REF(hook); NGI_SET_HOOK(item, hook); } NGI_FN(item) = fn; NGI_ARG1(item) = arg1; NGI_ARG2(item) = arg2; oitem = c->c_arg; if (callout_reset(c, ticks, &ng_callout_trampoline, item) == 1 && oitem != NULL) NG_FREE_ITEM(oitem); return (0); } /* A special modified version of untimeout() */ int ng_uncallout(struct callout *c, node_p node) { item_p item; int rval; KASSERT(c != NULL, ("ng_uncallout: NULL callout")); KASSERT(node != NULL, ("ng_uncallout: NULL node")); rval = callout_stop(c); item = c->c_arg; /* Do an extra check */ if ((rval > 0) && (c->c_func == &ng_callout_trampoline) && (item != NULL) && (NGI_NODE(item) == node)) { /* * We successfully removed it from the queue before it ran * So now we need to unreference everything that was * given extra references. (NG_FREE_ITEM does this). */ NG_FREE_ITEM(item); } c->c_arg = NULL; return (rval); } /* * Set the address, if none given, give the node here. */ void ng_replace_retaddr(node_p here, item_p item, ng_ID_t retaddr) { if (retaddr) { NGI_RETADDR(item) = retaddr; } else { /* * The old return address should be ok. * If there isn't one, use the address here. */ NGI_RETADDR(item) = ng_node2ID(here); } } Index: head/sys/powerpc/powerpc/mmu_if.m =================================================================== --- head/sys/powerpc/powerpc/mmu_if.m (revision 308456) +++ head/sys/powerpc/powerpc/mmu_if.m (revision 308457) @@ -1,979 +1,979 @@ #- # Copyright (c) 2005 Peter Grehan # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # $FreeBSD$ # #include #include #include #include #include #include #include /** * @defgroup MMU mmu - KObj methods for PowerPC MMU implementations * @brief A set of methods required by all MMU implementations. These * are basically direct call-thru's from the pmap machine-dependent * code. * Thanks to Bruce M Simpson's pmap man pages for routine descriptions. *@{ */ INTERFACE mmu; # # Default implementations of some methods # CODE { static void mmu_null_copy(mmu_t mmu, pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { return; } static void mmu_null_growkernel(mmu_t mmu, vm_offset_t addr) { return; } static void mmu_null_init(mmu_t mmu) { return; } static boolean_t mmu_null_is_prefaultable(mmu_t mmu, pmap_t pmap, vm_offset_t va) { return (FALSE); } static void mmu_null_object_init_pt(mmu_t mmu, pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t index, vm_size_t size) { return; } static void mmu_null_page_init(mmu_t mmu, vm_page_t m) { return; } static void mmu_null_remove_pages(mmu_t mmu, pmap_t pmap) { return; } static int mmu_null_mincore(mmu_t mmu, pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { return (0); } static void mmu_null_deactivate(struct thread *td) { return; } static void mmu_null_align_superpage(mmu_t mmu, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { return; } static void *mmu_null_mapdev_attr(mmu_t mmu, vm_paddr_t pa, vm_size_t size, vm_memattr_t ma) { return MMU_MAPDEV(mmu, pa, size); } static void mmu_null_kenter_attr(mmu_t mmu, vm_offset_t va, vm_paddr_t pa, vm_memattr_t ma) { MMU_KENTER(mmu, va, pa); } static void mmu_null_page_set_memattr(mmu_t mmu, vm_page_t m, vm_memattr_t ma) { return; } static int mmu_null_change_attr(mmu_t mmu, vm_offset_t va, vm_size_t sz, vm_memattr_t mode) { return (0); } }; /** * @brief Apply the given advice to the specified range of addresses within * the given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. * * @param _pmap physical map * @param _start virtual range start * @param _end virtual range end * @param _advice advice to apply */ METHOD void advise { mmu_t _mmu; pmap_t _pmap; vm_offset_t _start; vm_offset_t _end; int _advice; }; /** * @brief Clear the 'modified' bit on the given physical page * * @param _pg physical page */ METHOD void clear_modify { mmu_t _mmu; vm_page_t _pg; }; /** * @brief Clear the write and modified bits in each of the given * physical page's mappings * * @param _pg physical page */ METHOD void remove_write { mmu_t _mmu; vm_page_t _pg; }; /** * @brief Copy the address range given by the source physical map, virtual * address and length to the destination physical map and virtual address. * This routine is optional (xxx default null implementation ?) * * @param _dst_pmap destination physical map * @param _src_pmap source physical map * @param _dst_addr destination virtual address * @param _len size of range * @param _src_addr source virtual address */ METHOD void copy { mmu_t _mmu; pmap_t _dst_pmap; pmap_t _src_pmap; vm_offset_t _dst_addr; vm_size_t _len; vm_offset_t _src_addr; } DEFAULT mmu_null_copy; /** * @brief Copy the source physical page to the destination physical page * * @param _src source physical page * @param _dst destination physical page */ METHOD void copy_page { mmu_t _mmu; vm_page_t _src; vm_page_t _dst; }; METHOD void copy_pages { mmu_t _mmu; vm_page_t *_ma; vm_offset_t _a_offset; vm_page_t *_mb; vm_offset_t _b_offset; int _xfersize; }; /** * @brief Create a mapping between a virtual/physical address pair in the * passed physical map with the specified protection and wiring * * @param _pmap physical map * @param _va mapping virtual address * @param _p mapping physical page * @param _prot mapping page protection * @param _flags pmap_enter flags * @param _psind superpage size index */ METHOD int enter { mmu_t _mmu; pmap_t _pmap; vm_offset_t _va; vm_page_t _p; vm_prot_t _prot; u_int _flags; int8_t _psind; }; /** * @brief Maps a sequence of resident pages belonging to the same object. * * @param _pmap physical map * @param _start virtual range start * @param _end virtual range end * @param _m_start physical page mapped at start * @param _prot mapping page protection */ METHOD void enter_object { mmu_t _mmu; pmap_t _pmap; vm_offset_t _start; vm_offset_t _end; vm_page_t _m_start; vm_prot_t _prot; }; /** * @brief A faster entry point for page mapping where it is possible * to short-circuit some of the tests in pmap_enter. * * @param _pmap physical map (and also currently active pmap) * @param _va mapping virtual address * @param _pg mapping physical page * @param _prot new page protection - used to see if page is exec. */ METHOD void enter_quick { mmu_t _mmu; pmap_t _pmap; vm_offset_t _va; vm_page_t _pg; vm_prot_t _prot; }; /** * @brief Reverse map the given virtual address, returning the physical * page associated with the address if a mapping exists. * * @param _pmap physical map * @param _va mapping virtual address * * @retval 0 No mapping found * @retval addr The mapping physical address */ METHOD vm_paddr_t extract { mmu_t _mmu; pmap_t _pmap; vm_offset_t _va; }; /** * @brief Reverse map the given virtual address, returning the * physical page if found. The page must be held (by calling * vm_page_hold) if the page protection matches the given protection * * @param _pmap physical map * @param _va mapping virtual address * @param _prot protection used to determine if physical page * should be locked * * @retval NULL No mapping found * @retval page Pointer to physical page. Held if protections match */ METHOD vm_page_t extract_and_hold { mmu_t _mmu; pmap_t _pmap; vm_offset_t _va; vm_prot_t _prot; }; /** * @brief Increase kernel virtual address space to the given virtual address. * Not really required for PowerPC, so optional unless the MMU implementation * can use it. * * @param _va new upper limit for kernel virtual address space */ METHOD void growkernel { mmu_t _mmu; vm_offset_t _va; } DEFAULT mmu_null_growkernel; /** * @brief Called from vm_mem_init. Zone allocation is available at * this stage so a convenient time to create zones. This routine is * for MMU-implementation convenience and is optional. */ METHOD void init { mmu_t _mmu; } DEFAULT mmu_null_init; /** * @brief Return if the page has been marked by MMU hardware to have been * modified * * @param _pg physical page to test * * @retval boolean TRUE if page has been modified */ METHOD boolean_t is_modified { mmu_t _mmu; vm_page_t _pg; }; /** * @brief Return whether the specified virtual address is a candidate to be * prefaulted in. This routine is optional. * * @param _pmap physical map * @param _va virtual address to test * * @retval boolean TRUE if the address is a candidate. */ METHOD boolean_t is_prefaultable { mmu_t _mmu; pmap_t _pmap; vm_offset_t _va; } DEFAULT mmu_null_is_prefaultable; /** * @brief Return whether or not the specified physical page was referenced * in any physical maps. * * @params _pg physical page * * @retval boolean TRUE if page has been referenced */ METHOD boolean_t is_referenced { mmu_t _mmu; vm_page_t _pg; }; /** * @brief Return a count of referenced bits for a page, clearing those bits. * Not all referenced bits need to be cleared, but it is necessary that 0 * only be returned when there are none set. * * @params _m physical page * * @retval int count of referenced bits */ METHOD int ts_referenced { mmu_t _mmu; vm_page_t _pg; }; /** * @brief Map the requested physical address range into kernel virtual * address space. The value in _virt is taken as a hint. The virtual * address of the range is returned, or NULL if the mapping could not * be created. The range can be direct-mapped if that is supported. * * @param *_virt Hint for start virtual address, and also return * value * @param _start physical address range start * @param _end physical address range end * @param _prot protection of range (currently ignored) * * @retval NULL could not map the area * @retval addr, *_virt mapping start virtual address */ METHOD vm_offset_t map { mmu_t _mmu; vm_offset_t *_virt; vm_paddr_t _start; vm_paddr_t _end; int _prot; }; /** * @brief Used to create a contiguous set of read-only mappings for a * given object to try and eliminate a cascade of on-demand faults as * the object is accessed sequentially. This routine is optional. * * @param _pmap physical map * @param _addr mapping start virtual address * @param _object device-backed V.M. object to be mapped * @param _pindex page-index within object of mapping start * @param _size size in bytes of mapping */ METHOD void object_init_pt { mmu_t _mmu; pmap_t _pmap; vm_offset_t _addr; vm_object_t _object; vm_pindex_t _pindex; vm_size_t _size; } DEFAULT mmu_null_object_init_pt; /** * @brief Used to determine if the specified page has a mapping for the * given physical map, by scanning the list of reverse-mappings from the * page. The list is scanned to a maximum of 16 entries. * * @param _pmap physical map * @param _pg physical page * * @retval bool TRUE if the physical map was found in the first 16 * reverse-map list entries off the physical page. */ METHOD boolean_t page_exists_quick { mmu_t _mmu; pmap_t _pmap; vm_page_t _pg; }; /** * @brief Initialise the machine-dependent section of the physical page * data structure. This routine is optional. * * @param _pg physical page */ METHOD void page_init { mmu_t _mmu; vm_page_t _pg; } DEFAULT mmu_null_page_init; /** * @brief Count the number of managed mappings to the given physical * page that are wired. * * @param _pg physical page * * @retval int the number of wired, managed mappings to the * given physical page */ METHOD int page_wired_mappings { mmu_t _mmu; vm_page_t _pg; }; /** * @brief Initialise a physical map data structure * * @param _pmap physical map */ METHOD void pinit { mmu_t _mmu; pmap_t _pmap; }; /** * @brief Initialise the physical map for process 0, the initial process * in the system. * XXX default to pinit ? * * @param _pmap physical map */ METHOD void pinit0 { mmu_t _mmu; pmap_t _pmap; }; /** * @brief Set the protection for physical pages in the given virtual address * range to the given value. * * @param _pmap physical map * @param _start virtual range start * @param _end virtual range end * @param _prot new page protection */ METHOD void protect { mmu_t _mmu; pmap_t _pmap; vm_offset_t _start; vm_offset_t _end; vm_prot_t _prot; }; /** * @brief Create a mapping in kernel virtual address space for the given array * of wired physical pages. * * @param _start mapping virtual address start * @param *_m array of physical page pointers * @param _count array elements */ METHOD void qenter { mmu_t _mmu; vm_offset_t _start; vm_page_t *_pg; int _count; }; /** * @brief Remove the temporary mappings created by qenter. * * @param _start mapping virtual address start * @param _count number of pages in mapping */ METHOD void qremove { mmu_t _mmu; vm_offset_t _start; int _count; }; /** * @brief Release per-pmap resources, e.g. mutexes, allocated memory etc. There * should be no existing mappings for the physical map at this point * * @param _pmap physical map */ METHOD void release { mmu_t _mmu; pmap_t _pmap; }; /** * @brief Remove all mappings in the given physical map for the start/end * virtual address range. The range will be page-aligned. * * @param _pmap physical map * @param _start mapping virtual address start * @param _end mapping virtual address end */ METHOD void remove { mmu_t _mmu; pmap_t _pmap; vm_offset_t _start; vm_offset_t _end; }; /** * @brief Traverse the reverse-map list off the given physical page and * remove all mappings. Clear the PGA_WRITEABLE attribute from the page. * * @param _pg physical page */ METHOD void remove_all { mmu_t _mmu; vm_page_t _pg; }; /** * @brief Remove all mappings in the given start/end virtual address range * for the given physical map. Similar to the remove method, but it used * when tearing down all mappings in an address space. This method is * optional, since pmap_remove will be called for each valid vm_map in * the address space later. * * @param _pmap physical map * @param _start mapping virtual address start * @param _end mapping virtual address end */ METHOD void remove_pages { mmu_t _mmu; pmap_t _pmap; } DEFAULT mmu_null_remove_pages; /** * @brief Clear the wired attribute from the mappings for the specified range * of addresses in the given pmap. * * @param _pmap physical map * @param _start virtual range start * @param _end virtual range end */ METHOD void unwire { mmu_t _mmu; pmap_t _pmap; vm_offset_t _start; vm_offset_t _end; }; /** * @brief Zero a physical page. It is not assumed that the page is mapped, * so a temporary (or direct) mapping may need to be used. * * @param _pg physical page */ METHOD void zero_page { mmu_t _mmu; vm_page_t _pg; }; /** * @brief Zero a portion of a physical page, starting at a given offset and * for a given size (multiples of 512 bytes for 4k pages). * * @param _pg physical page * @param _off byte offset from start of page * @param _size size of area to zero */ METHOD void zero_page_area { mmu_t _mmu; vm_page_t _pg; int _off; int _size; }; /** * @brief Extract mincore(2) information from a mapping. * * @param _pmap physical map * @param _addr page virtual address * @param _locked_pa page physical address * * @retval 0 no result * @retval non-zero mincore(2) flag values */ METHOD int mincore { mmu_t _mmu; pmap_t _pmap; vm_offset_t _addr; vm_paddr_t *_locked_pa; } DEFAULT mmu_null_mincore; /** * @brief Perform any operations required to allow a physical map to be used * before it's address space is accessed. * * @param _td thread associated with physical map */ METHOD void activate { mmu_t _mmu; struct thread *_td; }; /** * @brief Perform any operations required to deactivate a physical map, * for instance as it is context-switched out. * * @param _td thread associated with physical map */ METHOD void deactivate { mmu_t _mmu; struct thread *_td; } DEFAULT mmu_null_deactivate; /** * @brief Return a hint for the best virtual address to map a tentative * virtual address range in a given VM object. The default is to just * return the given tentative start address. * * @param _obj VM backing object * @param _offset starting offset with the VM object * @param _addr initial guess at virtual address * @param _size size of virtual address range */ METHOD void align_superpage { mmu_t _mmu; vm_object_t _obj; vm_ooffset_t _offset; vm_offset_t *_addr; vm_size_t _size; } DEFAULT mmu_null_align_superpage; /** * INTERNAL INTERFACES */ /** * @brief Bootstrap the VM system. At the completion of this routine, the - * kernel will be running in it's own address space with full control over + * kernel will be running in its own address space with full control over * paging. * * @param _start start of reserved memory (obsolete ???) * @param _end end of reserved memory (obsolete ???) * XXX I think the intent of these was to allow * the memory used by kernel text+data+bss and * loader variables/load-time kld's to be carved out * of available physical mem. * */ METHOD void bootstrap { mmu_t _mmu; vm_offset_t _start; vm_offset_t _end; }; /** * @brief Set up the MMU on the current CPU. Only called by the PMAP layer * for alternate CPUs on SMP systems. * * @param _ap Set to 1 if the CPU being set up is an AP * */ METHOD void cpu_bootstrap { mmu_t _mmu; int _ap; }; /** * @brief Create a kernel mapping for a given physical address range. * Called by bus code on behalf of device drivers. The mapping does not * have to be a virtual address: it can be a direct-mapped physical address * if that is supported by the MMU. * * @param _pa start physical address * @param _size size in bytes of mapping * * @retval addr address of mapping. */ METHOD void * mapdev { mmu_t _mmu; vm_paddr_t _pa; vm_size_t _size; }; /** * @brief Create a kernel mapping for a given physical address range. * Called by bus code on behalf of device drivers. The mapping does not * have to be a virtual address: it can be a direct-mapped physical address * if that is supported by the MMU. * * @param _pa start physical address * @param _size size in bytes of mapping * @param _attr cache attributes * * @retval addr address of mapping. */ METHOD void * mapdev_attr { mmu_t _mmu; vm_paddr_t _pa; vm_size_t _size; vm_memattr_t _attr; } DEFAULT mmu_null_mapdev_attr; /** * @brief Change cache control attributes for a page. Should modify all * mappings for that page. * * @param _m page to modify * @param _ma new cache control attributes */ METHOD void page_set_memattr { mmu_t _mmu; vm_page_t _pg; vm_memattr_t _ma; } DEFAULT mmu_null_page_set_memattr; /** * @brief Remove the mapping created by mapdev. Called when a driver * is unloaded. * * @param _va Mapping address returned from mapdev * @param _size size in bytes of mapping */ METHOD void unmapdev { mmu_t _mmu; vm_offset_t _va; vm_size_t _size; }; /** * @brief Reverse-map a kernel virtual address * * @param _va kernel virtual address to reverse-map * * @retval pa physical address corresponding to mapping */ METHOD vm_paddr_t kextract { mmu_t _mmu; vm_offset_t _va; }; /** * @brief Map a wired page into kernel virtual address space * * @param _va mapping virtual address * @param _pa mapping physical address */ METHOD void kenter { mmu_t _mmu; vm_offset_t _va; vm_paddr_t _pa; }; /** * @brief Map a wired page into kernel virtual address space * * @param _va mapping virtual address * @param _pa mapping physical address * @param _ma mapping cache control attributes */ METHOD void kenter_attr { mmu_t _mmu; vm_offset_t _va; vm_paddr_t _pa; vm_memattr_t _ma; } DEFAULT mmu_null_kenter_attr; /** * @brief Unmap a wired page from kernel virtual address space * * @param _va mapped virtual address */ METHOD void kremove { mmu_t _mmu; vm_offset_t _va; }; /** * @brief Determine if the given physical address range has been direct-mapped. * * @param _pa physical address start * @param _size physical address range size * * @retval bool TRUE if the range is direct-mapped. */ METHOD boolean_t dev_direct_mapped { mmu_t _mmu; vm_paddr_t _pa; vm_size_t _size; }; /** * @brief Enforce instruction cache coherency. Typically called after a * region of memory has been modified and before execution of or within * that region is attempted. Setting breakpoints in a process through * ptrace(2) is one example of when the instruction cache needs to be * made coherent. * * @param _pm the physical map of the virtual address * @param _va the virtual address of the modified region * @param _sz the size of the modified region */ METHOD void sync_icache { mmu_t _mmu; pmap_t _pm; vm_offset_t _va; vm_size_t _sz; }; /** * @brief Create temporary memory mapping for use by dumpsys(). * * @param _pa The physical page to map. * @param _sz The requested size of the mapping. * @param _va The virtual address of the mapping. */ METHOD void dumpsys_map { mmu_t _mmu; vm_paddr_t _pa; size_t _sz; void **_va; }; /** * @brief Remove temporary dumpsys() mapping. * * @param _pa The physical page to map. * @param _sz The requested size of the mapping. * @param _va The virtual address of the mapping. */ METHOD void dumpsys_unmap { mmu_t _mmu; vm_paddr_t _pa; size_t _sz; void *_va; }; /** * @brief Initialize memory chunks for dumpsys. */ METHOD void scan_init { mmu_t _mmu; }; /** * @brief Create a temporary thread-local KVA mapping of a single page. * * @param _pg The physical page to map * * @retval addr The temporary KVA */ METHOD vm_offset_t quick_enter_page { mmu_t _mmu; vm_page_t _pg; }; /** * @brief Undo a mapping created by quick_enter_page * * @param _va The mapped KVA */ METHOD void quick_remove_page { mmu_t _mmu; vm_offset_t _va; }; /** * @brief Change the specified virtual address range's memory type. * * @param _va The virtual base address to change * * @param _sz Size of the region to change * * @param _mode New mode to set on the VA range * * @retval error 0 on success, EINVAL or ENOMEM on error. */ METHOD int change_attr { mmu_t _mmu; vm_offset_t _va; vm_size_t _sz; vm_memattr_t _mode; } DEFAULT mmu_null_change_attr; Index: head/sys/security/mac/mac_inet.c =================================================================== --- head/sys/security/mac/mac_inet.c (revision 308456) +++ head/sys/security/mac/mac_inet.c (revision 308457) @@ -1,507 +1,507 @@ /*- * Copyright (c) 1999-2002, 2007, 2009 Robert N. M. Watson * Copyright (c) 2001 Ilmar S. Habibulin * Copyright (c) 2001-2004 Networks Associates Technology, Inc. * Copyright (c) 2006 SPARTA, Inc. * Copyright (c) 2008 Apple Inc. * All rights reserved. * * This software was developed by Robert Watson and Ilmar Habibulin for the * TrustedBSD Project. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * This software was enhanced by SPARTA ISSO under SPAWAR contract * N66001-04-C-6019 ("SEFOS"). * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct label * mac_inpcb_label_alloc(int flag) { struct label *label; int error; label = mac_labelzone_alloc(flag); if (label == NULL) return (NULL); if (flag & M_WAITOK) MAC_POLICY_CHECK(inpcb_init_label, label, flag); else MAC_POLICY_CHECK_NOSLEEP(inpcb_init_label, label, flag); if (error) { MAC_POLICY_PERFORM_NOSLEEP(inpcb_destroy_label, label); mac_labelzone_free(label); return (NULL); } return (label); } int mac_inpcb_init(struct inpcb *inp, int flag) { if (mac_labeled & MPC_OBJECT_INPCB) { inp->inp_label = mac_inpcb_label_alloc(flag); if (inp->inp_label == NULL) return (ENOMEM); } else inp->inp_label = NULL; return (0); } static struct label * mac_ipq_label_alloc(int flag) { struct label *label; int error; label = mac_labelzone_alloc(flag); if (label == NULL) return (NULL); if (flag & M_WAITOK) MAC_POLICY_CHECK(ipq_init_label, label, flag); else MAC_POLICY_CHECK_NOSLEEP(ipq_init_label, label, flag); if (error) { MAC_POLICY_PERFORM_NOSLEEP(ipq_destroy_label, label); mac_labelzone_free(label); return (NULL); } return (label); } int mac_ipq_init(struct ipq *q, int flag) { if (mac_labeled & MPC_OBJECT_IPQ) { q->ipq_label = mac_ipq_label_alloc(flag); if (q->ipq_label == NULL) return (ENOMEM); } else q->ipq_label = NULL; return (0); } static void mac_inpcb_label_free(struct label *label) { MAC_POLICY_PERFORM_NOSLEEP(inpcb_destroy_label, label); mac_labelzone_free(label); } void mac_inpcb_destroy(struct inpcb *inp) { if (inp->inp_label != NULL) { mac_inpcb_label_free(inp->inp_label); inp->inp_label = NULL; } } static void mac_ipq_label_free(struct label *label) { MAC_POLICY_PERFORM_NOSLEEP(ipq_destroy_label, label); mac_labelzone_free(label); } void mac_ipq_destroy(struct ipq *q) { if (q->ipq_label != NULL) { mac_ipq_label_free(q->ipq_label); q->ipq_label = NULL; } } void mac_inpcb_create(struct socket *so, struct inpcb *inp) { MAC_POLICY_PERFORM_NOSLEEP(inpcb_create, so, so->so_label, inp, inp->inp_label); } void mac_ipq_reassemble(struct ipq *q, struct mbuf *m) { struct label *label; if (mac_policy_count == 0) return; label = mac_mbuf_to_label(m); MAC_POLICY_PERFORM_NOSLEEP(ipq_reassemble, q, q->ipq_label, m, label); } void mac_netinet_fragment(struct mbuf *m, struct mbuf *frag) { struct label *mlabel, *fraglabel; if (mac_policy_count == 0) return; mlabel = mac_mbuf_to_label(m); fraglabel = mac_mbuf_to_label(frag); MAC_POLICY_PERFORM_NOSLEEP(netinet_fragment, m, mlabel, frag, fraglabel); } void mac_ipq_create(struct mbuf *m, struct ipq *q) { struct label *label; if (mac_policy_count == 0) return; label = mac_mbuf_to_label(m); MAC_POLICY_PERFORM_NOSLEEP(ipq_create, m, label, q, q->ipq_label); } void mac_inpcb_create_mbuf(struct inpcb *inp, struct mbuf *m) { struct label *mlabel; INP_LOCK_ASSERT(inp); if (mac_policy_count == 0) return; mlabel = mac_mbuf_to_label(m); MAC_POLICY_PERFORM_NOSLEEP(inpcb_create_mbuf, inp, inp->inp_label, m, mlabel); } int mac_ipq_match(struct mbuf *m, struct ipq *q) { struct label *label; int result; if (mac_policy_count == 0) return (1); label = mac_mbuf_to_label(m); result = 1; MAC_POLICY_BOOLEAN_NOSLEEP(ipq_match, &&, m, label, q, q->ipq_label); return (result); } void mac_netinet_arp_send(struct ifnet *ifp, struct mbuf *m) { struct label *mlabel; if (mac_policy_count == 0) return; mlabel = mac_mbuf_to_label(m); MAC_IFNET_LOCK(ifp); MAC_POLICY_PERFORM_NOSLEEP(netinet_arp_send, ifp, ifp->if_label, m, mlabel); MAC_IFNET_UNLOCK(ifp); } void mac_netinet_icmp_reply(struct mbuf *mrecv, struct mbuf *msend) { struct label *mrecvlabel, *msendlabel; if (mac_policy_count == 0) return; mrecvlabel = mac_mbuf_to_label(mrecv); msendlabel = mac_mbuf_to_label(msend); MAC_POLICY_PERFORM_NOSLEEP(netinet_icmp_reply, mrecv, mrecvlabel, msend, msendlabel); } void mac_netinet_icmp_replyinplace(struct mbuf *m) { struct label *label; if (mac_policy_count == 0) return; label = mac_mbuf_to_label(m); MAC_POLICY_PERFORM_NOSLEEP(netinet_icmp_replyinplace, m, label); } void mac_netinet_igmp_send(struct ifnet *ifp, struct mbuf *m) { struct label *mlabel; if (mac_policy_count == 0) return; mlabel = mac_mbuf_to_label(m); MAC_IFNET_LOCK(ifp); MAC_POLICY_PERFORM_NOSLEEP(netinet_igmp_send, ifp, ifp->if_label, m, mlabel); MAC_IFNET_UNLOCK(ifp); } void mac_netinet_tcp_reply(struct mbuf *m) { struct label *label; if (mac_policy_count == 0) return; label = mac_mbuf_to_label(m); MAC_POLICY_PERFORM_NOSLEEP(netinet_tcp_reply, m, label); } void mac_ipq_update(struct mbuf *m, struct ipq *q) { struct label *label; if (mac_policy_count == 0) return; label = mac_mbuf_to_label(m); MAC_POLICY_PERFORM_NOSLEEP(ipq_update, m, label, q, q->ipq_label); } MAC_CHECK_PROBE_DEFINE2(inpcb_check_deliver, "struct inpcb *", "struct mbuf *"); int mac_inpcb_check_deliver(struct inpcb *inp, struct mbuf *m) { struct label *label; int error; M_ASSERTPKTHDR(m); if (mac_policy_count == 0) return (0); label = mac_mbuf_to_label(m); MAC_POLICY_CHECK_NOSLEEP(inpcb_check_deliver, inp, inp->inp_label, m, label); MAC_CHECK_PROBE2(inpcb_check_deliver, error, inp, m); return (error); } MAC_CHECK_PROBE_DEFINE2(inpcb_check_visible, "struct ucred *", "struct inpcb *"); int mac_inpcb_check_visible(struct ucred *cred, struct inpcb *inp) { int error; INP_LOCK_ASSERT(inp); MAC_POLICY_CHECK_NOSLEEP(inpcb_check_visible, cred, inp, inp->inp_label); MAC_CHECK_PROBE2(inpcb_check_visible, error, cred, inp); return (error); } void mac_inpcb_sosetlabel(struct socket *so, struct inpcb *inp) { INP_WLOCK_ASSERT(inp); SOCK_LOCK_ASSERT(so); MAC_POLICY_PERFORM_NOSLEEP(inpcb_sosetlabel, so, so->so_label, inp, inp->inp_label); } void mac_netinet_firewall_reply(struct mbuf *mrecv, struct mbuf *msend) { struct label *mrecvlabel, *msendlabel; M_ASSERTPKTHDR(mrecv); M_ASSERTPKTHDR(msend); if (mac_policy_count == 0) return; mrecvlabel = mac_mbuf_to_label(mrecv); msendlabel = mac_mbuf_to_label(msend); MAC_POLICY_PERFORM_NOSLEEP(netinet_firewall_reply, mrecv, mrecvlabel, msend, msendlabel); } void mac_netinet_firewall_send(struct mbuf *m) { struct label *label; M_ASSERTPKTHDR(m); if (mac_policy_count == 0) return; label = mac_mbuf_to_label(m); MAC_POLICY_PERFORM_NOSLEEP(netinet_firewall_send, m, label); } /* * These functions really should be referencing the syncache structure * instead of the label. However, due to some of the complexities associated - * with exposing this syncache structure we operate directly on it's label + * with exposing this syncache structure we operate directly on its label * pointer. This should be OK since we aren't making any access control * decisions within this code directly, we are merely allocating and copying * label storage so we can properly initialize mbuf labels for any packets * the syncache code might create. */ void mac_syncache_destroy(struct label **label) { if (*label != NULL) { MAC_POLICY_PERFORM_NOSLEEP(syncache_destroy_label, *label); mac_labelzone_free(*label); *label = NULL; } } int mac_syncache_init(struct label **label) { int error; if (mac_labeled & MPC_OBJECT_SYNCACHE) { *label = mac_labelzone_alloc(M_NOWAIT); if (*label == NULL) return (ENOMEM); /* * Since we are holding the inpcb locks the policy can not * allocate policy specific label storage using M_WAITOK. So * we need to do a MAC_CHECK instead of the typical * MAC_PERFORM so we can propagate allocation failures back * to the syncache code. */ MAC_POLICY_CHECK_NOSLEEP(syncache_init_label, *label, M_NOWAIT); if (error) { MAC_POLICY_PERFORM_NOSLEEP(syncache_destroy_label, *label); mac_labelzone_free(*label); } return (error); } else *label = NULL; return (0); } void mac_syncache_create(struct label *label, struct inpcb *inp) { INP_WLOCK_ASSERT(inp); MAC_POLICY_PERFORM_NOSLEEP(syncache_create, label, inp); } void mac_syncache_create_mbuf(struct label *sc_label, struct mbuf *m) { struct label *mlabel; M_ASSERTPKTHDR(m); if (mac_policy_count == 0) return; mlabel = mac_mbuf_to_label(m); MAC_POLICY_PERFORM_NOSLEEP(syncache_create_mbuf, sc_label, m, mlabel); } Index: head/sys/vm/vm_pageout.c =================================================================== --- head/sys/vm/vm_pageout.c (revision 308456) +++ head/sys/vm/vm_pageout.c (revision 308457) @@ -1,1857 +1,1857 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005 Yahoo! Technologies Norway AS * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * The proverbial page-out daemon. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static void vm_pageout_init(void); static int vm_pageout_clean(vm_page_t m); static int vm_pageout_cluster(vm_page_t m); static bool vm_pageout_scan(struct vm_domain *vmd, int pass); static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage); SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init, NULL); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &page_kp); SDT_PROVIDER_DEFINE(vm); SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan); #if !defined(NO_SWAPPING) /* the kernel process "vm_daemon"*/ static void vm_daemon(void); static struct proc *vmproc; static struct kproc_desc vm_kp = { "vmdaemon", vm_daemon, &vmproc }; SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); #endif int vm_pageout_deficit; /* Estimated number of pages deficit */ u_int vm_pageout_wakeup_thresh; static int vm_pageout_oom_seq = 12; bool vm_pageout_wanted; /* Event on which pageout daemon sleeps */ bool vm_pages_needed; /* Are threads waiting for free pages? */ #if !defined(NO_SWAPPING) static int vm_pageout_req_swapout; /* XXX */ static int vm_daemon_needed; static struct mtx vm_daemon_mtx; /* Allow for use by vm_pageout before vm_daemon is initialized. */ MTX_SYSINIT(vm_daemon, &vm_daemon_mtx, "vm daemon", MTX_DEF); #endif static int vm_max_launder = 32; static int vm_pageout_update_period; static int defer_swap_pageouts; static int disable_swap_pageouts; static int lowmem_period = 10; static time_t lowmem_uptime; #if defined(NO_SWAPPING) static int vm_swap_enabled = 0; static int vm_swap_idle_enabled = 0; #else static int vm_swap_enabled = 1; static int vm_swap_idle_enabled = 0; #endif static int vm_panic_on_oom = 0; SYSCTL_INT(_vm, OID_AUTO, panic_on_oom, CTLFLAG_RWTUN, &vm_panic_on_oom, 0, "panic on out of memory instead of killing the largest process"); SYSCTL_INT(_vm, OID_AUTO, pageout_wakeup_thresh, CTLFLAG_RW, &vm_pageout_wakeup_thresh, 0, "free page threshold for waking up the pageout daemon"); SYSCTL_INT(_vm, OID_AUTO, max_launder, CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, CTLFLAG_RW, &vm_pageout_update_period, 0, "Maximum active LRU update period"); SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RW, &lowmem_period, 0, "Low memory callback period"); #if defined(NO_SWAPPING) SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RD, &vm_swap_enabled, 0, "Enable entire process swapout"); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RD, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); #else SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); #endif SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); static int pageout_lock_miss; SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq, CTLFLAG_RW, &vm_pageout_oom_seq, 0, "back-to-back calls to oom detector to start OOM"); #define VM_PAGEOUT_PAGE_COUNT 16 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; int vm_page_max_wired; /* XXX max # of wired pages system-wide */ SYSCTL_INT(_vm, OID_AUTO, max_wired, CTLFLAG_RW, &vm_page_max_wired, 0, "System-wide limit to wired page count"); static boolean_t vm_pageout_fallback_object_lock(vm_page_t, vm_page_t *); #if !defined(NO_SWAPPING) static void vm_pageout_map_deactivate_pages(vm_map_t, long); static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long); static void vm_req_vmdaemon(int req); #endif static boolean_t vm_pageout_page_lock(vm_page_t, vm_page_t *); /* * Initialize a dummy page for marking the caller's place in the specified * paging queue. In principle, this function only needs to set the flag * PG_MARKER. Nonetheless, it write busies and initializes the hold count * to one as safety precautions. */ static void vm_pageout_init_marker(vm_page_t marker, u_short queue) { bzero(marker, sizeof(*marker)); marker->flags = PG_MARKER; marker->busy_lock = VPB_SINGLE_EXCLUSIVER; marker->queue = queue; marker->hold_count = 1; } /* * vm_pageout_fallback_object_lock: * * Lock vm object currently associated with `m'. VM_OBJECT_TRYWLOCK is * known to have failed and page queue must be either PQ_ACTIVE or * PQ_INACTIVE. To avoid lock order violation, unlock the page queue * while locking the vm object. Use marker page to detect page queue * changes and maintain notion of next page on page queue. Return * TRUE if no changes were detected, FALSE otherwise. vm object is * locked on return. * * This function depends on both the lock portion of struct vm_object * and normal struct vm_page being type stable. */ static boolean_t vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next) { struct vm_page marker; struct vm_pagequeue *pq; boolean_t unchanged; u_short queue; vm_object_t object; queue = m->queue; vm_pageout_init_marker(&marker, queue); pq = vm_page_pagequeue(m); object = m->object; TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q); vm_pagequeue_unlock(pq); vm_page_unlock(m); VM_OBJECT_WLOCK(object); vm_page_lock(m); vm_pagequeue_lock(pq); /* * The page's object might have changed, and/or the page might * have moved from its original position in the queue. If the * page's object has changed, then the caller should abandon * processing the page because the wrong object lock was * acquired. Use the marker's plinks.q, not the page's, to * determine if the page has been moved. The state of the * page's plinks.q can be indeterminate; whereas, the marker's * plinks.q must be valid. */ *next = TAILQ_NEXT(&marker, plinks.q); unchanged = m->object == object && m == TAILQ_PREV(&marker, pglist, plinks.q); KASSERT(!unchanged || m->queue == queue, ("page %p queue %d %d", m, queue, m->queue)); TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q); return (unchanged); } /* * Lock the page while holding the page queue lock. Use marker page * to detect page queue changes and maintain notion of next page on * page queue. Return TRUE if no changes were detected, FALSE * otherwise. The page is locked on return. The page queue lock might * be dropped and reacquired. * * This function depends on normal struct vm_page being type stable. */ static boolean_t vm_pageout_page_lock(vm_page_t m, vm_page_t *next) { struct vm_page marker; struct vm_pagequeue *pq; boolean_t unchanged; u_short queue; vm_page_lock_assert(m, MA_NOTOWNED); if (vm_page_trylock(m)) return (TRUE); queue = m->queue; vm_pageout_init_marker(&marker, queue); pq = vm_page_pagequeue(m); TAILQ_INSERT_AFTER(&pq->pq_pl, m, &marker, plinks.q); vm_pagequeue_unlock(pq); vm_page_lock(m); vm_pagequeue_lock(pq); /* Page queue might have changed. */ *next = TAILQ_NEXT(&marker, plinks.q); unchanged = m == TAILQ_PREV(&marker, pglist, plinks.q); KASSERT(!unchanged || m->queue == queue, ("page %p queue %d %d", m, queue, m->queue)); TAILQ_REMOVE(&pq->pq_pl, &marker, plinks.q); return (unchanged); } /* * Scan for pages at adjacent offsets within the given page's object that are * eligible for laundering, form a cluster of these pages and the given page, * and launder that cluster. */ static int vm_pageout_cluster(vm_page_t m) { vm_object_t object; vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps; vm_pindex_t pindex; int ib, is, page_base, pageout_count; vm_page_assert_locked(m); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); pindex = m->pindex; /* * We can't clean the page if it is busy or held. */ vm_page_assert_unbusied(m); KASSERT(m->hold_count == 0, ("page %p is held", m)); vm_page_unlock(m); mc[vm_pageout_page_count] = pb = ps = m; pageout_count = 1; page_base = vm_pageout_page_count; ib = 1; is = 1; /* * We can cluster only if the page is not clean, busy, or held, and * the page is inactive. * * During heavy mmap/modification loads the pageout * daemon can really fragment the underlying file * due to flushing pages out of order and not trying to * align the clusters (which leaves sporadic out-of-order * holes). To solve this problem we do the reverse scan * first and attempt to align our cluster, then do a * forward scan if room remains. */ more: while (ib != 0 && pageout_count < vm_pageout_page_count) { if (ib > pindex) { ib = 0; break; } if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) { ib = 0; break; } vm_page_test_dirty(p); if (p->dirty == 0) { ib = 0; break; } vm_page_lock(p); if (p->queue != PQ_INACTIVE || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); ib = 0; break; } vm_page_unlock(p); mc[--page_base] = pb = p; ++pageout_count; ++ib; /* * We are at an alignment boundary. Stop here, and switch * directions. Do not clear ib. */ if ((pindex - (ib - 1)) % vm_pageout_page_count == 0) break; } while (pageout_count < vm_pageout_page_count && pindex + is < object->size) { if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p)) break; vm_page_test_dirty(p); if (p->dirty == 0) break; vm_page_lock(p); if (p->queue != PQ_INACTIVE || p->hold_count != 0) { /* may be undergoing I/O */ vm_page_unlock(p); break; } vm_page_unlock(p); mc[page_base + pageout_count] = ps = p; ++pageout_count; ++is; } /* * If we exhausted our forward scan, continue with the reverse scan * when possible, even past an alignment boundary. This catches * boundary conditions. */ if (ib != 0 && pageout_count < vm_pageout_page_count) goto more; return (vm_pageout_flush(&mc[page_base], pageout_count, 0, 0, NULL, NULL)); } /* * vm_pageout_flush() - launder the given pages * * The given pages are laundered. Note that we setup for the start of * I/O ( i.e. busy the page ), mark it read-only, and bump the object * reference count all in here rather then in the parent. If we want * the parent to do more sophisticated things we may have to change * the ordering. * * Returned runlen is the count of pages between mreq and first * page after mreq with status VM_PAGER_AGAIN. * *eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL * for any page in runlen set. */ int vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen, boolean_t *eio) { vm_object_t object = mc[0]->object; int pageout_status[count]; int numpagedout = 0; int i, runlen; VM_OBJECT_ASSERT_WLOCKED(object); /* * Initiate I/O. Bump the vm_page_t->busy counter and * mark the pages read-only. * * We do not have to fixup the clean/dirty bits here... we can * allow the pager to do it after the I/O completes. * * NOTE! mc[i]->dirty may be partial or fragmented due to an * edge case with file fragments. */ for (i = 0; i < count; i++) { KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush: partially invalid page %p index %d/%d", mc[i], i, count)); vm_page_sbusy(mc[i]); pmap_remove_write(mc[i]); } vm_object_pip_add(object, count); vm_pager_put_pages(object, mc, count, flags, pageout_status); runlen = count - mreq; if (eio != NULL) *eio = FALSE; for (i = 0; i < count; i++) { vm_page_t mt = mc[i]; KASSERT(pageout_status[i] == VM_PAGER_PEND || !pmap_page_is_write_mapped(mt), ("vm_pageout_flush: page %p is not write protected", mt)); switch (pageout_status[i]) { case VM_PAGER_OK: case VM_PAGER_PEND: numpagedout++; break; case VM_PAGER_BAD: /* * Page outside of range of object. Right now we * essentially lose the changes by pretending it * worked. */ vm_page_undirty(mt); break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* * If page couldn't be paged out, then reactivate the * page so it doesn't clog the inactive list. (We * will try paging out it again later). */ vm_page_lock(mt); vm_page_activate(mt); vm_page_unlock(mt); if (eio != NULL && i >= mreq && i - mreq < runlen) *eio = TRUE; break; case VM_PAGER_AGAIN: if (i >= mreq && i - mreq < runlen) runlen = i - mreq; break; } /* * If the operation is still going, leave the page busy to * block all other accesses. Also, leave the paging in * progress indicator set so that we don't attempt an object * collapse. */ if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_sunbusy(mt); } } if (prunlen != NULL) *prunlen = runlen; return (numpagedout); } #if !defined(NO_SWAPPING) /* * vm_pageout_object_deactivate_pages * * Deactivate enough pages to satisfy the inactive target * requirements. * * The object and map must be locked. */ static void vm_pageout_object_deactivate_pages(pmap_t pmap, vm_object_t first_object, long desired) { vm_object_t backing_object, object; vm_page_t p; int act_delta, remove_mode; VM_OBJECT_ASSERT_LOCKED(first_object); if ((first_object->flags & OBJ_FICTITIOUS) != 0) return; for (object = first_object;; object = backing_object) { if (pmap_resident_count(pmap) <= desired) goto unlock_return; VM_OBJECT_ASSERT_LOCKED(object); if ((object->flags & OBJ_UNMANAGED) != 0 || object->paging_in_progress != 0) goto unlock_return; remove_mode = 0; if (object->shadow_count > 1) remove_mode = 1; /* * Scan the object's entire memory queue. */ TAILQ_FOREACH(p, &object->memq, listq) { if (pmap_resident_count(pmap) <= desired) goto unlock_return; if (vm_page_busied(p)) continue; PCPU_INC(cnt.v_pdpages); vm_page_lock(p); if (p->wire_count != 0 || p->hold_count != 0 || !pmap_page_exists_quick(pmap, p)) { vm_page_unlock(p); continue; } act_delta = pmap_ts_referenced(p); if ((p->aflags & PGA_REFERENCED) != 0) { if (act_delta == 0) act_delta = 1; vm_page_aflag_clear(p, PGA_REFERENCED); } if (p->queue != PQ_ACTIVE && act_delta != 0) { vm_page_activate(p); p->act_count += act_delta; } else if (p->queue == PQ_ACTIVE) { if (act_delta == 0) { p->act_count -= min(p->act_count, ACT_DECLINE); if (!remove_mode && p->act_count == 0) { pmap_remove_all(p); vm_page_deactivate(p); } else vm_page_requeue(p); } else { vm_page_activate(p); if (p->act_count < ACT_MAX - ACT_ADVANCE) p->act_count += ACT_ADVANCE; vm_page_requeue(p); } } else if (p->queue == PQ_INACTIVE) pmap_remove_all(p); vm_page_unlock(p); } if ((backing_object = object->backing_object) == NULL) goto unlock_return; VM_OBJECT_RLOCK(backing_object); if (object != first_object) VM_OBJECT_RUNLOCK(object); } unlock_return: if (object != first_object) VM_OBJECT_RUNLOCK(object); } /* * deactivate some number of pages in a map, try to do it fairly, but * that is really hard to do. */ static void vm_pageout_map_deactivate_pages(map, desired) vm_map_t map; long desired; { vm_map_entry_t tmpe; vm_object_t obj, bigobj; int nothingwired; if (!vm_map_trylock(map)) return; bigobj = NULL; nothingwired = TRUE; /* * first, search out the biggest object, and try to free pages from * that. */ tmpe = map->header.next; while (tmpe != &map->header) { if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL && VM_OBJECT_TRYRLOCK(obj)) { if (obj->shadow_count <= 1 && (bigobj == NULL || bigobj->resident_page_count < obj->resident_page_count)) { if (bigobj != NULL) VM_OBJECT_RUNLOCK(bigobj); bigobj = obj; } else VM_OBJECT_RUNLOCK(obj); } } if (tmpe->wired_count > 0) nothingwired = FALSE; tmpe = tmpe->next; } if (bigobj != NULL) { vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired); VM_OBJECT_RUNLOCK(bigobj); } /* * Next, hunt around for other pages to deactivate. We actually * do this search sort of wrong -- .text first is not the best idea. */ tmpe = map->header.next; while (tmpe != &map->header) { if (pmap_resident_count(vm_map_pmap(map)) <= desired) break; if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL) { VM_OBJECT_RLOCK(obj); vm_pageout_object_deactivate_pages(map->pmap, obj, desired); VM_OBJECT_RUNLOCK(obj); } } tmpe = tmpe->next; } /* * Remove all mappings if a process is swapped out, this will free page * table pages. */ if (desired == 0 && nothingwired) { pmap_remove(vm_map_pmap(map), vm_map_min(map), vm_map_max(map)); } vm_map_unlock(map); } #endif /* !defined(NO_SWAPPING) */ /* * Attempt to acquire all of the necessary locks to launder a page and * then call through the clustering layer to PUTPAGES. Wait a short * time for a vnode lock. * * Requires the page and object lock on entry, releases both before return. * Returns 0 on success and an errno otherwise. */ static int vm_pageout_clean(vm_page_t m) { struct vnode *vp; struct mount *mp; vm_object_t object; vm_pindex_t pindex; int error, lockmode; vm_page_assert_locked(m); object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); error = 0; vp = NULL; mp = NULL; /* * The object is already known NOT to be dead. It * is possible for the vget() to block the whole * pageout daemon, but the new low-memory handling * code should prevent it. * * We can't wait forever for the vnode lock, we might * deadlock due to a vn_read() getting stuck in * vm_wait while holding this vnode. We skip the * vnode if we can't get it in a reasonable amount * of time. */ if (object->type == OBJT_VNODE) { vm_page_unlock(m); vp = object->handle; if (vp->v_type == VREG && vn_start_write(vp, &mp, V_NOWAIT) != 0) { mp = NULL; error = EDEADLK; goto unlock_all; } KASSERT(mp != NULL, ("vp %p with NULL v_mount", vp)); vm_object_reference_locked(object); pindex = m->pindex; VM_OBJECT_WUNLOCK(object); lockmode = MNT_SHARED_WRITES(vp->v_mount) ? LK_SHARED : LK_EXCLUSIVE; if (vget(vp, lockmode | LK_TIMELOCK, curthread)) { vp = NULL; error = EDEADLK; goto unlock_mp; } VM_OBJECT_WLOCK(object); vm_page_lock(m); /* * While the object and page were unlocked, the page * may have been: * (1) moved to a different queue, * (2) reallocated to a different object, * (3) reallocated to a different offset, or * (4) cleaned. */ if (m->queue != PQ_INACTIVE || m->object != object || m->pindex != pindex || m->dirty == 0) { vm_page_unlock(m); error = ENXIO; goto unlock_all; } /* * The page may have been busied or held while the object * and page locks were released. */ if (vm_page_busied(m) || m->hold_count != 0) { vm_page_unlock(m); error = EBUSY; goto unlock_all; } } /* * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we * start the cleaning operation. */ if (vm_pageout_cluster(m) == 0) error = EIO; unlock_all: VM_OBJECT_WUNLOCK(object); unlock_mp: vm_page_lock_assert(m, MA_NOTOWNED); if (mp != NULL) { if (vp != NULL) vput(vp); vm_object_deallocate(object); vn_finished_write(mp); } return (error); } /* * vm_pageout_scan does the dirty work for the pageout daemon. * * pass 0 - Update active LRU/deactivate pages * pass 1 - Free inactive pages * pass 2 - Launder dirty pages * * Returns true if pass was zero or enough pages were freed by the inactive * queue scan to meet the target. */ static bool vm_pageout_scan(struct vm_domain *vmd, int pass) { vm_page_t m, next; struct vm_pagequeue *pq; vm_object_t object; long min_scan; int act_delta, addl_page_shortage, deficit, error, inactq_shortage; int maxlaunder, maxscan, page_shortage, scan_tick, scanned; int starting_page_shortage, vnodes_skipped; boolean_t pageout_ok, queue_locked; /* * If we need to reclaim memory ask kernel caches to return * some. We rate limit to avoid thrashing. */ if (vmd == &vm_dom[0] && pass > 0 && (time_uptime - lowmem_uptime) >= lowmem_period) { /* * Decrease registered cache sizes. */ SDT_PROBE0(vm, , , vm__lowmem_scan); EVENTHANDLER_INVOKE(vm_lowmem, 0); /* * We do this explicitly after the caches have been * drained above. */ uma_reclaim(); lowmem_uptime = time_uptime; } /* * The addl_page_shortage is the number of temporarily * stuck pages in the inactive queue. In other words, the * number of pages from the inactive count that should be * discounted in setting the target for the active queue scan. */ addl_page_shortage = 0; /* * Calculate the number of pages that we want to free. This number * can be negative if many pages are freed between the wakeup call to * the page daemon and this calculation. */ if (pass > 0) { deficit = atomic_readandclear_int(&vm_pageout_deficit); page_shortage = vm_paging_target() + deficit; } else page_shortage = deficit = 0; starting_page_shortage = page_shortage; /* * maxlaunder limits the number of dirty pages we flush per scan. * For most systems a smaller value (16 or 32) is more robust under * extreme memory and disk pressure because any unnecessary writes * to disk can result in extreme performance degredation. However, * systems with excessive dirty pages (especially when MAP_NOSYNC is * used) will die horribly with limited laundering. If the pageout * daemon cannot clean enough pages in the first pass, we let it go * all out in succeeding passes. */ if ((maxlaunder = vm_max_launder) <= 1) maxlaunder = 1; if (pass > 1) maxlaunder = 10000; vnodes_skipped = 0; /* * Start scanning the inactive queue for pages that we can free. The * scan will stop when we reach the target or we have scanned the * entire queue. (Note that m->act_count is not used to make * decisions for the inactive queue, only for the active queue.) */ pq = &vmd->vmd_pagequeues[PQ_INACTIVE]; maxscan = pq->pq_cnt; vm_pagequeue_lock(pq); queue_locked = TRUE; for (m = TAILQ_FIRST(&pq->pq_pl); m != NULL && maxscan-- > 0 && page_shortage > 0; m = next) { vm_pagequeue_assert_locked(pq); KASSERT(queue_locked, ("unlocked inactive queue")); KASSERT(m->queue == PQ_INACTIVE, ("Inactive queue %p", m)); PCPU_INC(cnt.v_pdpages); next = TAILQ_NEXT(m, plinks.q); /* * skip marker pages */ if (m->flags & PG_MARKER) continue; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in inactive queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("Unmanaged page %p cannot be in inactive queue", m)); /* * The page or object lock acquisitions fail if the * page was removed from the queue or moved to a * different position within the queue. In either * case, addl_page_shortage should not be incremented. */ if (!vm_pageout_page_lock(m, &next)) goto unlock_page; else if (m->hold_count != 0) { /* * Held pages are essentially stuck in the * queue. So, they ought to be discounted * from the inactive count. See the * calculation of inactq_shortage before the * loop over the active queue below. */ addl_page_shortage++; goto unlock_page; } object = m->object; if (!VM_OBJECT_TRYWLOCK(object)) { if (!vm_pageout_fallback_object_lock(m, &next)) goto unlock_object; else if (m->hold_count != 0) { addl_page_shortage++; goto unlock_object; } } if (vm_page_busied(m)) { /* * Don't mess with busy pages. Leave them at * the front of the queue. Most likely, they * are being paged out and will leave the * queue shortly after the scan finishes. So, * they ought to be discounted from the * inactive count. */ addl_page_shortage++; unlock_object: VM_OBJECT_WUNLOCK(object); unlock_page: vm_page_unlock(m); continue; } KASSERT(m->hold_count == 0, ("Held page %p", m)); /* * We unlock the inactive page queue, invalidating the * 'next' pointer. Use our marker to remember our * place. */ TAILQ_INSERT_AFTER(&pq->pq_pl, m, &vmd->vmd_marker, plinks.q); vm_pagequeue_unlock(pq); queue_locked = FALSE; /* * Invalid pages can be easily freed. They cannot be * mapped, vm_page_free() asserts this. */ if (m->valid == 0) goto free_page; /* * If the page has been referenced and the object is not dead, * reactivate or requeue the page depending on whether the * object is mapped. */ if ((m->aflags & PGA_REFERENCED) != 0) { vm_page_aflag_clear(m, PGA_REFERENCED); act_delta = 1; } else act_delta = 0; if (object->ref_count != 0) { act_delta += pmap_ts_referenced(m); } else { KASSERT(!pmap_page_is_mapped(m), ("vm_pageout_scan: page %p is mapped", m)); } if (act_delta != 0) { if (object->ref_count != 0) { vm_page_activate(m); /* * Increase the activation count if the page * was referenced while in the inactive queue. * This makes it less likely that the page will * be returned prematurely to the inactive * queue. */ m->act_count += act_delta + ACT_ADVANCE; goto drop_page; } else if ((object->flags & OBJ_DEAD) == 0) goto requeue_page; } /* * If the page appears to be clean at the machine-independent * layer, then remove all of its mappings from the pmap in * anticipation of freeing it. If, however, any of the page's * mappings allow write access, then the page may still be * modified until the last of those mappings are removed. */ if (object->ref_count != 0) { vm_page_test_dirty(m); if (m->dirty == 0) pmap_remove_all(m); } if (m->dirty == 0) { /* * Clean pages can be freed. */ free_page: vm_page_free(m); PCPU_INC(cnt.v_dfree); --page_shortage; } else if ((object->flags & OBJ_DEAD) != 0) { /* * Leave dirty pages from dead objects at the front of * the queue. They are being paged out and freed by * the thread that destroyed the object. They will * leave the queue shortly after the scan finishes, so * they should be discounted from the inactive count. */ addl_page_shortage++; } else if ((m->flags & PG_WINATCFLS) == 0 && pass < 2) { /* * Dirty pages need to be paged out, but flushing * a page is extremely expensive versus freeing * a clean page. Rather then artificially limiting * the number of pages we can flush, we instead give * dirty pages extra priority on the inactive queue * by forcing them to be cycled through the queue * twice before being flushed, after which the * (now clean) page will cycle through once more * before being freed. This significantly extends * the thrash point for a heavily loaded machine. */ m->flags |= PG_WINATCFLS; requeue_page: vm_pagequeue_lock(pq); queue_locked = TRUE; vm_page_requeue_locked(m); } else if (maxlaunder > 0) { /* * We always want to try to flush some dirty pages if * we encounter them, to keep the system stable. * Normally this number is small, but under extreme * pressure where there are insufficient clean pages * on the inactive queue, we may have to go all out. */ if (object->type != OBJT_SWAP && object->type != OBJT_DEFAULT) pageout_ok = TRUE; else if (disable_swap_pageouts) pageout_ok = FALSE; else if (defer_swap_pageouts) pageout_ok = vm_page_count_min(); else pageout_ok = TRUE; if (!pageout_ok) goto requeue_page; error = vm_pageout_clean(m); /* * Decrement page_shortage on success to account for * the (future) cleaned page. Otherwise we could wind * up laundering or cleaning too many pages. */ if (error == 0) { page_shortage--; maxlaunder--; } else if (error == EDEADLK) { pageout_lock_miss++; vnodes_skipped++; } else if (error == EBUSY) { addl_page_shortage++; } vm_page_lock_assert(m, MA_NOTOWNED); goto relock_queue; } drop_page: vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); relock_queue: if (!queue_locked) { vm_pagequeue_lock(pq); queue_locked = TRUE; } next = TAILQ_NEXT(&vmd->vmd_marker, plinks.q); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_marker, plinks.q); } vm_pagequeue_unlock(pq); #if !defined(NO_SWAPPING) /* * Wakeup the swapout daemon if we didn't free the targeted number of * pages. */ if (vm_swap_enabled && page_shortage > 0) vm_req_vmdaemon(VM_SWAP_NORMAL); #endif /* * Wakeup the sync daemon if we skipped a vnode in a writeable object * and we didn't free enough pages. */ if (vnodes_skipped > 0 && page_shortage > vm_cnt.v_free_target - vm_cnt.v_free_min) (void)speedup_syncer(); /* * If the inactive queue scan fails repeatedly to meet its * target, kill the largest process. */ vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage); /* * Compute the number of pages we want to try to move from the * active queue to the inactive queue. */ inactq_shortage = vm_cnt.v_inactive_target - vm_cnt.v_inactive_count + vm_paging_target() + deficit + addl_page_shortage; pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; vm_pagequeue_lock(pq); maxscan = pq->pq_cnt; /* * If we're just idle polling attempt to visit every * active page within 'update_period' seconds. */ scan_tick = ticks; if (vm_pageout_update_period != 0) { min_scan = pq->pq_cnt; min_scan *= scan_tick - vmd->vmd_last_active_scan; min_scan /= hz * vm_pageout_update_period; } else min_scan = 0; if (min_scan > 0 || (inactq_shortage > 0 && maxscan > 0)) vmd->vmd_last_active_scan = scan_tick; /* * Scan the active queue for pages that can be deactivated. Update * the per-page activity counter and use it to identify deactivation * candidates. Held pages may be deactivated. */ for (m = TAILQ_FIRST(&pq->pq_pl), scanned = 0; m != NULL && (scanned < min_scan || (inactq_shortage > 0 && scanned < maxscan)); m = next, scanned++) { KASSERT(m->queue == PQ_ACTIVE, ("vm_pageout_scan: page %p isn't active", m)); next = TAILQ_NEXT(m, plinks.q); if ((m->flags & PG_MARKER) != 0) continue; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in active queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("Unmanaged page %p cannot be in active queue", m)); if (!vm_pageout_page_lock(m, &next)) { vm_page_unlock(m); continue; } /* * The count for page daemon pages is updated after checking * the page for eligibility. */ PCPU_INC(cnt.v_pdpages); /* * Check to see "how much" the page has been used. */ if ((m->aflags & PGA_REFERENCED) != 0) { vm_page_aflag_clear(m, PGA_REFERENCED); act_delta = 1; } else act_delta = 0; /* * Perform an unsynchronized object ref count check. While * the page lock ensures that the page is not reallocated to * another object, in particular, one with unmanaged mappings * that cannot support pmap_ts_referenced(), two races are, * nonetheless, possible: * 1) The count was transitioning to zero, but we saw a non- * zero value. pmap_ts_referenced() will return zero * because the page is not mapped. * 2) The count was transitioning to one, but we saw zero. * This race delays the detection of a new reference. At * worst, we will deactivate and reactivate the page. */ if (m->object->ref_count != 0) act_delta += pmap_ts_referenced(m); /* * Advance or decay the act_count based on recent usage. */ if (act_delta != 0) { m->act_count += ACT_ADVANCE + act_delta; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; } else m->act_count -= min(m->act_count, ACT_DECLINE); /* * Move this page to the tail of the active or inactive * queue depending on usage. */ if (m->act_count == 0) { /* Dequeue to avoid later lock recursion. */ vm_page_dequeue_locked(m); vm_page_deactivate(m); inactq_shortage--; } else vm_page_requeue_locked(m); vm_page_unlock(m); } vm_pagequeue_unlock(pq); #if !defined(NO_SWAPPING) /* * Idle process swapout -- run once per second when we are reclaiming * pages. */ if (vm_swap_idle_enabled && pass > 0) { static long lsec; if (time_second != lsec) { vm_req_vmdaemon(VM_SWAP_IDLE); lsec = time_second; } } #endif return (page_shortage <= 0); } static int vm_pageout_oom_vote; /* * The pagedaemon threads randlomly select one to perform the * OOM. Trying to kill processes before all pagedaemons * failed to reach free target is premature. */ static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage) { int old_vote; if (starting_page_shortage <= 0 || starting_page_shortage != page_shortage) vmd->vmd_oom_seq = 0; else vmd->vmd_oom_seq++; if (vmd->vmd_oom_seq < vm_pageout_oom_seq) { if (vmd->vmd_oom) { vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); } return; } /* * Do not follow the call sequence until OOM condition is * cleared. */ vmd->vmd_oom_seq = 0; if (vmd->vmd_oom) return; vmd->vmd_oom = TRUE; old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1); if (old_vote != vm_ndomains - 1) return; /* * The current pagedaemon thread is the last in the quorum to * start OOM. Initiate the selection and signaling of the * victim. */ vm_pageout_oom(VM_OOM_MEM); /* * After one round of OOM terror, recall our vote. On the * next pass, current pagedaemon would vote again if the low * memory condition is still there, due to vmd_oom being * false. */ vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); } /* * The OOM killer is the page daemon's action of last resort when * memory allocation requests have been stalled for a prolonged period * of time because it cannot reclaim memory. This function computes * the approximate number of physical pages that could be reclaimed if * the specified address space is destroyed. * * Private, anonymous memory owned by the address space is the * principal resource that we expect to recover after an OOM kill. * Since the physical pages mapped by the address space's COW entries * are typically shared pages, they are unlikely to be released and so * they are not counted. * * To get to the point where the page daemon runs the OOM killer, its * efforts to write-back vnode-backed pages may have stalled. This * could be caused by a memory allocation deadlock in the write path * that might be resolved by an OOM kill. Therefore, physical pages * belonging to vnode-backed objects are counted, because they might * be freed without being written out first if the address space holds * the last reference to an unlinked vnode. * * Similarly, physical pages belonging to OBJT_PHYS objects are * counted because the address space might hold the last reference to * the object. */ static long vm_pageout_oom_pagecount(struct vmspace *vmspace) { vm_map_t map; vm_map_entry_t entry; vm_object_t obj; long res; map = &vmspace->vm_map; KASSERT(!map->system_map, ("system map")); sx_assert(&map->lock, SA_LOCKED); res = 0; for (entry = map->header.next; entry != &map->header; entry = entry->next) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) continue; obj = entry->object.vm_object; if (obj == NULL) continue; if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 && obj->ref_count != 1) continue; switch (obj->type) { case OBJT_DEFAULT: case OBJT_SWAP: case OBJT_PHYS: case OBJT_VNODE: res += obj->resident_page_count; break; } } return (res); } void vm_pageout_oom(int shortage) { struct proc *p, *bigproc; vm_offset_t size, bigsize; struct thread *td; struct vmspace *vm; /* * We keep the process bigproc locked once we find it to keep anyone * from messing with it; however, there is a possibility of - * deadlock if process B is bigproc and one of it's child processes + * deadlock if process B is bigproc and one of its child processes * attempts to propagate a signal to B while we are waiting for A's * lock while walking this list. To avoid this, we don't block on * the process lock but just skip a process if it is already locked. */ bigproc = NULL; bigsize = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { int breakout; PROC_LOCK(p); /* * If this is a system, protected or killed process, skip it. */ if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC | P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 || p->p_pid == 1 || P_KILLED(p) || (p->p_pid < 48 && swap_pager_avail != 0)) { PROC_UNLOCK(p); continue; } /* * If the process is in a non-running type state, * don't touch it. Check all the threads individually. */ breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td) && !TD_IS_SUSPENDED(td) && !TD_IS_SWAPPED(td)) { thread_unlock(td); breakout = 1; break; } thread_unlock(td); } if (breakout) { PROC_UNLOCK(p); continue; } /* * get the process size */ vm = vmspace_acquire_ref(p); if (vm == NULL) { PROC_UNLOCK(p); continue; } _PHOLD_LITE(p); PROC_UNLOCK(p); sx_sunlock(&allproc_lock); if (!vm_map_trylock_read(&vm->vm_map)) { vmspace_free(vm); sx_slock(&allproc_lock); PRELE(p); continue; } size = vmspace_swap_count(vm); if (shortage == VM_OOM_MEM) size += vm_pageout_oom_pagecount(vm); vm_map_unlock_read(&vm->vm_map); vmspace_free(vm); sx_slock(&allproc_lock); /* * If this process is bigger than the biggest one, * remember it. */ if (size > bigsize) { if (bigproc != NULL) PRELE(bigproc); bigproc = p; bigsize = size; } else { PRELE(p); } } sx_sunlock(&allproc_lock); if (bigproc != NULL) { if (vm_panic_on_oom != 0) panic("out of swap space"); PROC_LOCK(bigproc); killproc(bigproc, "out of swap space"); sched_nice(bigproc, PRIO_MIN); _PRELE(bigproc); PROC_UNLOCK(bigproc); wakeup(&vm_cnt.v_free_count); } } static void vm_pageout_worker(void *arg) { struct vm_domain *domain; int domidx, pass; bool target_met; domidx = (uintptr_t)arg; domain = &vm_dom[domidx]; pass = 0; target_met = true; /* * XXXKIB It could be useful to bind pageout daemon threads to * the cores belonging to the domain, from which vm_page_array * is allocated. */ KASSERT(domain->vmd_segs != 0, ("domain without segments")); domain->vmd_last_active_scan = ticks; vm_pageout_init_marker(&domain->vmd_marker, PQ_INACTIVE); vm_pageout_init_marker(&domain->vmd_inacthead, PQ_INACTIVE); TAILQ_INSERT_HEAD(&domain->vmd_pagequeues[PQ_INACTIVE].pq_pl, &domain->vmd_inacthead, plinks.q); /* * The pageout daemon worker is never done, so loop forever. */ while (TRUE) { mtx_lock(&vm_page_queue_free_mtx); /* * Generally, after a level >= 1 scan, if there are enough * free pages to wakeup the waiters, then they are already * awake. A call to vm_page_free() during the scan awakened * them. However, in the following case, this wakeup serves * to bound the amount of time that a thread might wait. * Suppose a thread's call to vm_page_alloc() fails, but * before that thread calls VM_WAIT, enough pages are freed by * other threads to alleviate the free page shortage. The * thread will, nonetheless, wait until another page is freed * or this wakeup is performed. */ if (vm_pages_needed && !vm_page_count_min()) { vm_pages_needed = false; wakeup(&vm_cnt.v_free_count); } /* * Do not clear vm_pageout_wanted until we reach our free page * target. Otherwise, we may be awakened over and over again, * wasting CPU time. */ if (vm_pageout_wanted && target_met) vm_pageout_wanted = false; /* * Might the page daemon receive a wakeup call? */ if (vm_pageout_wanted) { /* * No. Either vm_pageout_wanted was set by another * thread during the previous scan, which must have * been a level 0 scan, or vm_pageout_wanted was * already set and the scan failed to free enough * pages. If we haven't yet performed a level >= 2 * scan (unlimited dirty cleaning), then upgrade the * level and scan again now. Otherwise, sleep a bit * and try again later. */ mtx_unlock(&vm_page_queue_free_mtx); if (pass > 1) pause("psleep", hz / 2); pass++; } else { /* * Yes. Sleep until pages need to be reclaimed or * have their reference stats updated. */ if (mtx_sleep(&vm_pageout_wanted, &vm_page_queue_free_mtx, PDROP | PVM, "psleep", hz) == 0) { PCPU_INC(cnt.v_pdwakeups); pass = 1; } else pass = 0; } target_met = vm_pageout_scan(domain, pass); } } /* * vm_pageout_init initialises basic pageout daemon settings. */ static void vm_pageout_init(void) { /* * Initialize some paging parameters. */ vm_cnt.v_interrupt_free_min = 2; if (vm_cnt.v_page_count < 2000) vm_pageout_page_count = 8; /* * v_free_reserved needs to include enough for the largest * swap pager structures plus enough for any pv_entry structs * when paging. */ if (vm_cnt.v_page_count > 1024) vm_cnt.v_free_min = 4 + (vm_cnt.v_page_count - 1024) / 200; else vm_cnt.v_free_min = 4; vm_cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + vm_cnt.v_interrupt_free_min; vm_cnt.v_free_reserved = vm_pageout_page_count + vm_cnt.v_pageout_free_min + (vm_cnt.v_page_count / 768); vm_cnt.v_free_severe = vm_cnt.v_free_min / 2; vm_cnt.v_free_target = 4 * vm_cnt.v_free_min + vm_cnt.v_free_reserved; vm_cnt.v_free_min += vm_cnt.v_free_reserved; vm_cnt.v_free_severe += vm_cnt.v_free_reserved; vm_cnt.v_inactive_target = (3 * vm_cnt.v_free_target) / 2; if (vm_cnt.v_inactive_target > vm_cnt.v_free_count / 3) vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3; /* * Set the default wakeup threshold to be 10% above the minimum * page limit. This keeps the steady state out of shortfall. */ vm_pageout_wakeup_thresh = (vm_cnt.v_free_min / 10) * 11; /* * Set interval in seconds for active scan. We want to visit each * page at least once every ten minutes. This is to prevent worst * case paging behaviors with stale active LRU. */ if (vm_pageout_update_period == 0) vm_pageout_update_period = 600; /* XXX does not really belong here */ if (vm_page_max_wired == 0) vm_page_max_wired = vm_cnt.v_free_count / 3; } /* * vm_pageout is the high level pageout daemon. */ static void vm_pageout(void) { int error; #ifdef VM_NUMA_ALLOC int i; #endif swap_pager_swap_init(); #ifdef VM_NUMA_ALLOC for (i = 1; i < vm_ndomains; i++) { error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, curproc, NULL, 0, 0, "dom%d", i); if (error != 0) { panic("starting pageout for domain %d, error %d\n", i, error); } } #endif error = kthread_add(uma_reclaim_worker, NULL, curproc, NULL, 0, 0, "uma"); if (error != 0) panic("starting uma_reclaim helper, error %d\n", error); vm_pageout_worker((void *)(uintptr_t)0); } /* * Unless the free page queue lock is held by the caller, this function * should be regarded as advisory. Specifically, the caller should * not msleep() on &vm_cnt.v_free_count following this function unless * the free page queue lock is held until the msleep() is performed. */ void pagedaemon_wakeup(void) { if (!vm_pageout_wanted && curthread->td_proc != pageproc) { vm_pageout_wanted = true; wakeup(&vm_pageout_wanted); } } #if !defined(NO_SWAPPING) static void vm_req_vmdaemon(int req) { static int lastrun = 0; mtx_lock(&vm_daemon_mtx); vm_pageout_req_swapout |= req; if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { wakeup(&vm_daemon_needed); lastrun = ticks; } mtx_unlock(&vm_daemon_mtx); } static void vm_daemon(void) { struct rlimit rsslim; struct proc *p; struct thread *td; struct vmspace *vm; int breakout, swapout_flags, tryagain, attempts; #ifdef RACCT uint64_t rsize, ravailable; #endif while (TRUE) { mtx_lock(&vm_daemon_mtx); msleep(&vm_daemon_needed, &vm_daemon_mtx, PPAUSE, "psleep", #ifdef RACCT racct_enable ? hz : 0 #else 0 #endif ); swapout_flags = vm_pageout_req_swapout; vm_pageout_req_swapout = 0; mtx_unlock(&vm_daemon_mtx); if (swapout_flags) swapout_procs(swapout_flags); /* * scan the processes for exceeding their rlimits or if * process is swapped out -- deactivate pages */ tryagain = 0; attempts = 0; again: attempts++; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { vm_pindex_t limit, size; /* * if this is a system process or if we have already * looked at this process, skip it. */ PROC_LOCK(p); if (p->p_state != PRS_NORMAL || p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) { PROC_UNLOCK(p); continue; } /* * if the process is in a non-running type state, * don't touch it. */ breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td) && !TD_IS_SUSPENDED(td)) { thread_unlock(td); breakout = 1; break; } thread_unlock(td); } if (breakout) { PROC_UNLOCK(p); continue; } /* * get a limit */ lim_rlimit_proc(p, RLIMIT_RSS, &rsslim); limit = OFF_TO_IDX( qmin(rsslim.rlim_cur, rsslim.rlim_max)); /* * let processes that are swapped out really be * swapped out set the limit to nothing (will force a * swap-out.) */ if ((p->p_flag & P_INMEM) == 0) limit = 0; /* XXX */ vm = vmspace_acquire_ref(p); _PHOLD_LITE(p); PROC_UNLOCK(p); if (vm == NULL) { PRELE(p); continue; } sx_sunlock(&allproc_lock); size = vmspace_resident_count(vm); if (size >= limit) { vm_pageout_map_deactivate_pages( &vm->vm_map, limit); } #ifdef RACCT if (racct_enable) { rsize = IDX_TO_OFF(size); PROC_LOCK(p); racct_set(p, RACCT_RSS, rsize); ravailable = racct_get_available(p, RACCT_RSS); PROC_UNLOCK(p); if (rsize > ravailable) { /* * Don't be overly aggressive; this * might be an innocent process, * and the limit could've been exceeded * by some memory hog. Don't try * to deactivate more than 1/4th * of process' resident set size. */ if (attempts <= 8) { if (ravailable < rsize - (rsize / 4)) { ravailable = rsize - (rsize / 4); } } vm_pageout_map_deactivate_pages( &vm->vm_map, OFF_TO_IDX(ravailable)); /* Update RSS usage after paging out. */ size = vmspace_resident_count(vm); rsize = IDX_TO_OFF(size); PROC_LOCK(p); racct_set(p, RACCT_RSS, rsize); PROC_UNLOCK(p); if (rsize > ravailable) tryagain = 1; } } #endif vmspace_free(vm); sx_slock(&allproc_lock); PRELE(p); } sx_sunlock(&allproc_lock); if (tryagain != 0 && attempts <= 10) goto again; } } #endif /* !defined(NO_SWAPPING) */ Index: head/sys/x86/xen/xen_intr.c =================================================================== --- head/sys/x86/xen/xen_intr.c (revision 308456) +++ head/sys/x86/xen/xen_intr.c (revision 308457) @@ -1,1667 +1,1667 @@ /****************************************************************************** * xen_intr.c * * Xen event and interrupt services for x86 HVM guests. * * Copyright (c) 2002-2005, K A Fraser * Copyright (c) 2005, Intel Corporation * Copyright (c) 2012, Spectra Logic Corporation * * This file may be distributed separately from the Linux kernel, or * incorporated into other software packages, subject to the following license: * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this source file (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif static MALLOC_DEFINE(M_XENINTR, "xen_intr", "Xen Interrupt Services"); /** * Per-cpu event channel processing state. */ struct xen_intr_pcpu_data { /** * The last event channel bitmap section (level one bit) processed. * This is used to ensure we scan all ports before * servicing an already servied port again. */ u_int last_processed_l1i; /** * The last event channel processed within the event channel * bitmap being scanned. */ u_int last_processed_l2i; /** Pointer to this CPU's interrupt statistic counter. */ u_long *evtchn_intrcnt; /** * A bitmap of ports that can be serviced from this CPU. * A set bit means interrupt handling is enabled. */ u_long evtchn_enabled[sizeof(u_long) * 8]; }; /* * Start the scan at port 0 by initializing the last scanned * location as the highest numbered event channel port. */ static DPCPU_DEFINE(struct xen_intr_pcpu_data, xen_intr_pcpu) = { .last_processed_l1i = LONG_BIT - 1, .last_processed_l2i = LONG_BIT - 1 }; DPCPU_DECLARE(struct vcpu_info *, vcpu_info); #define XEN_EEXIST 17 /* Xen "already exists" error */ #define XEN_ALLOCATE_VECTOR 0 /* Allocate a vector for this event channel */ #define XEN_INVALID_EVTCHN 0 /* Invalid event channel */ #define is_valid_evtchn(x) ((x) != XEN_INVALID_EVTCHN) struct xenisrc { struct intsrc xi_intsrc; enum evtchn_type xi_type; int xi_cpu; /* VCPU for delivery. */ int xi_vector; /* Global isrc vector number. */ evtchn_port_t xi_port; int xi_pirq; int xi_virq; void *xi_cookie; u_int xi_close:1; /* close on unbind? */ u_int xi_activehi:1; u_int xi_edgetrigger:1; u_int xi_masked:1; volatile u_int xi_refcount; }; static void xen_intr_suspend(struct pic *); static void xen_intr_resume(struct pic *, bool suspend_cancelled); static void xen_intr_enable_source(struct intsrc *isrc); static void xen_intr_disable_source(struct intsrc *isrc, int eoi); static void xen_intr_eoi_source(struct intsrc *isrc); static void xen_intr_enable_intr(struct intsrc *isrc); static void xen_intr_disable_intr(struct intsrc *isrc); static int xen_intr_vector(struct intsrc *isrc); static int xen_intr_source_pending(struct intsrc *isrc); static int xen_intr_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol); static int xen_intr_assign_cpu(struct intsrc *isrc, u_int apic_id); static void xen_intr_pirq_enable_source(struct intsrc *isrc); static void xen_intr_pirq_disable_source(struct intsrc *isrc, int eoi); static void xen_intr_pirq_eoi_source(struct intsrc *isrc); static void xen_intr_pirq_enable_intr(struct intsrc *isrc); static void xen_intr_pirq_disable_intr(struct intsrc *isrc); static int xen_intr_pirq_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol); /** * PIC interface for all event channel port types except physical IRQs. */ struct pic xen_intr_pic = { .pic_enable_source = xen_intr_enable_source, .pic_disable_source = xen_intr_disable_source, .pic_eoi_source = xen_intr_eoi_source, .pic_enable_intr = xen_intr_enable_intr, .pic_disable_intr = xen_intr_disable_intr, .pic_vector = xen_intr_vector, .pic_source_pending = xen_intr_source_pending, .pic_suspend = xen_intr_suspend, .pic_resume = xen_intr_resume, .pic_config_intr = xen_intr_config_intr, .pic_assign_cpu = xen_intr_assign_cpu }; /** * PIC interface for all event channel representing * physical interrupt sources. */ struct pic xen_intr_pirq_pic = { .pic_enable_source = xen_intr_pirq_enable_source, .pic_disable_source = xen_intr_pirq_disable_source, .pic_eoi_source = xen_intr_pirq_eoi_source, .pic_enable_intr = xen_intr_pirq_enable_intr, .pic_disable_intr = xen_intr_pirq_disable_intr, .pic_vector = xen_intr_vector, .pic_source_pending = xen_intr_source_pending, .pic_config_intr = xen_intr_pirq_config_intr, .pic_assign_cpu = xen_intr_assign_cpu }; static struct mtx xen_intr_isrc_lock; static int xen_intr_auto_vector_count; static struct xenisrc *xen_intr_port_to_isrc[NR_EVENT_CHANNELS]; static u_long *xen_intr_pirq_eoi_map; static boolean_t xen_intr_pirq_eoi_map_enabled; /*------------------------- Private Functions --------------------------------*/ /** * Disable signal delivery for an event channel port on the * specified CPU. * * \param port The event channel port to mask. * * This API is used to manage the port<=>CPU binding of event * channel handlers. * * \note This operation does not preclude reception of an event * for this event channel on another CPU. To mask the * event channel globally, use evtchn_mask(). */ static inline void evtchn_cpu_mask_port(u_int cpu, evtchn_port_t port) { struct xen_intr_pcpu_data *pcpu; pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu); xen_clear_bit(port, pcpu->evtchn_enabled); } /** * Enable signal delivery for an event channel port on the * specified CPU. * * \param port The event channel port to unmask. * * This API is used to manage the port<=>CPU binding of event * channel handlers. * * \note This operation does not guarantee that event delivery * is enabled for this event channel port. The port must * also be globally enabled. See evtchn_unmask(). */ static inline void evtchn_cpu_unmask_port(u_int cpu, evtchn_port_t port) { struct xen_intr_pcpu_data *pcpu; pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu); xen_set_bit(port, pcpu->evtchn_enabled); } /** * Allocate and register a per-cpu Xen upcall interrupt counter. * * \param cpu The cpu for which to register this interrupt count. */ static void xen_intr_intrcnt_add(u_int cpu) { char buf[MAXCOMLEN + 1]; struct xen_intr_pcpu_data *pcpu; pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu); if (pcpu->evtchn_intrcnt != NULL) return; snprintf(buf, sizeof(buf), "cpu%d:xen", cpu); intrcnt_add(buf, &pcpu->evtchn_intrcnt); } /** * Search for an already allocated but currently unused Xen interrupt * source object. * * \param type Restrict the search to interrupt sources of the given * type. * * \return A pointer to a free Xen interrupt source object or NULL. */ static struct xenisrc * xen_intr_find_unused_isrc(enum evtchn_type type) { int isrc_idx; KASSERT(mtx_owned(&xen_intr_isrc_lock), ("Evtchn isrc lock not held")); for (isrc_idx = 0; isrc_idx < xen_intr_auto_vector_count; isrc_idx ++) { struct xenisrc *isrc; u_int vector; vector = FIRST_EVTCHN_INT + isrc_idx; isrc = (struct xenisrc *)intr_lookup_source(vector); if (isrc != NULL && isrc->xi_type == EVTCHN_TYPE_UNBOUND) { KASSERT(isrc->xi_intsrc.is_handlers == 0, ("Free evtchn still has handlers")); isrc->xi_type = type; return (isrc); } } return (NULL); } /** * Allocate a Xen interrupt source object. * * \param type The type of interrupt source to create. * * \return A pointer to a newly allocated Xen interrupt source * object or NULL. */ static struct xenisrc * xen_intr_alloc_isrc(enum evtchn_type type, int vector) { static int warned; struct xenisrc *isrc; KASSERT(mtx_owned(&xen_intr_isrc_lock), ("Evtchn alloc lock not held")); if (xen_intr_auto_vector_count > NR_EVENT_CHANNELS) { if (!warned) { warned = 1; printf("xen_intr_alloc: Event channels exhausted.\n"); } return (NULL); } if (type != EVTCHN_TYPE_PIRQ) { vector = FIRST_EVTCHN_INT + xen_intr_auto_vector_count; xen_intr_auto_vector_count++; } KASSERT((intr_lookup_source(vector) == NULL), ("Trying to use an already allocated vector")); mtx_unlock(&xen_intr_isrc_lock); isrc = malloc(sizeof(*isrc), M_XENINTR, M_WAITOK | M_ZERO); isrc->xi_intsrc.is_pic = (type == EVTCHN_TYPE_PIRQ) ? &xen_intr_pirq_pic : &xen_intr_pic; isrc->xi_vector = vector; isrc->xi_type = type; intr_register_source(&isrc->xi_intsrc); mtx_lock(&xen_intr_isrc_lock); return (isrc); } /** * Attempt to free an active Xen interrupt source object. * * \param isrc The interrupt source object to release. * * \returns EBUSY if the source is still in use, otherwise 0. */ static int xen_intr_release_isrc(struct xenisrc *isrc) { mtx_lock(&xen_intr_isrc_lock); KASSERT(isrc->xi_intsrc.is_handlers == 0, ("Release called, but xenisrc still in use")); evtchn_mask_port(isrc->xi_port); evtchn_clear_port(isrc->xi_port); /* Rebind port to CPU 0. */ evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port); evtchn_cpu_unmask_port(0, isrc->xi_port); if (isrc->xi_close != 0 && is_valid_evtchn(isrc->xi_port)) { struct evtchn_close close = { .port = isrc->xi_port }; if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) panic("EVTCHNOP_close failed"); } xen_intr_port_to_isrc[isrc->xi_port] = NULL; isrc->xi_cpu = 0; isrc->xi_type = EVTCHN_TYPE_UNBOUND; isrc->xi_port = 0; isrc->xi_cookie = NULL; mtx_unlock(&xen_intr_isrc_lock); return (0); } /** * Associate an interrupt handler with an already allocated local Xen * event channel port. * * \param isrcp The returned Xen interrupt object associated with * the specified local port. * \param local_port The event channel to bind. * \param type The event channel type of local_port. * \param intr_owner The device making this bind request. * \param filter An interrupt filter handler. Specify NULL * to always dispatch to the ithread handler. * \param handler An interrupt ithread handler. Optional (can * specify NULL) if all necessary event actions * are performed by filter. * \param arg Argument to present to both filter and handler. * \param irqflags Interrupt handler flags. See sys/bus.h. * \param handlep Pointer to an opaque handle used to manage this * registration. * * \returns 0 on success, otherwise an errno. */ static int xen_intr_bind_isrc(struct xenisrc **isrcp, evtchn_port_t local_port, enum evtchn_type type, device_t intr_owner, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, xen_intr_handle_t *port_handlep) { struct xenisrc *isrc; int error; *isrcp = NULL; if (port_handlep == NULL) { device_printf(intr_owner, "xen_intr_bind_isrc: Bad event handle\n"); return (EINVAL); } mtx_lock(&xen_intr_isrc_lock); isrc = xen_intr_find_unused_isrc(type); if (isrc == NULL) { isrc = xen_intr_alloc_isrc(type, XEN_ALLOCATE_VECTOR); if (isrc == NULL) { mtx_unlock(&xen_intr_isrc_lock); return (ENOSPC); } } isrc->xi_port = local_port; xen_intr_port_to_isrc[local_port] = isrc; refcount_init(&isrc->xi_refcount, 1); mtx_unlock(&xen_intr_isrc_lock); /* Assign the opaque handler (the event channel port) */ *port_handlep = &isrc->xi_vector; #ifdef SMP if (type == EVTCHN_TYPE_PORT) { /* * By default all interrupts are assigned to vCPU#0 * unless specified otherwise, so shuffle them to balance * the interrupt load. */ xen_intr_assign_cpu(&isrc->xi_intsrc, intr_next_cpu()); } #endif if (filter == NULL && handler == NULL) { /* * No filter/handler provided, leave the event channel * masked and without a valid handler, the caller is * in charge of setting that up. */ *isrcp = isrc; return (0); } error = xen_intr_add_handler(intr_owner, filter, handler, arg, flags, *port_handlep); if (error != 0) { xen_intr_release_isrc(isrc); return (error); } *isrcp = isrc; return (0); } /** * Lookup a Xen interrupt source object given an interrupt binding handle. * * \param handle A handle initialized by a previous call to * xen_intr_bind_isrc(). * * \returns A pointer to the Xen interrupt source object associated * with the given interrupt handle. NULL if no association * currently exists. */ static struct xenisrc * xen_intr_isrc(xen_intr_handle_t handle) { int vector; if (handle == NULL) return (NULL); vector = *(int *)handle; KASSERT(vector >= FIRST_EVTCHN_INT && vector < (FIRST_EVTCHN_INT + xen_intr_auto_vector_count), ("Xen interrupt vector is out of range")); return ((struct xenisrc *)intr_lookup_source(vector)); } /** * Determine the event channel ports at the given section of the * event port bitmap which have pending events for the given cpu. * * \param pcpu The Xen interrupt pcpu data for the cpu being querried. * \param sh The Xen shared info area. * \param idx The index of the section of the event channel bitmap to * inspect. * * \returns A u_long with bits set for every event channel with pending * events. */ static inline u_long xen_intr_active_ports(struct xen_intr_pcpu_data *pcpu, shared_info_t *sh, u_int idx) { CTASSERT(sizeof(sh->evtchn_mask[0]) == sizeof(sh->evtchn_pending[0])); CTASSERT(sizeof(sh->evtchn_mask[0]) == sizeof(pcpu->evtchn_enabled[0])); CTASSERT(sizeof(sh->evtchn_mask) == sizeof(sh->evtchn_pending)); CTASSERT(sizeof(sh->evtchn_mask) == sizeof(pcpu->evtchn_enabled)); return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx] & pcpu->evtchn_enabled[idx]); } /** * Interrupt handler for processing all Xen event channel events. * * \param trap_frame The trap frame context for the current interrupt. */ void xen_intr_handle_upcall(struct trapframe *trap_frame) { u_int l1i, l2i, port, cpu; u_long masked_l1, masked_l2; struct xenisrc *isrc; shared_info_t *s; vcpu_info_t *v; struct xen_intr_pcpu_data *pc; u_long l1, l2; /* * Disable preemption in order to always check and fire events * on the right vCPU */ critical_enter(); cpu = PCPU_GET(cpuid); pc = DPCPU_PTR(xen_intr_pcpu); s = HYPERVISOR_shared_info; v = DPCPU_GET(vcpu_info); if (xen_hvm_domain() && !xen_vector_callback_enabled) { KASSERT((cpu == 0), ("Fired PCI event callback on wrong CPU")); } v->evtchn_upcall_pending = 0; #if 0 #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ /* Clear master flag /before/ clearing selector flag. */ wmb(); #endif #endif l1 = atomic_readandclear_long(&v->evtchn_pending_sel); l1i = pc->last_processed_l1i; l2i = pc->last_processed_l2i; (*pc->evtchn_intrcnt)++; while (l1 != 0) { l1i = (l1i + 1) % LONG_BIT; masked_l1 = l1 & ((~0UL) << l1i); if (masked_l1 == 0) { /* * if we masked out all events, wrap around * to the beginning. */ l1i = LONG_BIT - 1; l2i = LONG_BIT - 1; continue; } l1i = ffsl(masked_l1) - 1; do { l2 = xen_intr_active_ports(pc, s, l1i); l2i = (l2i + 1) % LONG_BIT; masked_l2 = l2 & ((~0UL) << l2i); if (masked_l2 == 0) { /* if we masked out all events, move on */ l2i = LONG_BIT - 1; break; } l2i = ffsl(masked_l2) - 1; /* process port */ port = (l1i * LONG_BIT) + l2i; synch_clear_bit(port, &s->evtchn_pending[0]); isrc = xen_intr_port_to_isrc[port]; if (__predict_false(isrc == NULL)) continue; /* Make sure we are firing on the right vCPU */ KASSERT((isrc->xi_cpu == PCPU_GET(cpuid)), ("Received unexpected event on vCPU#%d, event bound to vCPU#%d", PCPU_GET(cpuid), isrc->xi_cpu)); intr_execute_handlers(&isrc->xi_intsrc, trap_frame); /* * If this is the final port processed, * we'll pick up here+1 next time. */ pc->last_processed_l1i = l1i; pc->last_processed_l2i = l2i; } while (l2i != LONG_BIT - 1); l2 = xen_intr_active_ports(pc, s, l1i); if (l2 == 0) { /* * We handled all ports, so we can clear the * selector bit. */ l1 &= ~(1UL << l1i); } } critical_exit(); } static int xen_intr_init(void *dummy __unused) { shared_info_t *s = HYPERVISOR_shared_info; struct xen_intr_pcpu_data *pcpu; struct physdev_pirq_eoi_gmfn eoi_gmfn; int i, rc; if (!xen_domain()) return (0); mtx_init(&xen_intr_isrc_lock, "xen-irq-lock", NULL, MTX_DEF); /* * Register interrupt count manually as we aren't * guaranteed to see a call to xen_intr_assign_cpu() * before our first interrupt. Also set the per-cpu * mask of CPU#0 to enable all, since by default * all event channels are bound to CPU#0. */ CPU_FOREACH(i) { pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu); memset(pcpu->evtchn_enabled, i == 0 ? ~0 : 0, sizeof(pcpu->evtchn_enabled)); xen_intr_intrcnt_add(i); } for (i = 0; i < nitems(s->evtchn_mask); i++) atomic_store_rel_long(&s->evtchn_mask[i], ~0); /* Try to register PIRQ EOI map */ xen_intr_pirq_eoi_map = malloc(PAGE_SIZE, M_XENINTR, M_WAITOK | M_ZERO); eoi_gmfn.gmfn = atop(vtophys(xen_intr_pirq_eoi_map)); rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); if (rc != 0 && bootverbose) printf("Xen interrupts: unable to register PIRQ EOI map\n"); else xen_intr_pirq_eoi_map_enabled = true; intr_register_pic(&xen_intr_pic); intr_register_pic(&xen_intr_pirq_pic); if (bootverbose) printf("Xen interrupt system initialized\n"); return (0); } SYSINIT(xen_intr_init, SI_SUB_INTR, SI_ORDER_SECOND, xen_intr_init, NULL); /*--------------------------- Common PIC Functions ---------------------------*/ /** * Prepare this PIC for system suspension. */ static void xen_intr_suspend(struct pic *unused) { } static void xen_rebind_ipi(struct xenisrc *isrc) { #ifdef SMP int cpu = isrc->xi_cpu; int vcpu_id = pcpu_find(cpu)->pc_vcpu_id; int error; struct evtchn_bind_ipi bind_ipi = { .vcpu = vcpu_id }; error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi); if (error != 0) panic("unable to rebind xen IPI: %d", error); isrc->xi_port = bind_ipi.port; isrc->xi_cpu = 0; xen_intr_port_to_isrc[bind_ipi.port] = isrc; error = xen_intr_assign_cpu(&isrc->xi_intsrc, cpu_apic_ids[cpu]); if (error) panic("unable to bind xen IPI to CPU#%d: %d", cpu, error); evtchn_unmask_port(bind_ipi.port); #else panic("Resume IPI event channel on UP"); #endif } static void xen_rebind_virq(struct xenisrc *isrc) { int cpu = isrc->xi_cpu; int vcpu_id = pcpu_find(cpu)->pc_vcpu_id; int error; struct evtchn_bind_virq bind_virq = { .virq = isrc->xi_virq, .vcpu = vcpu_id }; error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq); if (error != 0) panic("unable to rebind xen VIRQ#%d: %d", isrc->xi_virq, error); isrc->xi_port = bind_virq.port; isrc->xi_cpu = 0; xen_intr_port_to_isrc[bind_virq.port] = isrc; #ifdef SMP error = xen_intr_assign_cpu(&isrc->xi_intsrc, cpu_apic_ids[cpu]); if (error) panic("unable to bind xen VIRQ#%d to CPU#%d: %d", isrc->xi_virq, cpu, error); #endif evtchn_unmask_port(bind_virq.port); } /** * Return this PIC to service after being suspended. */ static void xen_intr_resume(struct pic *unused, bool suspend_cancelled) { shared_info_t *s = HYPERVISOR_shared_info; struct xenisrc *isrc; u_int isrc_idx; int i; if (suspend_cancelled) return; /* Reset the per-CPU masks */ CPU_FOREACH(i) { struct xen_intr_pcpu_data *pcpu; pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu); memset(pcpu->evtchn_enabled, i == 0 ? ~0 : 0, sizeof(pcpu->evtchn_enabled)); } /* Mask all event channels. */ for (i = 0; i < nitems(s->evtchn_mask); i++) atomic_store_rel_long(&s->evtchn_mask[i], ~0); /* Remove port -> isrc mappings */ memset(xen_intr_port_to_isrc, 0, sizeof(xen_intr_port_to_isrc)); /* Free unused isrcs and rebind VIRQs and IPIs */ for (isrc_idx = 0; isrc_idx < xen_intr_auto_vector_count; isrc_idx++) { u_int vector; vector = FIRST_EVTCHN_INT + isrc_idx; isrc = (struct xenisrc *)intr_lookup_source(vector); if (isrc != NULL) { isrc->xi_port = 0; switch (isrc->xi_type) { case EVTCHN_TYPE_IPI: xen_rebind_ipi(isrc); break; case EVTCHN_TYPE_VIRQ: xen_rebind_virq(isrc); break; default: break; } } } } /** * Disable a Xen interrupt source. * * \param isrc The interrupt source to disable. */ static void xen_intr_disable_intr(struct intsrc *base_isrc) { struct xenisrc *isrc = (struct xenisrc *)base_isrc; evtchn_mask_port(isrc->xi_port); } /** * Determine the global interrupt vector number for * a Xen interrupt source. * * \param isrc The interrupt source to query. * * \return The vector number corresponding to the given interrupt source. */ static int xen_intr_vector(struct intsrc *base_isrc) { struct xenisrc *isrc = (struct xenisrc *)base_isrc; return (isrc->xi_vector); } /** * Determine whether or not interrupt events are pending on the * the given interrupt source. * * \param isrc The interrupt source to query. * * \returns 0 if no events are pending, otherwise non-zero. */ static int xen_intr_source_pending(struct intsrc *isrc) { /* * EventChannels are edge triggered and never masked. * There can be no pending events. */ return (0); } /** * Perform configuration of an interrupt source. * * \param isrc The interrupt source to configure. * \param trig Edge or level. * \param pol Active high or low. * * \returns 0 if no events are pending, otherwise non-zero. */ static int xen_intr_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol) { /* Configuration is only possible via the evtchn apis. */ return (ENODEV); } /** * Configure CPU affinity for interrupt source event delivery. * * \param isrc The interrupt source to configure. * \param apic_id The apic id of the CPU for handling future events. * * \returns 0 if successful, otherwise an errno. */ static int xen_intr_assign_cpu(struct intsrc *base_isrc, u_int apic_id) { #ifdef SMP struct evtchn_bind_vcpu bind_vcpu; struct xenisrc *isrc; u_int to_cpu, vcpu_id; int error, masked; if (xen_vector_callback_enabled == 0) return (EOPNOTSUPP); to_cpu = apic_cpuid(apic_id); vcpu_id = pcpu_find(to_cpu)->pc_vcpu_id; xen_intr_intrcnt_add(to_cpu); mtx_lock(&xen_intr_isrc_lock); isrc = (struct xenisrc *)base_isrc; if (!is_valid_evtchn(isrc->xi_port)) { mtx_unlock(&xen_intr_isrc_lock); return (EINVAL); } /* * Mask the event channel while binding it to prevent interrupt * delivery with an inconsistent state in isrc->xi_cpu. */ masked = evtchn_test_and_set_mask(isrc->xi_port); if ((isrc->xi_type == EVTCHN_TYPE_VIRQ) || (isrc->xi_type == EVTCHN_TYPE_IPI)) { /* * Virtual IRQs are associated with a cpu by * the Hypervisor at evtchn_bind_virq time, so * all we need to do is update the per-CPU masks. */ evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port); isrc->xi_cpu = to_cpu; evtchn_cpu_unmask_port(isrc->xi_cpu, isrc->xi_port); goto out; } bind_vcpu.port = isrc->xi_port; bind_vcpu.vcpu = vcpu_id; error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu); if (isrc->xi_cpu != to_cpu) { if (error == 0) { /* Commit to new binding by removing the old one. */ evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port); isrc->xi_cpu = to_cpu; evtchn_cpu_unmask_port(isrc->xi_cpu, isrc->xi_port); } } out: if (masked == 0) evtchn_unmask_port(isrc->xi_port); mtx_unlock(&xen_intr_isrc_lock); return (0); #else return (EOPNOTSUPP); #endif } /*------------------- Virtual Interrupt Source PIC Functions -----------------*/ /* * Mask a level triggered interrupt source. * * \param isrc The interrupt source to mask (if necessary). * \param eoi If non-zero, perform any necessary end-of-interrupt * acknowledgements. */ static void xen_intr_disable_source(struct intsrc *base_isrc, int eoi) { struct xenisrc *isrc; isrc = (struct xenisrc *)base_isrc; /* * NB: checking if the event channel is already masked is * needed because the event channel user-space device - * masks event channels on it's filter as part of it's + * masks event channels on its filter as part of its * normal operation, and those shouldn't be automatically * unmasked by the generic interrupt code. The event channel * device will unmask them when needed. */ isrc->xi_masked = !!evtchn_test_and_set_mask(isrc->xi_port); } /* * Unmask a level triggered interrupt source. * * \param isrc The interrupt source to unmask (if necessary). */ static void xen_intr_enable_source(struct intsrc *base_isrc) { struct xenisrc *isrc; isrc = (struct xenisrc *)base_isrc; if (isrc->xi_masked == 0) evtchn_unmask_port(isrc->xi_port); } /* * Perform any necessary end-of-interrupt acknowledgements. * * \param isrc The interrupt source to EOI. */ static void xen_intr_eoi_source(struct intsrc *base_isrc) { } /* * Enable and unmask the interrupt source. * * \param isrc The interrupt source to enable. */ static void xen_intr_enable_intr(struct intsrc *base_isrc) { struct xenisrc *isrc = (struct xenisrc *)base_isrc; evtchn_unmask_port(isrc->xi_port); } /*------------------ Physical Interrupt Source PIC Functions -----------------*/ /* * Mask a level triggered interrupt source. * * \param isrc The interrupt source to mask (if necessary). * \param eoi If non-zero, perform any necessary end-of-interrupt * acknowledgements. */ static void xen_intr_pirq_disable_source(struct intsrc *base_isrc, int eoi) { struct xenisrc *isrc; isrc = (struct xenisrc *)base_isrc; if (isrc->xi_edgetrigger == 0) evtchn_mask_port(isrc->xi_port); if (eoi == PIC_EOI) xen_intr_pirq_eoi_source(base_isrc); } /* * Unmask a level triggered interrupt source. * * \param isrc The interrupt source to unmask (if necessary). */ static void xen_intr_pirq_enable_source(struct intsrc *base_isrc) { struct xenisrc *isrc; isrc = (struct xenisrc *)base_isrc; if (isrc->xi_edgetrigger == 0) evtchn_unmask_port(isrc->xi_port); } /* * Perform any necessary end-of-interrupt acknowledgements. * * \param isrc The interrupt source to EOI. */ static void xen_intr_pirq_eoi_source(struct intsrc *base_isrc) { struct xenisrc *isrc; int error; isrc = (struct xenisrc *)base_isrc; if (xen_test_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map)) { struct physdev_eoi eoi = { .irq = isrc->xi_pirq }; error = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); if (error != 0) panic("Unable to EOI PIRQ#%d: %d\n", isrc->xi_pirq, error); } } /* * Enable and unmask the interrupt source. * * \param isrc The interrupt source to enable. */ static void xen_intr_pirq_enable_intr(struct intsrc *base_isrc) { struct xenisrc *isrc; struct evtchn_bind_pirq bind_pirq; struct physdev_irq_status_query irq_status; int error; isrc = (struct xenisrc *)base_isrc; if (!xen_intr_pirq_eoi_map_enabled) { irq_status.irq = isrc->xi_pirq; error = HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status); if (error) panic("unable to get status of IRQ#%d", isrc->xi_pirq); if (irq_status.flags & XENIRQSTAT_needs_eoi) { /* * Since the dynamic PIRQ EOI map is not available * mark the PIRQ as needing EOI unconditionally. */ xen_set_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map); } } bind_pirq.pirq = isrc->xi_pirq; bind_pirq.flags = isrc->xi_edgetrigger ? 0 : BIND_PIRQ__WILL_SHARE; error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); if (error) panic("unable to bind IRQ#%d", isrc->xi_pirq); isrc->xi_port = bind_pirq.port; mtx_lock(&xen_intr_isrc_lock); KASSERT((xen_intr_port_to_isrc[bind_pirq.port] == NULL), ("trying to override an already setup event channel port")); xen_intr_port_to_isrc[bind_pirq.port] = isrc; mtx_unlock(&xen_intr_isrc_lock); evtchn_unmask_port(isrc->xi_port); } /* * Disable an interrupt source. * * \param isrc The interrupt source to disable. */ static void xen_intr_pirq_disable_intr(struct intsrc *base_isrc) { struct xenisrc *isrc; struct evtchn_close close; int error; isrc = (struct xenisrc *)base_isrc; evtchn_mask_port(isrc->xi_port); close.port = isrc->xi_port; error = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close); if (error) panic("unable to close event channel %d IRQ#%d", isrc->xi_port, isrc->xi_pirq); mtx_lock(&xen_intr_isrc_lock); xen_intr_port_to_isrc[isrc->xi_port] = NULL; mtx_unlock(&xen_intr_isrc_lock); isrc->xi_port = 0; } /** * Perform configuration of an interrupt source. * * \param isrc The interrupt source to configure. * \param trig Edge or level. * \param pol Active high or low. * * \returns 0 if no events are pending, otherwise non-zero. */ static int xen_intr_pirq_config_intr(struct intsrc *base_isrc, enum intr_trigger trig, enum intr_polarity pol) { struct xenisrc *isrc = (struct xenisrc *)base_isrc; struct physdev_setup_gsi setup_gsi; int error; KASSERT(!(trig == INTR_TRIGGER_CONFORM || pol == INTR_POLARITY_CONFORM), ("%s: Conforming trigger or polarity\n", __func__)); setup_gsi.gsi = isrc->xi_pirq; setup_gsi.triggering = trig == INTR_TRIGGER_EDGE ? 0 : 1; setup_gsi.polarity = pol == INTR_POLARITY_HIGH ? 0 : 1; error = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi); if (error == -XEN_EEXIST) { if ((isrc->xi_edgetrigger && (trig != INTR_TRIGGER_EDGE)) || (isrc->xi_activehi && (pol != INTR_POLARITY_HIGH))) panic("unable to reconfigure interrupt IRQ#%d", isrc->xi_pirq); error = 0; } if (error) panic("unable to configure IRQ#%d\n", isrc->xi_pirq); isrc->xi_activehi = pol == INTR_POLARITY_HIGH ? 1 : 0; isrc->xi_edgetrigger = trig == INTR_TRIGGER_EDGE ? 1 : 0; return (0); } /*--------------------------- Public Functions -------------------------------*/ /*------- API comments for these methods can be found in xen/xenintr.h -------*/ int xen_intr_bind_local_port(device_t dev, evtchn_port_t local_port, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, xen_intr_handle_t *port_handlep) { struct xenisrc *isrc; int error; error = xen_intr_bind_isrc(&isrc, local_port, EVTCHN_TYPE_PORT, dev, filter, handler, arg, flags, port_handlep); if (error != 0) return (error); /* * The Event Channel API didn't open this port, so it is not * responsible for closing it automatically on unbind. */ isrc->xi_close = 0; return (0); } int xen_intr_alloc_and_bind_local_port(device_t dev, u_int remote_domain, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, xen_intr_handle_t *port_handlep) { struct xenisrc *isrc; struct evtchn_alloc_unbound alloc_unbound; int error; alloc_unbound.dom = DOMID_SELF; alloc_unbound.remote_dom = remote_domain; error = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &alloc_unbound); if (error != 0) { /* * XXX Trap Hypercall error code Linuxisms in * the HYPERCALL layer. */ return (-error); } error = xen_intr_bind_isrc(&isrc, alloc_unbound.port, EVTCHN_TYPE_PORT, dev, filter, handler, arg, flags, port_handlep); if (error != 0) { evtchn_close_t close = { .port = alloc_unbound.port }; if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) panic("EVTCHNOP_close failed"); return (error); } isrc->xi_close = 1; return (0); } int xen_intr_bind_remote_port(device_t dev, u_int remote_domain, u_int remote_port, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, xen_intr_handle_t *port_handlep) { struct xenisrc *isrc; struct evtchn_bind_interdomain bind_interdomain; int error; bind_interdomain.remote_dom = remote_domain; bind_interdomain.remote_port = remote_port; error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, &bind_interdomain); if (error != 0) { /* * XXX Trap Hypercall error code Linuxisms in * the HYPERCALL layer. */ return (-error); } error = xen_intr_bind_isrc(&isrc, bind_interdomain.local_port, EVTCHN_TYPE_PORT, dev, filter, handler, arg, flags, port_handlep); if (error) { evtchn_close_t close = { .port = bind_interdomain.local_port }; if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) panic("EVTCHNOP_close failed"); return (error); } /* * The Event Channel API opened this port, so it is * responsible for closing it automatically on unbind. */ isrc->xi_close = 1; return (0); } int xen_intr_bind_virq(device_t dev, u_int virq, u_int cpu, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, xen_intr_handle_t *port_handlep) { int vcpu_id = pcpu_find(cpu)->pc_vcpu_id; struct xenisrc *isrc; struct evtchn_bind_virq bind_virq = { .virq = virq, .vcpu = vcpu_id }; int error; /* Ensure the target CPU is ready to handle evtchn interrupts. */ xen_intr_intrcnt_add(cpu); isrc = NULL; error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq); if (error != 0) { /* * XXX Trap Hypercall error code Linuxisms in * the HYPERCALL layer. */ return (-error); } error = xen_intr_bind_isrc(&isrc, bind_virq.port, EVTCHN_TYPE_VIRQ, dev, filter, handler, arg, flags, port_handlep); #ifdef SMP if (error == 0) error = intr_event_bind(isrc->xi_intsrc.is_event, cpu); #endif if (error != 0) { evtchn_close_t close = { .port = bind_virq.port }; xen_intr_unbind(*port_handlep); if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) panic("EVTCHNOP_close failed"); return (error); } #ifdef SMP if (isrc->xi_cpu != cpu) { /* * Too early in the boot process for the generic interrupt * code to perform the binding. Update our event channel * masks manually so events can't fire on the wrong cpu * during AP startup. */ xen_intr_assign_cpu(&isrc->xi_intsrc, cpu_apic_ids[cpu]); } #endif /* * The Event Channel API opened this port, so it is * responsible for closing it automatically on unbind. */ isrc->xi_close = 1; isrc->xi_virq = virq; return (0); } int xen_intr_alloc_and_bind_ipi(device_t dev, u_int cpu, driver_filter_t filter, enum intr_type flags, xen_intr_handle_t *port_handlep) { #ifdef SMP int vcpu_id = pcpu_find(cpu)->pc_vcpu_id; struct xenisrc *isrc; struct evtchn_bind_ipi bind_ipi = { .vcpu = vcpu_id }; int error; /* Ensure the target CPU is ready to handle evtchn interrupts. */ xen_intr_intrcnt_add(cpu); isrc = NULL; error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi); if (error != 0) { /* * XXX Trap Hypercall error code Linuxisms in * the HYPERCALL layer. */ return (-error); } error = xen_intr_bind_isrc(&isrc, bind_ipi.port, EVTCHN_TYPE_IPI, dev, filter, NULL, NULL, flags, port_handlep); if (error == 0) error = intr_event_bind(isrc->xi_intsrc.is_event, cpu); if (error != 0) { evtchn_close_t close = { .port = bind_ipi.port }; xen_intr_unbind(*port_handlep); if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) panic("EVTCHNOP_close failed"); return (error); } if (isrc->xi_cpu != cpu) { /* * Too early in the boot process for the generic interrupt * code to perform the binding. Update our event channel * masks manually so events can't fire on the wrong cpu * during AP startup. */ xen_intr_assign_cpu(&isrc->xi_intsrc, cpu_apic_ids[cpu]); } /* * The Event Channel API opened this port, so it is * responsible for closing it automatically on unbind. */ isrc->xi_close = 1; return (0); #else return (EOPNOTSUPP); #endif } int xen_register_pirq(int vector, enum intr_trigger trig, enum intr_polarity pol) { struct physdev_map_pirq map_pirq; struct xenisrc *isrc; int error; if (vector == 0) return (EINVAL); if (bootverbose) printf("xen: register IRQ#%d\n", vector); map_pirq.domid = DOMID_SELF; map_pirq.type = MAP_PIRQ_TYPE_GSI; map_pirq.index = vector; map_pirq.pirq = vector; error = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_pirq); if (error) { printf("xen: unable to map IRQ#%d\n", vector); return (error); } mtx_lock(&xen_intr_isrc_lock); isrc = xen_intr_alloc_isrc(EVTCHN_TYPE_PIRQ, vector); mtx_unlock(&xen_intr_isrc_lock); KASSERT((isrc != NULL), ("xen: unable to allocate isrc for interrupt")); isrc->xi_pirq = vector; isrc->xi_activehi = pol == INTR_POLARITY_HIGH ? 1 : 0; isrc->xi_edgetrigger = trig == INTR_TRIGGER_EDGE ? 1 : 0; return (0); } int xen_register_msi(device_t dev, int vector, int count) { struct physdev_map_pirq msi_irq; struct xenisrc *isrc; int ret; memset(&msi_irq, 0, sizeof(msi_irq)); msi_irq.domid = DOMID_SELF; msi_irq.type = count == 1 ? MAP_PIRQ_TYPE_MSI_SEG : MAP_PIRQ_TYPE_MULTI_MSI; msi_irq.index = -1; msi_irq.pirq = -1; msi_irq.bus = pci_get_bus(dev) | (pci_get_domain(dev) << 16); msi_irq.devfn = (pci_get_slot(dev) << 3) | pci_get_function(dev); msi_irq.entry_nr = count; ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &msi_irq); if (ret != 0) return (ret); if (count != msi_irq.entry_nr) { panic("unable to setup all requested MSI vectors " "(expected %d got %d)", count, msi_irq.entry_nr); } mtx_lock(&xen_intr_isrc_lock); for (int i = 0; i < count; i++) { isrc = xen_intr_alloc_isrc(EVTCHN_TYPE_PIRQ, vector + i); KASSERT(isrc != NULL, ("xen: unable to allocate isrc for interrupt")); isrc->xi_pirq = msi_irq.pirq + i; /* MSI interrupts are always edge triggered */ isrc->xi_edgetrigger = 1; } mtx_unlock(&xen_intr_isrc_lock); return (0); } int xen_release_msi(int vector) { struct physdev_unmap_pirq unmap; struct xenisrc *isrc; int ret; isrc = (struct xenisrc *)intr_lookup_source(vector); if (isrc == NULL) return (ENXIO); unmap.pirq = isrc->xi_pirq; ret = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap); if (ret != 0) return (ret); xen_intr_release_isrc(isrc); return (0); } int xen_intr_describe(xen_intr_handle_t port_handle, const char *fmt, ...) { char descr[MAXCOMLEN + 1]; struct xenisrc *isrc; va_list ap; isrc = xen_intr_isrc(port_handle); if (isrc == NULL) return (EINVAL); va_start(ap, fmt); vsnprintf(descr, sizeof(descr), fmt, ap); va_end(ap); return (intr_describe(isrc->xi_vector, isrc->xi_cookie, descr)); } void xen_intr_unbind(xen_intr_handle_t *port_handlep) { struct xenisrc *isrc; KASSERT(port_handlep != NULL, ("NULL xen_intr_handle_t passed to xen_intr_unbind")); isrc = xen_intr_isrc(*port_handlep); *port_handlep = NULL; if (isrc == NULL) return; mtx_lock(&xen_intr_isrc_lock); if (refcount_release(&isrc->xi_refcount) == 0) { mtx_unlock(&xen_intr_isrc_lock); return; } mtx_unlock(&xen_intr_isrc_lock); if (isrc->xi_cookie != NULL) intr_remove_handler(isrc->xi_cookie); xen_intr_release_isrc(isrc); } void xen_intr_signal(xen_intr_handle_t handle) { struct xenisrc *isrc; isrc = xen_intr_isrc(handle); if (isrc != NULL) { KASSERT(isrc->xi_type == EVTCHN_TYPE_PORT || isrc->xi_type == EVTCHN_TYPE_IPI, ("evtchn_signal on something other than a local port")); struct evtchn_send send = { .port = isrc->xi_port }; (void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send); } } evtchn_port_t xen_intr_port(xen_intr_handle_t handle) { struct xenisrc *isrc; isrc = xen_intr_isrc(handle); if (isrc == NULL) return (0); return (isrc->xi_port); } int xen_intr_add_handler(device_t dev, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, xen_intr_handle_t handle) { struct xenisrc *isrc; int error; isrc = xen_intr_isrc(handle); if (isrc == NULL || isrc->xi_cookie != NULL) return (EINVAL); error = intr_add_handler(device_get_nameunit(dev), isrc->xi_vector, filter, handler, arg, flags|INTR_EXCL, &isrc->xi_cookie); if (error != 0) { device_printf(dev, "xen_intr_add_handler: intr_add_handler failed: %d\n", error); } return (error); } int xen_intr_get_evtchn_from_port(evtchn_port_t port, xen_intr_handle_t *handlep) { if (!is_valid_evtchn(port) || port >= NR_EVENT_CHANNELS) return (EINVAL); if (handlep == NULL) { return (EINVAL); } mtx_lock(&xen_intr_isrc_lock); if (xen_intr_port_to_isrc[port] == NULL) { mtx_unlock(&xen_intr_isrc_lock); return (EINVAL); } refcount_acquire(&xen_intr_port_to_isrc[port]->xi_refcount); mtx_unlock(&xen_intr_isrc_lock); /* Assign the opaque handler (the event channel port) */ *handlep = &xen_intr_port_to_isrc[port]->xi_vector; return (0); } #ifdef DDB static const char * xen_intr_print_type(enum evtchn_type type) { static const char *evtchn_type_to_string[EVTCHN_TYPE_COUNT] = { [EVTCHN_TYPE_UNBOUND] = "UNBOUND", [EVTCHN_TYPE_PIRQ] = "PIRQ", [EVTCHN_TYPE_VIRQ] = "VIRQ", [EVTCHN_TYPE_IPI] = "IPI", [EVTCHN_TYPE_PORT] = "PORT", }; if (type >= EVTCHN_TYPE_COUNT) return ("UNKNOWN"); return (evtchn_type_to_string[type]); } static void xen_intr_dump_port(struct xenisrc *isrc) { struct xen_intr_pcpu_data *pcpu; shared_info_t *s = HYPERVISOR_shared_info; int i; db_printf("Port %d Type: %s\n", isrc->xi_port, xen_intr_print_type(isrc->xi_type)); if (isrc->xi_type == EVTCHN_TYPE_PIRQ) { db_printf("\tPirq: %d ActiveHi: %d EdgeTrigger: %d " "NeedsEOI: %d\n", isrc->xi_pirq, isrc->xi_activehi, isrc->xi_edgetrigger, !!xen_test_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map)); } if (isrc->xi_type == EVTCHN_TYPE_VIRQ) db_printf("\tVirq: %d\n", isrc->xi_virq); db_printf("\tMasked: %d Pending: %d\n", !!xen_test_bit(isrc->xi_port, &s->evtchn_mask[0]), !!xen_test_bit(isrc->xi_port, &s->evtchn_pending[0])); db_printf("\tPer-CPU Masks: "); CPU_FOREACH(i) { pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu); db_printf("cpu#%d: %d ", i, !!xen_test_bit(isrc->xi_port, pcpu->evtchn_enabled)); } db_printf("\n"); } DB_SHOW_COMMAND(xen_evtchn, db_show_xen_evtchn) { int i; if (!xen_domain()) { db_printf("Only available on Xen guests\n"); return; } for (i = 0; i < NR_EVENT_CHANNELS; i++) { struct xenisrc *isrc; isrc = xen_intr_port_to_isrc[i]; if (isrc == NULL) continue; xen_intr_dump_port(isrc); } } #endif /* DDB */ Index: head/usr.bin/du/du.c =================================================================== --- head/usr.bin/du/du.c (revision 308456) +++ head/usr.bin/du/du.c (revision 308457) @@ -1,544 +1,544 @@ /* * Copyright (c) 1989, 1993, 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Chris Newcomb. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1989, 1993, 1994\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #ifndef lint #if 0 static const char sccsid[] = "@(#)du.c 8.5 (Berkeley) 5/4/95"; #endif #endif /* not lint */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static SLIST_HEAD(ignhead, ignentry) ignores; struct ignentry { char *mask; SLIST_ENTRY(ignentry) next; }; static int linkchk(FTSENT *); static void usage(void); static void prthumanval(int64_t); static void ignoreadd(const char *); static void ignoreclean(void); static int ignorep(FTSENT *); static void siginfo(int __unused); static int nodumpflag = 0; static int Aflag; static long blocksize, cblocksize; static volatile sig_atomic_t info; int main(int argc, char *argv[]) { FTS *fts; FTSENT *p; off_t savednumber, curblocks; off_t threshold, threshold_sign; int ftsoptions; int depth; int Hflag, Lflag, aflag, sflag, dflag, cflag; int hflag, lflag, ch, notused, rval; char **save; static char dot[] = "."; setlocale(LC_ALL, ""); Hflag = Lflag = aflag = sflag = dflag = cflag = hflag = lflag = Aflag = 0; save = argv; ftsoptions = FTS_PHYSICAL; savednumber = 0; threshold = 0; threshold_sign = 1; cblocksize = DEV_BSIZE; blocksize = 0; depth = INT_MAX; SLIST_INIT(&ignores); while ((ch = getopt(argc, argv, "AB:HI:LPasd:cghklmnrt:x")) != -1) switch (ch) { case 'A': Aflag = 1; break; case 'B': errno = 0; cblocksize = atoi(optarg); if (errno == ERANGE || cblocksize <= 0) { warnx("invalid argument to option B: %s", optarg); usage(); } break; case 'H': Hflag = 1; Lflag = 0; break; case 'I': ignoreadd(optarg); break; case 'L': Lflag = 1; Hflag = 0; break; case 'P': Hflag = Lflag = 0; break; case 'a': aflag = 1; break; case 's': sflag = 1; break; case 'd': dflag = 1; errno = 0; depth = atoi(optarg); if (errno == ERANGE || depth < 0) { warnx("invalid argument to option d: %s", optarg); usage(); } break; case 'c': cflag = 1; break; case 'g': hflag = 0; blocksize = 1073741824; break; case 'h': hflag = 1; break; case 'k': hflag = 0; blocksize = 1024; break; case 'l': lflag = 1; break; case 'm': hflag = 0; blocksize = 1048576; break; case 'n': nodumpflag = 1; break; case 'r': /* Compatibility. */ break; case 't' : if (expand_number(optarg, &threshold) != 0 || threshold == 0) { warnx("invalid threshold: %s", optarg); usage(); } else if (threshold < 0) threshold_sign = -1; break; case 'x': ftsoptions |= FTS_XDEV; break; case '?': default: usage(); /* NOTREACHED */ } argc -= optind; argv += optind; /* * XXX * Because of the way that fts(3) works, logical walks will not count * the blocks actually used by symbolic links. We rationalize this by * noting that users computing logical sizes are likely to do logical * copies, so not counting the links is correct. The real reason is * that we'd have to re-implement the kernel's symbolic link traversing * algorithm to get this right. If, for example, you have relative * symbolic links referencing other relative symbolic links, it gets * very nasty, very fast. The bottom line is that it's documented in * the man page, so it's a feature. */ if (Hflag) ftsoptions |= FTS_COMFOLLOW; if (Lflag) { ftsoptions &= ~FTS_PHYSICAL; ftsoptions |= FTS_LOGICAL; } if (!Aflag && (cblocksize % DEV_BSIZE) != 0) cblocksize = howmany(cblocksize, DEV_BSIZE) * DEV_BSIZE; if (aflag + dflag + sflag > 1) usage(); if (sflag) depth = 0; if (!*argv) { argv = save; argv[0] = dot; argv[1] = NULL; } if (blocksize == 0) (void)getbsize(¬used, &blocksize); if (!Aflag) { cblocksize /= DEV_BSIZE; blocksize /= DEV_BSIZE; } if (threshold != 0) threshold = howmany(threshold / DEV_BSIZE * cblocksize, blocksize); rval = 0; (void)signal(SIGINFO, siginfo); if ((fts = fts_open(argv, ftsoptions, NULL)) == NULL) err(1, "fts_open"); while ((p = fts_read(fts)) != NULL) { switch (p->fts_info) { case FTS_D: /* Ignore. */ if (ignorep(p)) fts_set(fts, p, FTS_SKIP); break; case FTS_DP: if (ignorep(p)) break; curblocks = Aflag ? howmany(p->fts_statp->st_size, cblocksize) : howmany(p->fts_statp->st_blocks, cblocksize); p->fts_parent->fts_bignum += p->fts_bignum += curblocks; if (p->fts_level <= depth && threshold <= threshold_sign * howmany(p->fts_bignum * cblocksize, blocksize)) { if (hflag) { prthumanval(p->fts_bignum); (void)printf("\t%s\n", p->fts_path); } else { (void)printf("%jd\t%s\n", (intmax_t)howmany(p->fts_bignum * cblocksize, blocksize), p->fts_path); } } if (info) { info = 0; (void)printf("\t%s\n", p->fts_path); } break; case FTS_DC: /* Ignore. */ break; case FTS_DNR: /* Warn, continue. */ case FTS_ERR: case FTS_NS: warnx("%s: %s", p->fts_path, strerror(p->fts_errno)); rval = 1; break; default: if (ignorep(p)) break; if (lflag == 0 && p->fts_statp->st_nlink > 1 && linkchk(p)) break; curblocks = Aflag ? howmany(p->fts_statp->st_size, cblocksize) : howmany(p->fts_statp->st_blocks, cblocksize); if (aflag || p->fts_level == 0) { if (hflag) { prthumanval(curblocks); (void)printf("\t%s\n", p->fts_path); } else { (void)printf("%jd\t%s\n", (intmax_t)howmany(curblocks * cblocksize, blocksize), p->fts_path); } } p->fts_parent->fts_bignum += curblocks; } savednumber = p->fts_parent->fts_bignum; } if (errno) err(1, "fts_read"); if (cflag) { if (hflag) { prthumanval(savednumber); (void)printf("\ttotal\n"); } else { (void)printf("%jd\ttotal\n", (intmax_t)howmany( savednumber * cblocksize, blocksize)); } } ignoreclean(); exit(rval); } static int linkchk(FTSENT *p) { struct links_entry { struct links_entry *next; struct links_entry *previous; int links; dev_t dev; ino_t ino; }; static const size_t links_hash_initial_size = 8192; static struct links_entry **buckets; static struct links_entry *free_list; static size_t number_buckets; static unsigned long number_entries; static char stop_allocating; struct links_entry *le, **new_buckets; struct stat *st; size_t i, new_size; int hash; st = p->fts_statp; /* If necessary, initialize the hash table. */ if (buckets == NULL) { number_buckets = links_hash_initial_size; buckets = malloc(number_buckets * sizeof(buckets[0])); if (buckets == NULL) errx(1, "No memory for hardlink detection"); for (i = 0; i < number_buckets; i++) buckets[i] = NULL; } /* If the hash table is getting too full, enlarge it. */ if (number_entries > number_buckets * 10 && !stop_allocating) { new_size = number_buckets * 2; new_buckets = calloc(new_size, sizeof(struct links_entry *)); /* Try releasing the free list to see if that helps. */ if (new_buckets == NULL && free_list != NULL) { while (free_list != NULL) { le = free_list; free_list = le->next; free(le); } new_buckets = calloc(new_size, sizeof(new_buckets[0])); } if (new_buckets == NULL) { stop_allocating = 1; warnx("No more memory for tracking hard links"); } else { for (i = 0; i < number_buckets; i++) { while (buckets[i] != NULL) { /* Remove entry from old bucket. */ le = buckets[i]; buckets[i] = le->next; /* Add entry to new bucket. */ hash = (le->dev ^ le->ino) % new_size; if (new_buckets[hash] != NULL) new_buckets[hash]->previous = le; le->next = new_buckets[hash]; le->previous = NULL; new_buckets[hash] = le; } } free(buckets); buckets = new_buckets; number_buckets = new_size; } } /* Try to locate this entry in the hash table. */ hash = ( st->st_dev ^ st->st_ino ) % number_buckets; for (le = buckets[hash]; le != NULL; le = le->next) { if (le->dev == st->st_dev && le->ino == st->st_ino) { /* * Save memory by releasing an entry when we've seen - * all of it's links. + * all of its links. */ if (--le->links <= 0) { if (le->previous != NULL) le->previous->next = le->next; if (le->next != NULL) le->next->previous = le->previous; if (buckets[hash] == le) buckets[hash] = le->next; number_entries--; /* Recycle this node through the free list */ if (stop_allocating) { free(le); } else { le->next = free_list; free_list = le; } } return (1); } } if (stop_allocating) return (0); /* Add this entry to the links cache. */ if (free_list != NULL) { /* Pull a node from the free list if we can. */ le = free_list; free_list = le->next; } else /* Malloc one if we have to. */ le = malloc(sizeof(struct links_entry)); if (le == NULL) { stop_allocating = 1; warnx("No more memory for tracking hard links"); return (0); } le->dev = st->st_dev; le->ino = st->st_ino; le->links = st->st_nlink - 1; number_entries++; le->next = buckets[hash]; le->previous = NULL; if (buckets[hash] != NULL) buckets[hash]->previous = le; buckets[hash] = le; return (0); } static void prthumanval(int64_t bytes) { char buf[5]; bytes *= cblocksize; if (!Aflag) bytes *= DEV_BSIZE; humanize_number(buf, sizeof(buf), bytes, "", HN_AUTOSCALE, HN_B | HN_NOSPACE | HN_DECIMAL); (void)printf("%4s", buf); } static void usage(void) { (void)fprintf(stderr, "usage: du [-Aclnx] [-H | -L | -P] [-g | -h | -k | -m] " "[-a | -s | -d depth] [-B blocksize] [-I mask] " "[-t threshold] [file ...]\n"); exit(EX_USAGE); } static void ignoreadd(const char *mask) { struct ignentry *ign; ign = calloc(1, sizeof(*ign)); if (ign == NULL) errx(1, "cannot allocate memory"); ign->mask = strdup(mask); if (ign->mask == NULL) errx(1, "cannot allocate memory"); SLIST_INSERT_HEAD(&ignores, ign, next); } static void ignoreclean(void) { struct ignentry *ign; while (!SLIST_EMPTY(&ignores)) { ign = SLIST_FIRST(&ignores); SLIST_REMOVE_HEAD(&ignores, next); free(ign->mask); free(ign); } } static int ignorep(FTSENT *ent) { struct ignentry *ign; if (nodumpflag && (ent->fts_statp->st_flags & UF_NODUMP)) return 1; SLIST_FOREACH(ign, &ignores, next) if (fnmatch(ign->mask, ent->fts_name, 0) != FNM_NOMATCH) return 1; return 0; } static void siginfo(int sig __unused) { info = 1; } Index: head/usr.bin/xlint/lint2/read.c =================================================================== --- head/usr.bin/xlint/lint2/read.c (revision 308456) +++ head/usr.bin/xlint/lint2/read.c (revision 308457) @@ -1,1245 +1,1245 @@ /* $NetBSD: read.c,v 1.19 2007/09/28 21:53:50 uwe Exp $ */ /* * Copyright (c) 1996 Christopher G. Demetriou. All Rights Reserved. * Copyright (c) 1994, 1995 Jochen Pohl * All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Jochen Pohl for * The NetBSD Project. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #if defined(__RCSID) && !defined(lint) __RCSID("$NetBSD: read.c,v 1.19 2007/09/28 21:53:50 uwe Exp $"); #endif __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include "lint2.h" /* index of current (included) source file */ static int srcfile; /* * The array pointed to by inpfns maps the file name indices of input files * to the file name indices used in lint2 */ static short *inpfns; static size_t ninpfns; /* * The array pointed to by *fnames maps file name indizes to file names. * Indices of type short are used instead of pointers to save memory. */ const char **fnames; static size_t nfnames; /* * Types are shared (to save memory for the types itself) and accessed * via indices (to save memory for references to types (indices are short)). * To share types, an equal type must be located fast. This is done by a * hash table. Access by indices is done via an array of pointers to the * types. */ typedef struct thtab { const char *th_name; u_short th_idx; struct thtab *th_nxt; } thtab_t; static thtab_t **thtab; /* hash table */ type_t **tlst; /* array for indexed access */ static size_t tlstlen; /* length of tlst */ static hte_t **renametab; /* index of current C source file (as spezified at the command line) */ static int csrcfile; #define inperr() inperror(__FILE__, __LINE__) static void inperror(const char *, size_t); static void setsrc(const char *); static void setfnid(int, const char *); static void funccall(pos_t *, const char *); static void decldef(pos_t *, const char *); static void usedsym(pos_t *, const char *); static u_short inptype(const char *, const char **); static int gettlen(const char *, const char **); static u_short findtype(const char *, size_t, int); static u_short storetyp(type_t *, const char *, size_t, int); static int thash(const char *, size_t); static char *inpqstrg(const char *, const char **); static const char *inpname(const char *, const char **); static int getfnidx(const char *); void readfile(const char *name) { FILE *inp; size_t len; const char *cp; char *line, *eptr, rt = '\0'; int cline, isrc, iline; pos_t pos; if (inpfns == NULL) if ((inpfns = calloc(ninpfns = 128, sizeof (short))) == NULL) nomem(); if (fnames == NULL) if ((fnames = calloc(nfnames = 256, sizeof (char *))) == NULL) nomem(); if (tlstlen == 0) if ((tlst = calloc(tlstlen = 256, sizeof (type_t *))) == NULL) nomem(); if (thtab == NULL) if ((thtab = calloc(THSHSIZ2, sizeof (thtab_t))) == NULL) nomem(); _inithash(&renametab); srcfile = getfnidx(name); if ((inp = fopen(name, "r")) == NULL) err(1, "cannot open %s", name); while ((line = fgetln(inp, &len)) != NULL) { if (len == 0 || line[len - 1] != '\n') inperr(); line[len - 1] = '\0'; cp = line; /* line number in csrcfile */ cline = (int)strtol(cp, &eptr, 10); if (cp == eptr) { cline = -1; } else { cp = eptr; } /* record type */ if (*cp != '\0') { rt = *cp++; } else { inperr(); } if (rt == 'S') { setsrc(cp); continue; } else if (rt == 's') { setfnid(cline, cp); continue; } /* * Index of (included) source file. If this index is * different from csrcfile, it refers to an included * file. */ isrc = (int)strtol(cp, &eptr, 10); if (cp == eptr) inperr(); cp = eptr; isrc = inpfns[isrc]; /* line number in isrc */ if (*cp++ != '.') inperr(); iline = (int)strtol(cp, &eptr, 10); if (cp == eptr) inperr(); cp = eptr; pos.p_src = (u_short)csrcfile; pos.p_line = (u_short)cline; pos.p_isrc = (u_short)isrc; pos.p_iline = (u_short)iline; /* process rest of this record */ switch (rt) { case 'c': funccall(&pos, cp); break; case 'd': decldef(&pos, cp); break; case 'u': usedsym(&pos, cp); break; default: inperr(); } } _destroyhash(renametab); if (ferror(inp)) err(1, "read error on %s", name); (void)fclose(inp); } static void inperror(const char *file, size_t line) { errx(1, "%s,%zd: input file error: %s", file, line, fnames[srcfile]); } /* * Set the name of the C source file of the .ln file which is * currently read. */ static void setsrc(const char *cp) { csrcfile = getfnidx(cp); } /* * setfnid() gets as input an index as used in an input file and the * associated file name. If necessary, it creates a new lint2 file * name index for this file name and creates the mapping of the index * as used in the input file to the index used in lint2. */ static void setfnid(int fid, const char *cp) { if (fid == -1) inperr(); if (fid >= ninpfns) { if ((inpfns = realloc(inpfns, (ninpfns * 2) * sizeof (short))) == NULL) nomem(); (void)memset(inpfns + ninpfns, 0, ninpfns * sizeof (short)); ninpfns *= 2; } /* * Should always be true because indices written in the output * file by lint1 are always the previous index + 1. */ if (fid >= ninpfns) errx(1, "internal error: setfnid()"); inpfns[fid] = (u_short)getfnidx(cp); } /* * Process a function call record (c-record). */ static void funccall(pos_t *posp, const char *cp) { arginf_t *ai, **lai; char c, *eptr; int rused, rdisc; hte_t *hte; fcall_t *fcall; const char *name; fcall = xalloc(sizeof (fcall_t)); STRUCT_ASSIGN(fcall->f_pos, *posp); /* read flags */ rused = rdisc = 0; lai = &fcall->f_args; while ((c = *cp) == 'u' || c == 'i' || c == 'd' || c == 'z' || c == 'p' || c == 'n' || c == 's') { cp++; switch (c) { case 'u': if (rused || rdisc) inperr(); rused = 1; break; case 'i': if (rused || rdisc) inperr(); break; case 'd': if (rused || rdisc) inperr(); rdisc = 1; break; case 'z': case 'p': case 'n': case 's': ai = xalloc(sizeof (arginf_t)); ai->a_num = (int)strtol(cp, &eptr, 10); if (cp == eptr) inperr(); cp = eptr; if (c == 'z') { ai->a_pcon = ai->a_zero = 1; } else if (c == 'p') { ai->a_pcon = 1; } else if (c == 'n') { ai->a_ncon = 1; } else { ai->a_fmt = 1; ai->a_fstrg = inpqstrg(cp, &cp); } *lai = ai; lai = &ai->a_nxt; break; } } fcall->f_rused = rused; fcall->f_rdisc = rdisc; /* read name of function */ name = inpname(cp, &cp); /* first look it up in the renaming table, then in the normal table */ hte = _hsearch(renametab, name, 0); if (hte != NULL) hte = hte->h_hte; else hte = hsearch(name, 1); hte->h_used = 1; fcall->f_type = inptype(cp, &cp); *hte->h_lcall = fcall; hte->h_lcall = &fcall->f_nxt; if (*cp != '\0') inperr(); } /* * Process a declaration or definition (d-record). */ static void decldef(pos_t *posp, const char *cp) { sym_t *symp, sym; char c, *ep, *pos1; int used, renamed; hte_t *hte, *renamehte = NULL; const char *name, *rename; (void)memset(&sym, 0, sizeof (sym)); STRUCT_ASSIGN(sym.s_pos, *posp); sym.s_def = NODECL; used = 0; while (strchr("tdeurosvPS", (c = *cp)) != NULL) { cp++; switch (c) { case 't': if (sym.s_def != NODECL) inperr(); sym.s_def = TDEF; break; case 'd': if (sym.s_def != NODECL) inperr(); sym.s_def = DEF; break; case 'e': if (sym.s_def != NODECL) inperr(); sym.s_def = DECL; break; case 'u': if (used) inperr(); used = 1; break; case 'r': if (sym.s_rval) inperr(); sym.s_rval = 1; break; case 'o': if (sym.s_osdef) inperr(); sym.s_osdef = 1; break; case 's': if (sym.s_static) inperr(); sym.s_static = 1; break; case 'v': if (sym.s_va) inperr(); sym.s_va = 1; sym.s_nva = (short)strtol(cp, &ep, 10); if (cp == ep) inperr(); cp = ep; break; case 'P': if (sym.s_prfl) inperr(); sym.s_prfl = 1; sym.s_nprfl = (short)strtol(cp, &ep, 10); if (cp == ep) inperr(); cp = ep; break; case 'S': if (sym.s_scfl) inperr(); sym.s_scfl = 1; sym.s_nscfl = (short)strtol(cp, &ep, 10); if (cp == ep) inperr(); cp = ep; break; } } /* read symbol name, doing renaming if necessary */ name = inpname(cp, &cp); renamed = 0; if (*cp == 'r') { cp++; name = xstrdup(name); rename = inpname(cp, &cp); /* enter it and see if it's already been renamed */ renamehte = _hsearch(renametab, name, 1); if (renamehte->h_hte == NULL) { hte = hsearch(rename, 1); renamehte->h_hte = hte; renamed = 1; } else if (strcmp((hte = renamehte->h_hte)->h_name, rename)) { pos1 = xstrdup(mkpos(&renamehte->h_syms->s_pos)); /* %s renamed multiple times\t%s :: %s */ msg(18, name, pos1, mkpos(&sym.s_pos)); free(pos1); } free((char *)name); } else { /* it might be a previously-done rename */ hte = _hsearch(renametab, name, 0); if (hte != NULL) hte = hte->h_hte; else hte = hsearch(name, 1); } hte->h_used |= used; if (sym.s_def == DEF || sym.s_def == TDEF) hte->h_def = 1; sym.s_type = inptype(cp, &cp); /* * Allocate memory for this symbol only if it was not already * declared or tentatively defined at the same location with * the same type. Works only for symbols with external linkage, * because static symbols, tentatively defined at the same location * but in different translation units are really different symbols. */ for (symp = hte->h_syms; symp != NULL; symp = symp->s_nxt) { if (symp->s_pos.p_isrc == sym.s_pos.p_isrc && symp->s_pos.p_iline == sym.s_pos.p_iline && symp->s_type == sym.s_type && ((symp->s_def == DECL && sym.s_def == DECL) || (!sflag && symp->s_def == TDEF && sym.s_def == TDEF)) && !symp->s_static && !sym.s_static) { break; } } if (symp == NULL) { /* allocsym reserviert keinen Platz fuer s_nva */ if (sym.s_va || sym.s_prfl || sym.s_scfl) { symp = xalloc(sizeof (sym_t)); STRUCT_ASSIGN(*symp, sym); } else { symp = xalloc(sizeof (symp->s_s)); STRUCT_ASSIGN(symp->s_s, sym.s_s); } *hte->h_lsym = symp; hte->h_lsym = &symp->s_nxt; /* XXX hack so we can remember where a symbol was renamed */ if (renamed) renamehte->h_syms = symp; } if (*cp != '\0') inperr(); } /* * Read an u-record (emitted by lint1 if a symbol was used). */ static void usedsym(pos_t *posp, const char *cp) { usym_t *usym; hte_t *hte; const char *name; usym = xalloc(sizeof (usym_t)); STRUCT_ASSIGN(usym->u_pos, *posp); /* needed as delimiter between two numbers */ if (*cp++ != 'x') inperr(); name = inpname(cp, &cp); hte = _hsearch(renametab, name, 0); if (hte != NULL) hte = hte->h_hte; else hte = hsearch(name, 1); hte->h_used = 1; *hte->h_lusym = usym; hte->h_lusym = &usym->u_nxt; } /* * Read a type and return the index of this type. */ static u_short inptype(const char *cp, const char **epp) { char c, s, *eptr; const char *ep; type_t *tp; int narg, i, osdef = 0; size_t tlen; u_short tidx, sidx; int h; /* If we have this type already, return it's index. */ tlen = gettlen(cp, &ep); h = thash(cp, tlen); if ((tidx = findtype(cp, tlen, h)) != 0) { *epp = ep; return (tidx); } /* No, we must create a new type. */ tp = xalloc(sizeof (type_t)); tidx = storetyp(tp, cp, tlen, h); c = *cp++; while (c == 'c' || c == 'v') { if (c == 'c') { tp->t_const = 1; } else { tp->t_volatile = 1; } c = *cp++; } if (c == 's' || c == 'u' || c == 'l' || c == 'e') { s = c; c = *cp++; } else { s = '\0'; } switch (c) { case 'C': tp->t_tspec = s == 's' ? SCHAR : (s == 'u' ? UCHAR : CHAR); break; case 'S': tp->t_tspec = s == 'u' ? USHORT : SHORT; break; case 'I': tp->t_tspec = s == 'u' ? UINT : INT; break; case 'L': tp->t_tspec = s == 'u' ? ULONG : LONG; break; case 'Q': tp->t_tspec = s == 'u' ? UQUAD : QUAD; break; case 'D': tp->t_tspec = s == 's' ? FLOAT : (s == 'l' ? LDOUBLE : DOUBLE); break; case 'V': tp->t_tspec = VOID; break; case 'P': tp->t_tspec = PTR; break; case 'A': tp->t_tspec = ARRAY; break; case 'F': case 'f': osdef = c == 'f'; tp->t_tspec = FUNC; break; case 'T': tp->t_tspec = s == 'e' ? ENUM : (s == 's' ? STRUCT : UNION); break; } switch (tp->t_tspec) { case ARRAY: tp->t_dim = (int)strtol(cp, &eptr, 10); cp = eptr; sidx = inptype(cp, &cp); /* force seq. point! (ditto below) */ tp->t_subt = TP(sidx); break; case PTR: sidx = inptype(cp, &cp); tp->t_subt = TP(sidx); break; case FUNC: c = *cp; if (isdigit((u_char)c)) { if (!osdef) tp->t_proto = 1; narg = (int)strtol(cp, &eptr, 10); cp = eptr; if ((tp->t_args = calloc((size_t)(narg + 1), sizeof (type_t *))) == NULL) nomem(); for (i = 0; i < narg; i++) { if (i == narg - 1 && *cp == 'E') { tp->t_vararg = 1; cp++; } else { sidx = inptype(cp, &cp); tp->t_args[i] = TP(sidx); } } } sidx = inptype(cp, &cp); tp->t_subt = TP(sidx); break; case ENUM: tp->t_tspec = INT; tp->t_isenum = 1; /* FALLTHROUGH */ case STRUCT: case UNION: switch (*cp++) { case '1': tp->t_istag = 1; tp->t_tag = hsearch(inpname(cp, &cp), 1); break; case '2': tp->t_istynam = 1; tp->t_tynam = hsearch(inpname(cp, &cp), 1); break; case '3': tp->t_isuniqpos = 1; tp->t_uniqpos.p_line = strtol(cp, &eptr, 10); cp = eptr; cp++; /* xlate to 'global' file name. */ tp->t_uniqpos.p_file = addoutfile(inpfns[strtol(cp, &eptr, 10)]); cp = eptr; cp++; tp->t_uniqpos.p_uniq = strtol(cp, &eptr, 10); cp = eptr; break; } break; case LONG: case VOID: case LDOUBLE: case DOUBLE: case FLOAT: case UQUAD: case QUAD: case ULONG: case UINT: case INT: case USHORT: case SHORT: case UCHAR: case SCHAR: case CHAR: case UNSIGN: case SIGNED: case NOTSPEC: break; } *epp = cp; return (tidx); } /* * Get the length of a type string. */ static int gettlen(const char *cp, const char **epp) { const char *cp1; char c, s, *eptr; tspec_t t; int narg, i, cm, vm; cp1 = cp; c = *cp++; cm = vm = 0; while (c == 'c' || c == 'v') { if (c == 'c') { if (cm) inperr(); cm = 1; } else { if (vm) inperr(); vm = 1; } c = *cp++; } if (c == 's' || c == 'u' || c == 'l' || c == 'e') { s = c; c = *cp++; } else { s = '\0'; } t = NOTSPEC; switch (c) { case 'C': if (s == 's') { t = SCHAR; } else if (s == 'u') { t = UCHAR; } else if (s == '\0') { t = CHAR; } break; case 'S': if (s == 'u') { t = USHORT; } else if (s == '\0') { t = SHORT; } break; case 'I': if (s == 'u') { t = UINT; } else if (s == '\0') { t = INT; } break; case 'L': if (s == 'u') { t = ULONG; } else if (s == '\0') { t = LONG; } break; case 'Q': if (s == 'u') { t = UQUAD; } else if (s == '\0') { t = QUAD; } break; case 'D': if (s == 's') { t = FLOAT; } else if (s == 'l') { t = LDOUBLE; } else if (s == '\0') { t = DOUBLE; } break; case 'V': if (s == '\0') t = VOID; break; case 'P': if (s == '\0') t = PTR; break; case 'A': if (s == '\0') t = ARRAY; break; case 'F': case 'f': if (s == '\0') t = FUNC; break; case 'T': if (s == 'e') { t = ENUM; } else if (s == 's') { t = STRUCT; } else if (s == 'u') { t = UNION; } break; default: inperr(); } if (t == NOTSPEC) inperr(); switch (t) { case ARRAY: (void)strtol(cp, &eptr, 10); if (cp == eptr) inperr(); cp = eptr; (void)gettlen(cp, &cp); break; case PTR: (void)gettlen(cp, &cp); break; case FUNC: c = *cp; if (isdigit((u_char)c)) { narg = (int)strtol(cp, &eptr, 10); cp = eptr; for (i = 0; i < narg; i++) { if (i == narg - 1 && *cp == 'E') { cp++; } else { (void)gettlen(cp, &cp); } } } (void)gettlen(cp, &cp); break; case ENUM: case STRUCT: case UNION: switch (*cp++) { case '1': (void)inpname(cp, &cp); break; case '2': (void)inpname(cp, &cp); break; case '3': /* unique position: line.file.uniquifier */ (void)strtol(cp, &eptr, 10); if (cp == eptr) inperr(); cp = eptr; if (*cp++ != '.') inperr(); (void)strtol(cp, &eptr, 10); if (cp == eptr) inperr(); cp = eptr; if (*cp++ != '.') inperr(); (void)strtol(cp, &eptr, 10); if (cp == eptr) inperr(); cp = eptr; break; default: inperr(); } break; case FLOAT: case USHORT: case SHORT: case UCHAR: case SCHAR: case CHAR: case UNSIGN: case SIGNED: case NOTSPEC: case INT: case UINT: case DOUBLE: case LDOUBLE: case VOID: case ULONG: case QUAD: case UQUAD: case LONG: break; } *epp = cp; return (cp - cp1); } /* - * Search a type by it's type string. + * Search a type by its type string. */ static u_short findtype(const char *cp, size_t len, int h) { thtab_t *thte; for (thte = thtab[h]; thte != NULL; thte = thte->th_nxt) { if (strncmp(thte->th_name, cp, len) != 0) continue; if (thte->th_name[len] == '\0') return (thte->th_idx); } return (0); } /* * Store a type and it's type string so we can later share this type * if we read the same type string from the input file. */ static u_short storetyp(type_t *tp, const char *cp, size_t len, int h) { static u_int tidx = 1; /* 0 is reserved */ thtab_t *thte; char *name; if (tidx >= USHRT_MAX) errx(1, "sorry, too many types"); if (tidx == tlstlen - 1) { if ((tlst = realloc(tlst, (tlstlen * 2) * sizeof (type_t *))) == NULL) nomem(); (void)memset(tlst + tlstlen, 0, tlstlen * sizeof (type_t *)); tlstlen *= 2; } tlst[tidx] = tp; /* create a hash table entry */ name = xalloc(len + 1); (void)memcpy(name, cp, len); name[len] = '\0'; thte = xalloc(sizeof (thtab_t)); thte->th_name = name; thte->th_idx = tidx; thte->th_nxt = thtab[h]; thtab[h] = thte; return ((u_short)tidx++); } /* * Hash function for types */ static int thash(const char *s, size_t len) { u_int v; v = 0; while (len-- != 0) { v = (v << sizeof (v)) + (u_char)*s++; v ^= v >> (sizeof (v) * CHAR_BIT - sizeof (v)); } return (v % THSHSIZ2); } /* * Read a string enclosed by "". This string may contain quoted chars. */ static char * inpqstrg(const char *src, const char **epp) { char *strg, *dst; size_t slen; int c; int v; if ((dst = strg = malloc(slen = 32)) == NULL) nomem(); if ((c = *src++) != '"') inperr(); if ((c = *src++) == '\0') inperr(); while (c != '"') { if (c == '\\') { if ((c = *src++) == '\0') inperr(); switch (c) { case 'n': c = '\n'; break; case 't': c = '\t'; break; case 'v': c = '\v'; break; case 'b': c = '\b'; break; case 'r': c = '\r'; break; case 'f': c = '\f'; break; case 'a': c = '\a'; break; case '\\': c = '\\'; break; case '"': c = '"'; break; case '\'': c = '\''; break; case '0': case '1': case '2': case '3': v = (c - '0') << 6; if ((c = *src++) < '0' || c > '7') inperr(); v |= (c - '0') << 3; if ((c = *src++) < '0' || c > '7') inperr(); v |= c - '0'; c = (u_char)v; break; default: inperr(); } } /* keep space for trailing '\0' */ if (dst - strg == slen - 1) { if ((strg = realloc(strg, slen * 2)) == NULL) nomem(); dst = strg + (slen - 1); slen *= 2; } *dst++ = (char)c; if ((c = *src++) == '\0') inperr(); } *dst = '\0'; *epp = src; return (strg); } /* * Read the name of a symbol in static memory. */ static const char * inpname(const char *cp, const char **epp) { static char *buf; static size_t blen = 0; size_t len, i; char *eptr, c; len = (int)strtol(cp, &eptr, 10); if (cp == eptr) inperr(); cp = eptr; if (len + 1 > blen) if ((buf = realloc(buf, blen = len + 1)) == NULL) nomem(); for (i = 0; i < len; i++) { c = *cp++; if (!isalnum((unsigned char)c) && c != '_') inperr(); buf[i] = c; } buf[i] = '\0'; *epp = cp; return (buf); } /* * Return the index of a file name. If the name cannot be found, create * a new entry and return the index of the newly created entry. */ static int getfnidx(const char *fn) { int i; /* 0 ist reserved */ for (i = 1; fnames[i] != NULL; i++) { if (strcmp(fnames[i], fn) == 0) break; } if (fnames[i] != NULL) return (i); if (i == nfnames - 1) { if ((fnames = realloc(fnames, (nfnames * 2) * sizeof (char *))) == NULL) nomem(); (void)memset(fnames + nfnames, 0, nfnames * sizeof (char *)); nfnames *= 2; } if ((fnames[i] = strdup(fn)) == NULL) nomem(); return (i); } /* * Separate symbols with static and external linkage. */ void mkstatic(hte_t *hte) { sym_t *sym1, **symp, *sym; fcall_t **callp, *call; usym_t **usymp, *usym; hte_t *nhte; int ofnd; /* Look for first static definition */ for (sym1 = hte->h_syms; sym1 != NULL; sym1 = sym1->s_nxt) { if (sym1->s_static) break; } if (sym1 == NULL) return; /* Do nothing if this name is used only in one translation unit. */ ofnd = 0; for (sym = hte->h_syms; sym != NULL && !ofnd; sym = sym->s_nxt) { if (sym->s_pos.p_src != sym1->s_pos.p_src) ofnd = 1; } for (call = hte->h_calls; call != NULL && !ofnd; call = call->f_nxt) { if (call->f_pos.p_src != sym1->s_pos.p_src) ofnd = 1; } for (usym = hte->h_usyms; usym != NULL && !ofnd; usym = usym->u_nxt) { if (usym->u_pos.p_src != sym1->s_pos.p_src) ofnd = 1; } if (!ofnd) { hte->h_used = 1; /* errors about undef. static symbols are printed in lint1 */ hte->h_def = 1; hte->h_static = 1; return; } /* * Create a new hash table entry * * XXX this entry should be put at the beginning of the list to * avoid to process the same symbol twice. */ for (nhte = hte; nhte->h_link != NULL; nhte = nhte->h_link) continue; nhte->h_link = xmalloc(sizeof (hte_t)); nhte = nhte->h_link; nhte->h_name = hte->h_name; nhte->h_used = 1; nhte->h_def = 1; /* error in lint1 */ nhte->h_static = 1; nhte->h_syms = NULL; nhte->h_lsym = &nhte->h_syms; nhte->h_calls = NULL; nhte->h_lcall = &nhte->h_calls; nhte->h_usyms = NULL; nhte->h_lusym = &nhte->h_usyms; nhte->h_link = NULL; nhte->h_hte = NULL; /* * move all symbols used in this translation unit into the new * hash table entry. */ for (symp = &hte->h_syms; (sym = *symp) != NULL; ) { if (sym->s_pos.p_src == sym1->s_pos.p_src) { sym->s_static = 1; (*symp) = sym->s_nxt; if (hte->h_lsym == &sym->s_nxt) hte->h_lsym = symp; sym->s_nxt = NULL; *nhte->h_lsym = sym; nhte->h_lsym = &sym->s_nxt; } else { symp = &sym->s_nxt; } } for (callp = &hte->h_calls; (call = *callp) != NULL; ) { if (call->f_pos.p_src == sym1->s_pos.p_src) { (*callp) = call->f_nxt; if (hte->h_lcall == &call->f_nxt) hte->h_lcall = callp; call->f_nxt = NULL; *nhte->h_lcall = call; nhte->h_lcall = &call->f_nxt; } else { callp = &call->f_nxt; } } for (usymp = &hte->h_usyms; (usym = *usymp) != NULL; ) { if (usym->u_pos.p_src == sym1->s_pos.p_src) { (*usymp) = usym->u_nxt; if (hte->h_lusym == &usym->u_nxt) hte->h_lusym = usymp; usym->u_nxt = NULL; *nhte->h_lusym = usym; nhte->h_lusym = &usym->u_nxt; } else { usymp = &usym->u_nxt; } } /* h_def must be recalculated for old hte */ hte->h_def = nhte->h_def = 0; for (sym = hte->h_syms; sym != NULL; sym = sym->s_nxt) { if (sym->s_def == DEF || sym->s_def == TDEF) { hte->h_def = 1; break; } } mkstatic(hte); } Index: head/usr.sbin/bhyve/bhyverun.c =================================================================== --- head/usr.sbin/bhyve/bhyverun.c (revision 308456) +++ head/usr.sbin/bhyve/bhyverun.c (revision 308457) @@ -1,971 +1,971 @@ /*- * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bhyverun.h" #include "acpi.h" #include "atkbdc.h" #include "inout.h" #include "dbgport.h" #include "fwctl.h" #include "ioapic.h" #include "mem.h" #include "mevent.h" #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" #include "pci_lpc.h" #include "smbiostbl.h" #include "xmsr.h" #include "spinup_ap.h" #include "rtc.h" #define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */ #define MB (1024UL * 1024) #define GB (1024UL * MB) typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); char *vmname; int guest_ncpus; char *guest_uuid_str; static int guest_vmexit_on_hlt, guest_vmexit_on_pause; static int virtio_msix = 1; static int x2apic_mode = 0; /* default is xAPIC */ static int strictio; static int strictmsr = 1; static int acpi; static char *progname; static const int BSP = 0; static cpuset_t cpumask; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); static struct vm_exit vmexit[VM_MAXCPU]; struct bhyvestats { uint64_t vmexit_bogus; uint64_t vmexit_reqidle; uint64_t vmexit_hlt; uint64_t vmexit_pause; uint64_t vmexit_mtrap; uint64_t vmexit_inst_emul; uint64_t cpu_switch_rotate; uint64_t cpu_switch_direct; } stats; struct mt_vmm_info { pthread_t mt_thr; struct vmctx *mt_ctx; int mt_vcpu; } mt_vmm_info[VM_MAXCPU]; static cpuset_t *vcpumap[VM_MAXCPU] = { NULL }; static void usage(int code) { fprintf(stderr, "Usage: %s [-abehuwxACHPSWY] [-c vcpus] [-g ] [-l ]\n" " %*s [-m mem] [-p vcpu:hostcpu] [-s ] [-U uuid] \n" " -a: local apic is in xAPIC mode (deprecated)\n" " -A: create ACPI tables\n" " -c: # cpus (default 1)\n" " -C: include guest memory in core file\n" " -e: exit on unhandled I/O access\n" " -g: gdb port\n" " -h: help\n" " -H: vmexit from the guest on hlt\n" " -l: LPC device configuration\n" " -m: memory size in MB\n" " -p: pin 'vcpu' to 'hostcpu'\n" " -P: vmexit from the guest on pause\n" " -s: PCI slot config\n" " -S: guest memory cannot be swapped\n" " -u: RTC keeps UTC time\n" " -U: uuid\n" " -w: ignore unimplemented MSRs\n" " -W: force virtio to use single-vector MSI\n" " -x: local apic is in x2APIC mode\n" " -Y: disable MPtable generation\n", progname, (int)strlen(progname), ""); exit(code); } static int pincpu_parse(const char *opt) { int vcpu, pcpu; if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { fprintf(stderr, "invalid format: %s\n", opt); return (-1); } if (vcpu < 0 || vcpu >= VM_MAXCPU) { fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n", vcpu, VM_MAXCPU - 1); return (-1); } if (pcpu < 0 || pcpu >= CPU_SETSIZE) { fprintf(stderr, "hostcpu '%d' outside valid range from " "0 to %d\n", pcpu, CPU_SETSIZE - 1); return (-1); } if (vcpumap[vcpu] == NULL) { if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) { perror("malloc"); return (-1); } CPU_ZERO(vcpumap[vcpu]); } CPU_SET(pcpu, vcpumap[vcpu]); return (0); } void vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, int errcode) { struct vmctx *ctx; int error, restart_instruction; ctx = arg; restart_instruction = 1; error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode, restart_instruction); assert(error == 0); } void * paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len) { return (vm_map_gpa(ctx, gaddr, len)); } int fbsdrun_vmexit_on_pause(void) { return (guest_vmexit_on_pause); } int fbsdrun_vmexit_on_hlt(void) { return (guest_vmexit_on_hlt); } int fbsdrun_virtio_msix(void) { return (virtio_msix); } static void * fbsdrun_start_thread(void *param) { char tname[MAXCOMLEN + 1]; struct mt_vmm_info *mtp; int vcpu; mtp = param; vcpu = mtp->mt_vcpu; snprintf(tname, sizeof(tname), "vcpu %d", vcpu); pthread_set_name_np(mtp->mt_thr, tname); vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); /* not reached */ exit(1); return (NULL); } void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) { int error; assert(fromcpu == BSP); /* * The 'newcpu' must be activated in the context of 'fromcpu'. If * vm_activate_cpu() is delayed until newcpu's pthread starts running * then vmm.ko is out-of-sync with bhyve and this can create a race * with vm_suspend(). */ error = vm_activate_cpu(ctx, newcpu); if (error != 0) err(EX_OSERR, "could not activate CPU %d", newcpu); CPU_SET_ATOMIC(newcpu, &cpumask); /* * Set up the vmexit struct to allow execution to start * at the given RIP */ vmexit[newcpu].rip = rip; vmexit[newcpu].inst_length = 0; mt_vmm_info[newcpu].mt_ctx = ctx; mt_vmm_info[newcpu].mt_vcpu = newcpu; error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, fbsdrun_start_thread, &mt_vmm_info[newcpu]); assert(error == 0); } static int fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) { if (!CPU_ISSET(vcpu, &cpumask)) { fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); exit(1); } CPU_CLR_ATOMIC(vcpu, &cpumask); return (CPU_EMPTY(&cpumask)); } static int vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, uint32_t eax) { #if BHYVE_DEBUG /* * put guest-driven debug here */ #endif return (VMEXIT_CONTINUE); } static int vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; int bytes, port, in, out; int vcpu; vcpu = *pvcpu; port = vme->u.inout.port; bytes = vme->u.inout.bytes; in = vme->u.inout.in; out = !in; /* Extra-special case of host notifications */ if (out && port == GUEST_NIO_PORT) { error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); return (error); } error = emulate_inout(ctx, vcpu, vme, strictio); if (error) { fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), port, vmexit->rip); return (VMEXIT_ABORT); } else { return (VMEXIT_CONTINUE); } } static int vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { uint64_t val; uint32_t eax, edx; int error; val = 0; error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val); if (error != 0) { fprintf(stderr, "rdmsr to register %#x on vcpu %d\n", vme->u.msr.code, *pvcpu); if (strictmsr) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } eax = val; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax); assert(error == 0); edx = val >> 32; error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx); assert(error == 0); return (VMEXIT_CONTINUE); } static int vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval); if (error != 0) { fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n", vme->u.msr.code, vme->u.msr.wval, *pvcpu); if (strictmsr) { vm_inject_gp(ctx, *pvcpu); return (VMEXIT_CONTINUE); } } return (VMEXIT_CONTINUE); } static int vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { (void)spinup_ap(ctx, *pvcpu, vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); return (VMEXIT_CONTINUE); } #define DEBUG_EPT_MISCONFIG #ifdef DEBUG_EPT_MISCONFIG #define EXIT_REASON_EPT_MISCONFIG 49 #define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 #define VMCS_IDENT(x) ((x) | 0x80000000) static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; static int ept_misconfig_ptenum; #endif static int vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tVMX\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status); fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason); fprintf(stderr, "\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); #ifdef DEBUG_EPT_MISCONFIG if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { vm_get_register(ctx, *pvcpu, VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), &ept_misconfig_gpa); vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, &ept_misconfig_ptenum); fprintf(stderr, "\tEPT misconfiguration:\n"); fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", ept_misconfig_ptenum, ept_misconfig_pte[0], ept_misconfig_pte[1], ept_misconfig_pte[2], ept_misconfig_pte[3]); } #endif /* DEBUG_EPT_MISCONFIG */ return (VMEXIT_ABORT); } static int vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { fprintf(stderr, "vm exit[%d]\n", *pvcpu); fprintf(stderr, "\treason\t\tSVM\n"); fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode); fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1); fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2); return (VMEXIT_ABORT); } static int vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_bogus++; return (VMEXIT_CONTINUE); } static int vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_reqidle++; return (VMEXIT_CONTINUE); } static int vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_hlt++; /* * Just continue execution with the next instruction. We use * the HLT VM exit as a way to be friendly with the host * scheduler. */ return (VMEXIT_CONTINUE); } static int vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { stats.vmexit_pause++; return (VMEXIT_CONTINUE); } static int vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { assert(vmexit->inst_length == 0); stats.vmexit_mtrap++; return (VMEXIT_CONTINUE); } static int vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { int err, i; struct vie *vie; stats.vmexit_inst_emul++; vie = &vmexit->u.inst_emul.vie; err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, vie, &vmexit->u.inst_emul.paging); if (err) { if (err == ESRCH) { fprintf(stderr, "Unhandled memory access to 0x%lx\n", vmexit->u.inst_emul.gpa); } fprintf(stderr, "Failed to emulate instruction ["); for (i = 0; i < vie->num_valid; i++) { fprintf(stderr, "0x%02x%s", vie->inst[i], i != (vie->num_valid - 1) ? " " : ""); } fprintf(stderr, "] at 0x%lx\n", vmexit->rip); return (VMEXIT_ABORT); } return (VMEXIT_CONTINUE); } static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; static int vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { enum vm_suspend_how how; how = vmexit->u.suspended.how; fbsdrun_deletecpu(ctx, *pvcpu); if (*pvcpu != BSP) { pthread_mutex_lock(&resetcpu_mtx); pthread_cond_signal(&resetcpu_cond); pthread_mutex_unlock(&resetcpu_mtx); pthread_exit(NULL); } pthread_mutex_lock(&resetcpu_mtx); while (!CPU_EMPTY(&cpumask)) { pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); } pthread_mutex_unlock(&resetcpu_mtx); switch (how) { case VM_SUSPEND_RESET: exit(0); case VM_SUSPEND_POWEROFF: exit(1); case VM_SUSPEND_HALT: exit(2); case VM_SUSPEND_TRIPLEFAULT: exit(3); default: fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); exit(100); } return (0); /* NOTREACHED */ } static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, }; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) { int error, rc; enum vm_exitcode exitcode; cpuset_t active_cpus; if (vcpumap[vcpu] != NULL) { error = pthread_setaffinity_np(pthread_self(), sizeof(cpuset_t), vcpumap[vcpu]); assert(error == 0); } error = vm_active_cpus(ctx, &active_cpus); assert(CPU_ISSET(vcpu, &active_cpus)); error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); assert(error == 0); while (1) { error = vm_run(ctx, vcpu, &vmexit[vcpu]); if (error != 0) break; exitcode = vmexit[vcpu].exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); exit(1); } rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); switch (rc) { case VMEXIT_CONTINUE: break; case VMEXIT_ABORT: abort(); default: exit(1); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); } static int num_vcpus_allowed(struct vmctx *ctx) { int tmp, error; error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp); /* * The guest is allowed to spinup more than one processor only if the * UNRESTRICTED_GUEST capability is available. */ if (error == 0) return (VM_MAXCPU); else return (1); } void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) { int err, tmp; if (fbsdrun_vmexit_on_hlt()) { err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); exit(1); } vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_HLT] = vmexit_hlt; } if (fbsdrun_vmexit_on_pause()) { /* * pause exit support required for this mode */ err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp); if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); exit(1); } vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); if (cpu == BSP) handler[VM_EXITCODE_PAUSE] = vmexit_pause; } if (x2apic_mode) err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED); else err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED); if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); exit(1); } vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1); } static struct vmctx * do_open(const char *vmname) { struct vmctx *ctx; int error; bool reinit, romboot; reinit = romboot = false; if (lpc_bootrom()) romboot = true; error = vm_create(vmname); if (error) { if (errno == EEXIST) { if (romboot) { reinit = true; } else { /* * The virtual machine has been setup by the * userspace bootloader. */ } } else { perror("vm_create"); exit(1); } } else { if (!romboot) { /* * If the virtual machine was just created then a * bootrom must be configured to boot it. */ fprintf(stderr, "virtual machine cannot be booted\n"); exit(1); } } ctx = vm_open(vmname); if (ctx == NULL) { perror("vm_open"); exit(1); } if (reinit) { error = vm_reinit(ctx); if (error) { perror("vm_reinit"); exit(1); } } return (ctx); } int main(int argc, char *argv[]) { int c, error, gdb_port, err, bvmcons; int max_vcpus, mptgen, memflags; int rtc_localtime; struct vmctx *ctx; uint64_t rip; size_t memsize; char *optstr; bvmcons = 0; progname = basename(argv[0]); gdb_port = 0; guest_ncpus = 1; memsize = 256 * MB; mptgen = 1; rtc_localtime = 1; memflags = 0; optstr = "abehuwxACHIPSWYp:g:c:s:m:l:U:"; while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': x2apic_mode = 0; break; case 'A': acpi = 1; break; case 'b': bvmcons = 1; break; case 'p': if (pincpu_parse(optarg) != 0) { errx(EX_USAGE, "invalid vcpu pinning " "configuration '%s'", optarg); } break; case 'c': guest_ncpus = atoi(optarg); break; case 'C': memflags |= VM_MEM_F_INCORE; break; case 'g': gdb_port = atoi(optarg); break; case 'l': if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; case 's': if (pci_parse_slot(optarg) != 0) exit(1); else break; case 'S': memflags |= VM_MEM_F_WIRED; break; case 'm': error = vm_parse_memsize(optarg, &memsize); if (error) errx(EX_USAGE, "invalid memsize '%s'", optarg); break; case 'H': guest_vmexit_on_hlt = 1; break; case 'I': /* * The "-I" option was used to add an ioapic to the * virtual machine. * * An ioapic is now provided unconditionally for each * virtual machine and this option is now deprecated. */ break; case 'P': guest_vmexit_on_pause = 1; break; case 'e': strictio = 1; break; case 'u': rtc_localtime = 0; break; case 'U': guest_uuid_str = optarg; break; case 'w': strictmsr = 0; break; case 'W': virtio_msix = 0; break; case 'x': x2apic_mode = 1; break; case 'Y': mptgen = 0; break; case 'h': usage(0); default: usage(1); } } argc -= optind; argv += optind; if (argc != 1) usage(1); vmname = argv[0]; ctx = do_open(vmname); if (guest_ncpus < 1) { fprintf(stderr, "Invalid guest vCPUs (%d)\n", guest_ncpus); exit(1); } max_vcpus = num_vcpus_allowed(ctx); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); exit(1); } fbsdrun_set_capabilities(ctx, BSP); vm_set_memflags(ctx, memflags); err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); if (err) { fprintf(stderr, "Unable to setup memory (%d)\n", errno); exit(1); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); exit(1); } init_mem(); init_inout(); atkbdc_init(ctx); pci_irq_init(ctx); ioapic_init(ctx); rtc_init(ctx, rtc_localtime); sci_init(ctx); /* - * Exit if a device emulation finds an error in it's initilization + * Exit if a device emulation finds an error in its initilization */ if (init_pci(ctx) != 0) exit(1); if (gdb_port != 0) init_dbgport(gdb_port); if (bvmcons) init_bvmcons(); if (lpc_bootrom()) { if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) { fprintf(stderr, "ROM boot failed: unrestricted guest " "capability not available\n"); exit(1); } error = vcpu_reset(ctx, BSP); assert(error == 0); } error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); assert(error == 0); /* * build the guest tables, MP etc. */ if (mptgen) { error = mptable_build(ctx, guest_ncpus); if (error) exit(1); } error = smbios_build(ctx); assert(error == 0); if (acpi) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } if (lpc_bootrom()) fwctl_init(); /* * Change the proc title to include the VM name. */ setproctitle("%s", vmname); /* * Add CPU 0 */ fbsdrun_addcpu(ctx, BSP, BSP, rip); /* * Head off to the main event dispatch loop */ mevent_dispatch(); exit(1); } Index: head/usr.sbin/bsnmpd/modules/snmp_bridge/BEGEMOT-BRIDGE-MIB.txt =================================================================== --- head/usr.sbin/bsnmpd/modules/snmp_bridge/BEGEMOT-BRIDGE-MIB.txt (revision 308456) +++ head/usr.sbin/bsnmpd/modules/snmp_bridge/BEGEMOT-BRIDGE-MIB.txt (revision 308457) @@ -1,1166 +1,1166 @@ -- -- Copyright (C) 2006 Shteryana Shopova -- All rights reserved. -- -- Redistribution and use in source and binary forms, with or without -- modification, are permitted provided that the following conditions -- are met: -- 1. Redistributions of source code must retain the above copyright -- notice, this list of conditions and the following disclaimer. -- 2. Redistributions in binary form must reproduce the above copyright -- notice, this list of conditions and the following disclaimer in the -- documentation and/or other materials provided with the distribution. -- -- THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND -- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -- ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE -- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -- OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -- OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -- SUCH DAMAGE. -- -- $FreeBSD$ -- BEGEMOT-BRIDGE-MIB DEFINITIONS ::= BEGIN IMPORTS MODULE-IDENTITY, OBJECT-TYPE, NOTIFICATION-TYPE, Counter32, Integer32, TimeTicks, mib-2 FROM SNMPv2-SMI TEXTUAL-CONVENTION, MacAddress, TruthValue, RowStatus FROM SNMPv2-TC BridgeId, Timeout FROM BRIDGE-MIB InterfaceIndex FROM IF-MIB begemot FROM BEGEMOT-MIB; begemotBridge MODULE-IDENTITY LAST-UPDATED "200708060000Z" ORGANIZATION "Sofia University St. Kliment Ohridski" CONTACT-INFO " Shteryana Shopova Postal: Faculty of Mathematics and Informatics 5 James Bourchier Blvd. 1164 Sofia Bulgaria Fax: +359 2 687 180 E-Mail: syrinx@FreeBSD.org" DESCRIPTION "The Begemot MIB for managing bridge interfaces." REVISION "200708060000Z" DESCRIPTION "Third revision adds begemotBridgeBasePortPrivate object." REVISION "200611210000Z" DESCRIPTION "Second revision adds support for monitoring RSTP specific variables." REVISION "200607270000Z" DESCRIPTION "Initial revision." ::= { begemot 205 } -- ---------------------------------------------------------- -- BridgeIfName ::= TEXTUAL-CONVENTION DISPLAY-HINT "16a" STATUS current DESCRIPTION "Name of a bridge interface." SYNTAX OCTET STRING (SIZE(1..16)) BridgeIfNameOrEmpty ::= TEXTUAL-CONVENTION DISPLAY-HINT "16a" STATUS current DESCRIPTION "Name of a bridge interface." SYNTAX OCTET STRING (SIZE(0..16)) BridgePortId ::= TEXTUAL-CONVENTION DISPLAY-HINT "1x.1x" STATUS current DESCRIPTION "A port identifier that contains a bridge port's STP priority in the first octet and the port number in the second octet." SYNTAX OCTET STRING (SIZE(2)) -- ---------------------------------------------------------- -- -- subtrees in the Begemot Bridge MIB -- ---------------------------------------------------------- -- begemotBridgeNotifications OBJECT IDENTIFIER ::= { begemotBridge 0 } begemotBridgeBase OBJECT IDENTIFIER ::= { begemotBridge 1 } begemotBridgeStp OBJECT IDENTIFIER ::= { begemotBridge 2 } begemotBridgeTp OBJECT IDENTIFIER ::= { begemotBridge 3 } begemotBridgePf OBJECT IDENTIFIER ::= { begemotBridge 4 } begemotBridgeConfigObjects OBJECT IDENTIFIER ::= { begemotBridge 5 } -- ---------------------------------------------------------- -- -- the base Bridge interface table -- ---------------------------------------------------------- -- begemotBridgeBaseTable OBJECT-TYPE SYNTAX SEQUENCE OF BegemotBridgeBaseEntry MAX-ACCESS not-accessible STATUS current DESCRIPTION "A table that contains generic information for each bridge interface on the managed device." ::= { begemotBridgeBase 1 } begemotBridgeBaseEntry OBJECT-TYPE SYNTAX BegemotBridgeBaseEntry MAX-ACCESS not-accessible STATUS current DESCRIPTION "A list of information for the bridge interfaces on the managed device." INDEX { begemotBridgeBaseName } ::= { begemotBridgeBaseTable 1 } BegemotBridgeBaseEntry ::= SEQUENCE { begemotBridgeBaseName BridgeIfName, begemotBridgeBaseAddress MacAddress, begemotBridgeBaseNumPorts Integer32, begemotBridgeBaseType INTEGER, begemotBridgeBaseStatus RowStatus } begemotBridgeBaseName OBJECT-TYPE SYNTAX BridgeIfName MAX-ACCESS read-only STATUS current DESCRIPTION "The name of the bridge interface for which this entry contains management information." ::= { begemotBridgeBaseEntry 1 } begemotBridgeBaseAddress OBJECT-TYPE SYNTAX MacAddress MAX-ACCESS read-only STATUS current DESCRIPTION "The MAC address of the bridge interface." ::= { begemotBridgeBaseEntry 2 } begemotBridgeBaseNumPorts OBJECT-TYPE SYNTAX Integer32 MAX-ACCESS read-only STATUS current DESCRIPTION "The number of ports, members of this bridge." ::= { begemotBridgeBaseEntry 3 } begemotBridgeBaseType OBJECT-TYPE SYNTAX INTEGER { unknown(1), transparent-only(2), sourceroute-only(3), srt(4) } MAX-ACCESS read-only STATUS current DESCRIPTION "Indicates what type of bridging this bridge can perform." ::= { begemotBridgeBaseEntry 4 } begemotBridgeBaseStatus OBJECT-TYPE SYNTAX RowStatus MAX-ACCESS read-create STATUS current DESCRIPTION "Used to create/destroy bridge interfaces on the managed device." ::= { begemotBridgeBaseEntry 5 } -- ---------------------------------------------------------- -- -- the base Bridge ports table -- ---------------------------------------------------------- -- begemotBridgeBasePortTable OBJECT-TYPE SYNTAX SEQUENCE OF BegemotBridgeBasePortEntry MAX-ACCESS not-accessible STATUS current DESCRIPTION "A table containing generic information about ports, members of each bridge interface." ::= { begemotBridgeBase 2 } begemotBridgeBasePortEntry OBJECT-TYPE SYNTAX BegemotBridgeBasePortEntry MAX-ACCESS not-accessible STATUS current DESCRIPTION "A list of information about a specific port, member of a bridge interface." INDEX { begemotBridgeBaseName, begemotBridgeBasePortIfIndex } ::= { begemotBridgeBasePortTable 1 } BegemotBridgeBasePortEntry ::= SEQUENCE { begemotBridgeBasePort Integer32, begemotBridgeBasePortIfIndex InterfaceIndex, begemotBridgeBaseSpanEnabled INTEGER, begemotBridgeBasePortDelayExceededDiscards Counter32, begemotBridgeBasePortMtuExceededDiscards Counter32, begemotBridgeBasePortStatus RowStatus, begemotBridgeBasePortPrivate TruthValue } begemotBridgeBasePort OBJECT-TYPE SYNTAX Integer32 (1..65535) MAX-ACCESS read-only STATUS current DESCRIPTION "The system interface index of the interface corresponding to this port." ::= { begemotBridgeBasePortEntry 1 } begemotBridgeBasePortIfIndex OBJECT-TYPE SYNTAX InterfaceIndex MAX-ACCESS read-only STATUS current DESCRIPTION "The value of the instance of the ifIndex object, defined in IF-MIB, for the interface corresponding to this port." ::= { begemotBridgeBasePortEntry 2 } begemotBridgeBaseSpanEnabled OBJECT-TYPE SYNTAX INTEGER { enabled(1), disabled(2) } MAX-ACCESS read-write STATUS current DESCRIPTION "The value of this objects reflects whether the port is a span port on the specified bridge interface." ::= { begemotBridgeBasePortEntry 3 } begemotBridgeBasePortDelayExceededDiscards OBJECT-TYPE SYNTAX Counter32 MAX-ACCESS read-only STATUS current DESCRIPTION "The number of frames discarded by this port due to excessive transit delay through the bridge." ::= { begemotBridgeBasePortEntry 4 } begemotBridgeBasePortMtuExceededDiscards OBJECT-TYPE SYNTAX Counter32 MAX-ACCESS read-only STATUS current DESCRIPTION "The number of frames discarded by this port due to an excessive size." ::= { begemotBridgeBasePortEntry 5 } begemotBridgeBasePortStatus OBJECT-TYPE SYNTAX RowStatus MAX-ACCESS read-create STATUS current DESCRIPTION "Used to control addition of member ports to or removal of member ports from a specified bridge." ::= { begemotBridgeBasePortEntry 6 } begemotBridgeBasePortPrivate OBJECT-TYPE SYNTAX TruthValue MAX-ACCESS read-write STATUS current DESCRIPTION "The value of this objects reflects whether the port has a PRIVATE flag set. A port with this flags set can only communicate with ports not having the PRIVATE flag set." ::= { begemotBridgeBasePortEntry 7 } -- ---------------------------------------------------------- -- -- the Bridge interface STP table -- ---------------------------------------------------------- -- begemotBridgeStpTable OBJECT-TYPE SYNTAX SEQUENCE OF BegemotBridgeStpEntry MAX-ACCESS not-accessible STATUS current DESCRIPTION "A table that contains Spanning Tree Protocol information for each bridge interface on the managed device." ::= { begemotBridgeStp 1 } begemotBridgeStpEntry OBJECT-TYPE SYNTAX BegemotBridgeStpEntry MAX-ACCESS not-accessible STATUS current DESCRIPTION "A list of information about the Spanning Tree Protocol operation on a bridge interface." AUGMENTS { begemotBridgeBaseEntry } ::= { begemotBridgeStpTable 1 } BegemotBridgeStpEntry ::= SEQUENCE { begemotBridgeStpProtocolSpecification INTEGER, begemotBridgeStpPriority Integer32, begemotBridgeStpTimeSinceTopologyChange TimeTicks, begemotBridgeStpTopChanges Counter32, begemotBridgeStpDesignatedRoot BridgeId, begemotBridgeStpRootCost Integer32, begemotBridgeStpRootPort Integer32, begemotBridgeStpMaxAge Timeout, begemotBridgeStpHelloTime Timeout, begemotBridgeStpHoldTime Integer32, begemotBridgeStpForwardDelay Timeout, begemotBridgeStpBridgeMaxAge Timeout, begemotBridgeStpBridgeHelloTime Timeout, begemotBridgeStpBridgeForwardDelay Timeout, begemotBridgeStpVersion INTEGER, begemotBridgeStpTxHoldCount Integer32 } begemotBridgeStpProtocolSpecification OBJECT-TYPE SYNTAX INTEGER { unknown(1), decLb100(2), ieee8021d(3) } MAX-ACCESS read-only STATUS current DESCRIPTION "The Spanning Tree Protocol version being run on the bridge interface. The value 'decLb100(2)' indicates the DEC LANbridge 100 Spanning Tree protocol, 'ieee8021d(3)' indicates the bridge is running IEEE 802.1D STP implementation." ::= { begemotBridgeStpEntry 1 } begemotBridgeStpPriority OBJECT-TYPE SYNTAX Integer32 (0..65535) MAX-ACCESS read-write STATUS current DESCRIPTION "The priority value of the bridge interface forming the first two octets of the bridge identifier. Acceptable values are 0-61440, in steps of 4096." ::= { begemotBridgeStpEntry 2 } begemotBridgeStpTimeSinceTopologyChange OBJECT-TYPE SYNTAX TimeTicks UNITS "centi-seconds" MAX-ACCESS read-only STATUS current DESCRIPTION "The time (in hundreds of a second) since a topology change was last detected by this bridge." ::= { begemotBridgeStpEntry 3 } begemotBridgeStpTopChanges OBJECT-TYPE SYNTAX Counter32 MAX-ACCESS read-only STATUS current DESCRIPTION "The number of times a topology change was detected by the bridge interface since the management entity was initialized or reset." ::= { begemotBridgeStpEntry 4 } begemotBridgeStpDesignatedRoot OBJECT-TYPE SYNTAX BridgeId MAX-ACCESS read-only STATUS current DESCRIPTION "The bridge identifier of the root of the spanning tree as calculated by the Spanning Tree Protocol." ::= { begemotBridgeStpEntry 5 } begemotBridgeStpRootCost OBJECT-TYPE SYNTAX Integer32 MAX-ACCESS read-only STATUS current DESCRIPTION "The cost of the path from this bridge to the root bridge." ::= { begemotBridgeStpEntry 6 } begemotBridgeStpRootPort OBJECT-TYPE SYNTAX Integer32 MAX-ACCESS read-only STATUS current DESCRIPTION "The port number of the port that offers the lowest cost path from this bridge to the root bridge of the spanning tree. If this bridge is the root bridge, this object shall have a value of zero." ::= { begemotBridgeStpEntry 7 } begemotBridgeStpMaxAge OBJECT-TYPE SYNTAX Timeout UNITS "centi-seconds" MAX-ACCESS read-only STATUS current DESCRIPTION "The maximum age of Spanning Tree Protocol information received from the network on any port, before that information is discarded. This is the actual value that the bridge is currently using." ::= { begemotBridgeStpEntry 8 } begemotBridgeStpHelloTime OBJECT-TYPE SYNTAX Timeout UNITS "centi-seconds" MAX-ACCESS read-only STATUS current DESCRIPTION "The amount of time between transmission of Configuration BPDUs by this bridge on any port, when it is the root of the spanning tree or is trying to become so. This is the actual value that this bridge is currently using." ::= { begemotBridgeStpEntry 9 } begemotBridgeStpHoldTime OBJECT-TYPE SYNTAX Integer32 UNITS "centi-seconds" MAX-ACCESS read-only STATUS current DESCRIPTION "This time value determines the interval length during which no more than two Configuration BPDUs shall be transmitted by this node, in units of hundredths of a second." ::= { begemotBridgeStpEntry 10 } begemotBridgeStpForwardDelay OBJECT-TYPE SYNTAX Timeout UNITS "centi-seconds" MAX-ACCESS read-only STATUS current DESCRIPTION "This value, measured in units of hundredths of a second determines how long a port will stay consecutively in the Listening and Learning states before transitioning to Forwarding state. This is the actual value currently used by the bridge as opposed to begemotBridgeStpBridgeForwardDelay, which is the value this and all bridges participating in the spanning tree were to use, if this was the root bridge." ::= { begemotBridgeStpEntry 11 } begemotBridgeStpBridgeMaxAge OBJECT-TYPE SYNTAX Timeout (600..4000) UNITS "centi-seconds" MAX-ACCESS read-write STATUS current DESCRIPTION "The value that all bridges participating in the spanning tree would use for MaxAge if this bridge was the root of the spanning tree." ::= { begemotBridgeStpEntry 12 } begemotBridgeStpBridgeHelloTime OBJECT-TYPE SYNTAX Timeout (100..1000) UNITS "centi-seconds" MAX-ACCESS read-write STATUS current DESCRIPTION "The value that all bridges participating in the spanning tree would use for HelloTime if this bridge was the root of the spanning tree." ::= { begemotBridgeStpEntry 13 } begemotBridgeStpBridgeForwardDelay OBJECT-TYPE SYNTAX Timeout (400..3000) UNITS "centi-seconds" MAX-ACCESS read-write STATUS current DESCRIPTION "The value that all bridges participating in the spanning tree would use for ForwardDelay if this bridge was the root of the spanning tree." ::= { begemotBridgeStpEntry 14 } begemotBridgeStpVersion OBJECT-TYPE SYNTAX INTEGER { stpCompatible(0), rstp(2) } MAX-ACCESS read-write STATUS current DESCRIPTION "The version of Spanning Tree Protocol the bridge is currently running. The value 'stpCompatible(0)' indicates the Spanning Tree Protocol specified in IEEE 802.1D-1998 and 'rstp(2)' indicates the Rapid Spanning Tree Protocol specified in IEEE 802.1w and clause 17 of 802.1D-2004. The values are directly from the IEEE standard. New values may be defined as future versions of the protocol become available. The value of this object MUST be retained across reinitializations of the management system." DEFVAL { rstp } ::= { begemotBridgeStpEntry 15 } begemotBridgeStpTxHoldCount OBJECT-TYPE SYNTAX Integer32 (1..10) MAX-ACCESS read-write STATUS current DESCRIPTION "The value used by the Port Transmit state machine to limit the maximum transmission rate of BPDUs on the bridge interface. The value of this object MUST be retained across reinitializations of the management system." DEFVAL { 3 } ::= { begemotBridgeStpEntry 16 } -- ---------------------------------------------------------- -- -- the Bridge STP ports table -- ---------------------------------------------------------- -- begemotBridgeStpPortTable OBJECT-TYPE SYNTAX SEQUENCE OF BegemotBridgeStpPortEntry MAX-ACCESS not-accessible STATUS current DESCRIPTION "A table containing Spanning Tree Protocol information about the members of each bridge interface." ::= { begemotBridgeStp 2 } begemotBridgeStpPortEntry OBJECT-TYPE SYNTAX BegemotBridgeStpPortEntry MAX-ACCESS not-accessible STATUS current DESCRIPTION "A list of Spanning Tree Protocol information about a specific member of a bridge interface." INDEX { begemotBridgeBaseName, begemotBridgeBasePortIfIndex } ::= { begemotBridgeStpPortTable 1 } BegemotBridgeStpPortEntry ::= SEQUENCE { begemotBridgeStpPort Integer32, begemotBridgeStpPortPriority Integer32, begemotBridgeStpPortState INTEGER, begemotBridgeStpPortEnable INTEGER, begemotBridgeStpPortPathCost Integer32, begemotBridgeStpPortDesignatedRoot BridgeId, begemotBridgeStpPortDesignatedCost Integer32, begemotBridgeStpPortDesignatedBridge BridgeId, begemotBridgeStpPortDesignatedPort BridgePortId, begemotBridgeStpPortForwardTransitions Counter32 } begemotBridgeStpPort OBJECT-TYPE SYNTAX Integer32 (1..65535) MAX-ACCESS read-only STATUS current DESCRIPTION "The system interface index of the interface corresponding to this port, for which the management entity has Spanning Tree Protocol information." ::= { begemotBridgeStpPortEntry 1 } begemotBridgeStpPortPriority OBJECT-TYPE SYNTAX Integer32 (0..255) MAX-ACCESS read-write STATUS current DESCRIPTION "The STP priority of this port that is contained in the first octet of its Port Identifier. The second octet contains the value of begemotBridgeStpPort." ::= { begemotBridgeStpPortEntry 2 } begemotBridgeStpPortState OBJECT-TYPE SYNTAX INTEGER { disabled(1), blocking(2), listening(3), learning(4), forwarding(5), broken(6) } MAX-ACCESS read-only STATUS current DESCRIPTION "The current state of the port as defined by the operation of the Spanning Tree Protocol. If the Spanning Tree Protocol is administratively disabled on the port, this object shall have value disabled(1). A value of broken(6) does not correspond to any legal state of a port, and if present should indicate error in the operation of either the Spanning Tree Protocol implementation running on the device or the management entity." ::= { begemotBridgeStpPortEntry 3 } begemotBridgeStpPortEnable OBJECT-TYPE SYNTAX INTEGER { enabled(1), disabled(2) } MAX-ACCESS read-write STATUS current DESCRIPTION "The administrative Spanning Tree Protocol state of the port - value of enabled(1) indicates that the port is participating in the Spanning Tree Protocol operation." ::= { begemotBridgeStpPortEntry 4 } begemotBridgeStpPortPathCost OBJECT-TYPE SYNTAX Integer32 (1..65535) MAX-ACCESS read-write STATUS current DESCRIPTION "The contribution of the path through this port, when the port is the Root Port, to the total cost of the path to the root bridge for this bridge." ::= { begemotBridgeStpPortEntry 5 } begemotBridgeStpPortDesignatedRoot OBJECT-TYPE SYNTAX BridgeId MAX-ACCESS read-only STATUS current DESCRIPTION "The unique Bridge Identifier of the bridge recorded as the root in the Root Identifier parameter of Configuration BPDUs transmitted by the Designated Bridge for the LAN to which the port is attached." ::= { begemotBridgeStpPortEntry 6 } begemotBridgeStpPortDesignatedCost OBJECT-TYPE SYNTAX Integer32 MAX-ACCESS read-only STATUS current DESCRIPTION "For a Designated port, the path cost (equal to the Root Path Cost of the bridge) offered to the LAN to which the port is attached otherwise the cost of the path to the Root offered by the Designated Port on the LAN to which this Port is attached." ::= { begemotBridgeStpPortEntry 7 } begemotBridgeStpPortDesignatedBridge OBJECT-TYPE SYNTAX BridgeId MAX-ACCESS read-only STATUS current DESCRIPTION "The unique Bridge Identifier of the bridge to which the port belongs, in the case when the port is a designated port, otherwise the bridge believed to be the Designated Bridge for the LAN to which this port is attached." ::= { begemotBridgeStpPortEntry 8 } begemotBridgeStpPortDesignatedPort OBJECT-TYPE SYNTAX BridgePortId MAX-ACCESS read-only STATUS current DESCRIPTION "The Port Identifier of the Bridge port, on the Designated Bridge, through which the Designated Bridge transmits the Configuration Message information stored by this port." ::= { begemotBridgeStpPortEntry 9 } begemotBridgeStpPortForwardTransitions OBJECT-TYPE SYNTAX Counter32 MAX-ACCESS read-only STATUS current DESCRIPTION "The number of times this port has transitioned from the Learning state to the Forwarding state." ::= { begemotBridgeStpPortEntry 10 } -- ---------------------------------------------------------- -- -- the Bridge STP extended ports table -- ---------------------------------------------------------- -- begemotBridgeStpExtPortTable OBJECT-TYPE SYNTAX SEQUENCE OF BegemotBridgeStpExtPortEntry MAX-ACCESS not-accessible STATUS current DESCRIPTION "A table that contains port-specific Rapid Spanning Tree information for the bridge interface members." ::= { begemotBridgeStp 3 } begemotBridgeStpExtPortEntry OBJECT-TYPE SYNTAX BegemotBridgeStpExtPortEntry MAX-ACCESS not-accessible STATUS current DESCRIPTION "A list of Rapid Spanning Tree information maintained by each bridge interface member." AUGMENTS { begemotBridgeStpPortEntry } ::= { begemotBridgeStpExtPortTable 1 } BegemotBridgeStpExtPortEntry ::= SEQUENCE { begemotBridgeStpPortProtocolMigration TruthValue, begemotBridgeStpPortAdminEdgePort TruthValue, begemotBridgeStpPortOperEdgePort TruthValue, begemotBridgeStpPortAdminPointToPoint INTEGER, begemotBridgeStpPortOperPointToPoint TruthValue, begemotBridgeStpPortAdminPathCost Integer32 } begemotBridgeStpPortProtocolMigration OBJECT-TYPE SYNTAX TruthValue MAX-ACCESS read-write STATUS current DESCRIPTION "When operating in RSTP (version 2) mode, writing true(1) to this object forces this port to transmit RSTP BPDUs. Any other operation on this object has no effect and it always returns false(2) when read." ::= { begemotBridgeStpExtPortEntry 1 } begemotBridgeStpPortAdminEdgePort OBJECT-TYPE SYNTAX TruthValue MAX-ACCESS read-write STATUS current DESCRIPTION "The administrative value of the Edge Port parameter. A value of true(1) indicates that this port should be assumed as an edge-port, and a value of false(2) indicates that this port should be assumed as a non-edge-port. Setting this object will also cause the corresponding instance of begemotBridgeStpPortOperEdgePort to change to the same value. Note that even when this object's value is true, the value of the corresponding instance of begemotBridgeStpPortOperEdgePort can be false if a BPDU has been received. The value of this object MUST be retained across reinitializations of the management system." ::= { begemotBridgeStpExtPortEntry 2 } begemotBridgeStpPortOperEdgePort OBJECT-TYPE SYNTAX TruthValue MAX-ACCESS read-only STATUS current DESCRIPTION "The operational value of the Edge Port parameter. The object is initialized to the value of the corresponding instance of begemotBridgeStpPortAdminEdgePort. When the corresponding instance of begemotBridgeStpPortAdminEdgePort is set, this object will be changed as well. This object will also be changed to false on reception of a BPDU." ::= { begemotBridgeStpExtPortEntry 3 } begemotBridgeStpPortAdminPointToPoint OBJECT-TYPE SYNTAX INTEGER { forceTrue(0), forceFalse(1), auto(2) } MAX-ACCESS read-write STATUS current DESCRIPTION "The administrative point-to-point status of the LAN segment attached to this port, using the enumeration values of the IEEE 802.1w clause. A value of forceTrue(0) indicates that this port should always be treated as if it is connected to a point-to-point link. A value of forceFalse(1) indicates that this port should be treated as having a shared media connection. A value of auto(2) indicates that this port is considered to have a point-to-point link if it is an Aggregator and all of its members are aggregatable, or if the MAC entity is configured for full duplex operation, either through auto-negotiation or by management means. Manipulating this object changes the underlying adminPortToPortMAC. The value of this object MUST be retained across reinitializations of the management system." ::= { begemotBridgeStpExtPortEntry 4 } begemotBridgeStpPortOperPointToPoint OBJECT-TYPE SYNTAX TruthValue MAX-ACCESS read-only STATUS current DESCRIPTION "The operational point-to-point status of the LAN segment attached to this port. It indicates whether a port is considered to have a point-to-point connection. If adminPointToPointMAC is set to auto(2), then the value of operPointToPointMAC is determined in accordance with the specific procedures defined for the MAC entity concerned, as defined in IEEE 802.1w, clause 6.5. The value is determined dynamically; that is, it is re-evaluated whenever the value of adminPointToPointMAC changes, and whenever the specific procedures defined for the MAC entity evaluates a change in its point-to-point status." ::= { begemotBridgeStpExtPortEntry 5 } begemotBridgeStpPortAdminPathCost OBJECT-TYPE SYNTAX Integer32 (0..200000000) MAX-ACCESS read-write STATUS current DESCRIPTION "The administratively assigned value for the contribution of this port to the path cost of paths toward the spanning tree root. Writing a value of '0' assigns the automatically calculated default Path Cost value to the port. If the default Path Cost is being used, this object returns '0' when read. This complements the object begemotBridgeStpPortPathCost or begemotBridgeStpPortPathCost32, which returns the operational value of the path cost. The value of this object MUST be retained across reinitializations of the management system." ::= { begemotBridgeStpExtPortEntry 6 } -- ---------------------------------------------------------- -- -- the Bridge interface Transparent bridging table -- ---------------------------------------------------------- -- begemotBridgeTpTable OBJECT-TYPE SYNTAX SEQUENCE OF BegemotBridgeTpEntry MAX-ACCESS not-accessible STATUS current DESCRIPTION "A table that contains information regarding transparent bridging for each bridge interface on the managed device." ::= { begemotBridgeTp 1 } begemotBridgeTpEntry OBJECT-TYPE SYNTAX BegemotBridgeTpEntry MAX-ACCESS not-accessible STATUS current DESCRIPTION "A list of information regarding transparent bridging on a bridge interface." AUGMENTS { begemotBridgeBaseEntry } ::= { begemotBridgeTpTable 1 } BegemotBridgeTpEntry ::= SEQUENCE { begemotBridgeTpLearnedEntryDiscards Counter32, begemotBridgeTpAgingTime Integer32, begemotBridgeTpMaxAddresses Integer32 } begemotBridgeTpLearnedEntryDiscards OBJECT-TYPE SYNTAX Counter32 MAX-ACCESS read-only STATUS current DESCRIPTION "The total number of Forwarding Database entries that would have been learnt, but have been discarded due to Forwarding Address Table having reached it's maximum entries limit." ::= { begemotBridgeTpEntry 1 } begemotBridgeTpAgingTime OBJECT-TYPE SYNTAX Integer32 (10..1000000) UNITS "seconds" MAX-ACCESS read-write STATUS current DESCRIPTION "The timeout period in seconds before aging out dynamically learnt forwarding entries." ::= { begemotBridgeTpEntry 2 } begemotBridgeTpMaxAddresses OBJECT-TYPE SYNTAX Integer32 (1..10000) MAX-ACCESS read-write STATUS current DESCRIPTION "The maximum number of entires that this bridge can - learn in it's Forwarding Address Table and use for + learn in its Forwarding Address Table and use for making forwarding decisions." ::= { begemotBridgeTpEntry 3 } -- ---------------------------------------------------------- -- -- The Forwarding Database for Transparent Bridging interfaces -- ---------------------------------------------------------- -- begemotBridgeTpFdbTable OBJECT-TYPE SYNTAX SEQUENCE OF BegemotBridgeTpFdbEntry MAX-ACCESS not-accessible STATUS current DESCRIPTION "A table that contains information about unicast entries for which the bridge interfaces have forwarding and/or filtering information. This information is used by the bridge interfaces to make forwarding decisions." ::= { begemotBridgeTp 2 } begemotBridgeTpFdbEntry OBJECT-TYPE SYNTAX BegemotBridgeTpFdbEntry MAX-ACCESS not-accessible STATUS current DESCRIPTION "Information about a specific unicast MAC address for which the bridge interface has some forwarding and/or filtering information." INDEX { begemotBridgeBaseName, begemotBridgeTpFdbAddress } ::= { begemotBridgeTpFdbTable 1 } BegemotBridgeTpFdbEntry ::= SEQUENCE { begemotBridgeTpFdbAddress MacAddress, begemotBridgeTpFdbPort Integer32, begemotBridgeTpFdbStatus INTEGER } begemotBridgeTpFdbAddress OBJECT-TYPE SYNTAX MacAddress MAX-ACCESS read-only STATUS current DESCRIPTION "A unicast MAC address for which the bridge has which the bridge interface has some forwarding and/or filtering information." ::= { begemotBridgeTpFdbEntry 1 } begemotBridgeTpFdbPort OBJECT-TYPE SYNTAX Integer32 MAX-ACCESS read-only STATUS current DESCRIPTION "The port number of the bridge port on which a frame having a source address equal to the value of the corresponding instance of begemotBridgeTpFdbAddress has been seen." ::= { begemotBridgeTpFdbEntry 2 } begemotBridgeTpFdbStatus OBJECT-TYPE SYNTAX INTEGER { other(1), invalid(2), learned(3), self(4), mgmt(5) } MAX-ACCESS read-only STATUS current DESCRIPTION "The status of this entry. The meanings of the values are: other(1) - none of the following. invalid(2) - this entry is no longer valid (e.g., it was learned but has since aged out), but has not yet been flushed from the table. learned(3) - the value of the corresponding instance of begemotBridgeTpFdbPort was learned, and is being used. self(4) - the value of the corresponding instance of begemotBridgeTpFdbAddress represents one of the bridge's addresses. The corresponding instance of begemotBridgeTpFdbPort indicates which of the bridge's ports has this address. mgmt(5) - the value of the corresponding instance of begemotBridgeTpFdbAddress has been added to the bridge's Forwarding Database by some management means." ::= { begemotBridgeTpFdbEntry 3 } -- ---------------------------------------------------------- -- -- Ports table for Transparent Bridging interfaces -- ---------------------------------------------------------- -- begemotBridgeTpPortTable OBJECT-TYPE SYNTAX SEQUENCE OF BegemotBridgeTpPortEntry MAX-ACCESS not-accessible STATUS current DESCRIPTION "A table that contains information about every bridge port, member of a bridge interface, associated with the transparent bridging function of the bridge." ::= { begemotBridgeTp 3 } begemotBridgeTpPortEntry OBJECT-TYPE SYNTAX BegemotBridgeTpPortEntry MAX-ACCESS not-accessible STATUS current DESCRIPTION "A list of information about every bridge port, member of a bridge interface, associated with the bridge's transparent bridging function." INDEX { begemotBridgeBaseName, begemotBridgeBasePortIfIndex } ::= { begemotBridgeTpPortTable 1 } BegemotBridgeTpPortEntry ::= SEQUENCE { begemotBridgeTpPort Integer32, begemotBridgeTpPortMaxInfo Integer32, begemotBridgeTpPortInFrames Counter32, begemotBridgeTpPortOutFrames Counter32, begemotBridgeTpPortInDiscards Counter32 } begemotBridgeTpPort OBJECT-TYPE SYNTAX Integer32 (1..65535) MAX-ACCESS read-only STATUS current DESCRIPTION "The system interface index of the port for which this entry contains Transparent bridging management information." ::= { begemotBridgeTpPortEntry 1 } begemotBridgeTpPortMaxInfo OBJECT-TYPE SYNTAX Integer32 UNITS "bytes" MAX-ACCESS read-only STATUS current DESCRIPTION "The maximum size of the INFO (non-MAC) field that this port will receive or transmit." ::= { begemotBridgeTpPortEntry 2 } begemotBridgeTpPortInFrames OBJECT-TYPE SYNTAX Counter32 UNITS "frames" MAX-ACCESS read-only STATUS current DESCRIPTION "The number of frames that have been received by this port from its segment. Note that a frame received on the interface corresponding to this port is only counted by this object if and only if it is for a protocol being processed by the local bridging function, including bridge management frames." ::= { begemotBridgeTpPortEntry 3 } begemotBridgeTpPortOutFrames OBJECT-TYPE SYNTAX Counter32 UNITS "frames" MAX-ACCESS read-only STATUS current DESCRIPTION "The number of frames that have been transmitted by this port to its segment. Note that a frame transmitted on the interface corresponding to this port is only counted by this object if and only if it is for a protocol being processed by the local bridging function, including bridge management frames." ::= { begemotBridgeTpPortEntry 4 } begemotBridgeTpPortInDiscards OBJECT-TYPE SYNTAX Counter32 UNITS "frames" MAX-ACCESS read-only STATUS current DESCRIPTION "Count of received valid frames that were discarded (i.e., filtered) by the Forwarding Process." ::= { begemotBridgeTpPortEntry 5 } -- ---------------------------------------------------------- -- -- the begemotBridgePf objects -- ---------------------------------------------------------- -- begemotBridgePfilStatus OBJECT-TYPE SYNTAX TruthValue MAX-ACCESS read-write STATUS current DESCRIPTION "Indicates whether packet filtering by some firewall package is enabled on the bridge interface." ::= { begemotBridgePf 1 } begemotBridgePfilMembers OBJECT-TYPE SYNTAX TruthValue MAX-ACCESS read-write STATUS current DESCRIPTION "A value of true(1) indicates that packet filtering is enabled on both incoming and outgoing bridge member interfaces." ::= { begemotBridgePf 2 } begemotBridgePfilIpOnly OBJECT-TYPE SYNTAX TruthValue MAX-ACCESS read-write STATUS current DESCRIPTION "This value controls the handling of non-IP packets which are not passed on for further processing to a firewall package. A value of false(0) indicates that all non-IP Ethernet frames are passed unconditionally." ::= { begemotBridgePf 3 } begemotBridgeLayer2PfStatus OBJECT-TYPE SYNTAX INTEGER { enabled(1), disabled(2) } MAX-ACCESS read-write STATUS current DESCRIPTION "This value indicates whether layer2 filtering by a firewall package is enabled for bridge interfaces." ::= { begemotBridgePf 4 } -- ---------------------------------------------------------- -- -- the begemotBridgeConfigObjects objects -- ---------------------------------------------------------- -- begemotBridgeDefaultBridgeIf OBJECT-TYPE SYNTAX BridgeIfNameOrEmpty MAX-ACCESS read-write STATUS current DESCRIPTION "The name of the bridge interface that will be managed via objects in IETF BRIDGE-MIB (RFC4188). If the object's value is set to an empty string, bridge interfaces will only be managed via objects in this MIB module." DEFVAL { "bridge0" } ::= { begemotBridgeConfigObjects 1 } begemotBridgeDataUpdate OBJECT-TYPE SYNTAX Timeout (1..300) UNITS "seconds" MAX-ACCESS read-write STATUS current DESCRIPTION "The maximum age in seconds of the cached data." DEFVAL { 10 } ::= { begemotBridgeConfigObjects 2 } begemotBridgeDataPoll OBJECT-TYPE SYNTAX Timeout (1..3600) UNITS "seconds" MAX-ACCESS read-write STATUS current DESCRIPTION "The polling rate of data when the module is idle." DEFVAL { 300 } ::= { begemotBridgeConfigObjects 3 } -- ---------------------------------------------------------- -- -- Notifications for the Spanning Tree Protocol -- ---------------------------------------------------------- -- begemotBridgeNewRoot NOTIFICATION-TYPE OBJECTS { begemotBridgeBaseName } STATUS current DESCRIPTION "The begemotBridgeNewRoot trap indicates that one of the bridge interfaces on the sending agent's device has become the new root of the spanning tree topology it is participating in." ::= { begemotBridgeNotifications 1 } begemotBridgeTopologyChange NOTIFICATION-TYPE OBJECTS { begemotBridgeBaseName } STATUS current DESCRIPTION "A begemotBridgeTopologyChange trap is send when a member port on one of the bridge interfaces, monitored by the agent, transitions from the Learning state to the Forwarding state, or from the Forwarding state to the Blocking state. The trap is not sent if a begemotBridgeNewRoot trap is sent for the same transition." ::= { begemotBridgeNotifications 2 } END Index: head/usr.sbin/newsyslog/newsyslog.c =================================================================== --- head/usr.sbin/newsyslog/newsyslog.c (revision 308456) +++ head/usr.sbin/newsyslog/newsyslog.c (revision 308457) @@ -1,2681 +1,2681 @@ /*- * ------+---------+---------+-------- + --------+---------+---------+---------* * This file includes significant modifications done by: * Copyright (c) 2003, 2004 - Garance Alistair Drosehn . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * ------+---------+---------+-------- + --------+---------+---------+---------* */ /* * This file contains changes from the Open Software Foundation. */ /* * Copyright 1988, 1989 by the Massachusetts Institute of Technology * * Permission to use, copy, modify, and distribute this software and its * documentation for any purpose and without fee is hereby granted, provided * that the above copyright notice appear in all copies and that both that * copyright notice and this permission notice appear in supporting * documentation, and that the names of M.I.T. and the M.I.T. S.I.P.B. not be * used in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. and the M.I.T. * S.I.P.B. make no representations about the suitability of this software * for any purpose. It is provided "as is" without express or implied * warranty. * */ /* * newsyslog - roll over selected logs at the appropriate time, keeping the a * specified number of backup files around. */ #include __FBSDID("$FreeBSD$"); #define OSF #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pathnames.h" #include "extern.h" /* * Compression suffixes */ #ifndef COMPRESS_SUFFIX_GZ #define COMPRESS_SUFFIX_GZ ".gz" #endif #ifndef COMPRESS_SUFFIX_BZ2 #define COMPRESS_SUFFIX_BZ2 ".bz2" #endif #ifndef COMPRESS_SUFFIX_XZ #define COMPRESS_SUFFIX_XZ ".xz" #endif #define COMPRESS_SUFFIX_MAXLEN MAX(MAX(sizeof(COMPRESS_SUFFIX_GZ),sizeof(COMPRESS_SUFFIX_BZ2)),sizeof(COMPRESS_SUFFIX_XZ)) /* * Compression types */ #define COMPRESS_TYPES 4 /* Number of supported compression types */ #define COMPRESS_NONE 0 #define COMPRESS_GZIP 1 #define COMPRESS_BZIP2 2 #define COMPRESS_XZ 3 /* * Bit-values for the 'flags' parsed from a config-file entry. */ #define CE_BINARY 0x0008 /* Logfile is in binary, do not add status */ /* messages to logfile(s) when rotating. */ #define CE_NOSIGNAL 0x0010 /* There is no process to signal when */ /* trimming this file. */ #define CE_TRIMAT 0x0020 /* trim file at a specific time. */ #define CE_GLOB 0x0040 /* name of the log is file name pattern. */ #define CE_SIGNALGROUP 0x0080 /* Signal a process-group instead of a single */ /* process when trimming this file. */ #define CE_CREATE 0x0100 /* Create the log file if it does not exist. */ #define CE_NODUMP 0x0200 /* Set 'nodump' on newly created log file. */ #define CE_PID2CMD 0x0400 /* Replace PID file with a shell command.*/ #define MIN_PID 5 /* Don't touch pids lower than this */ #define MAX_PID 99999 /* was lower, see /usr/include/sys/proc.h */ #define kbytes(size) (((size) + 1023) >> 10) #define DEFAULT_MARKER "" #define DEBUG_MARKER "" #define INCLUDE_MARKER "" #define DEFAULT_TIMEFNAME_FMT "%Y%m%dT%H%M%S" #define MAX_OLDLOGS 65536 /* Default maximum number of old logfiles */ struct compress_types { const char *flag; /* Flag in configuration file */ const char *suffix; /* Compression suffix */ const char *path; /* Path to compression program */ }; static const struct compress_types compress_type[COMPRESS_TYPES] = { { "", "", "" }, /* no compression */ { "Z", COMPRESS_SUFFIX_GZ, _PATH_GZIP }, /* gzip compression */ { "J", COMPRESS_SUFFIX_BZ2, _PATH_BZIP2 }, /* bzip2 compression */ { "X", COMPRESS_SUFFIX_XZ, _PATH_XZ } /* xz compression */ }; struct conf_entry { STAILQ_ENTRY(conf_entry) cf_nextp; char *log; /* Name of the log */ char *pid_cmd_file; /* PID or command file */ char *r_reason; /* The reason this file is being rotated */ int firstcreate; /* Creating log for the first time (-C). */ int rotate; /* Non-zero if this file should be rotated */ int fsize; /* size found for the log file */ uid_t uid; /* Owner of log */ gid_t gid; /* Group of log */ int numlogs; /* Number of logs to keep */ int trsize; /* Size cutoff to trigger trimming the log */ int hours; /* Hours between log trimming */ struct ptime_data *trim_at; /* Specific time to do trimming */ unsigned int permissions; /* File permissions on the log */ int flags; /* CE_BINARY */ int compress; /* Compression */ int sig; /* Signal to send */ int def_cfg; /* Using the rule for this file */ }; struct sigwork_entry { SLIST_ENTRY(sigwork_entry) sw_nextp; int sw_signum; /* the signal to send */ int sw_pidok; /* true if pid value is valid */ pid_t sw_pid; /* the process id from the PID file */ const char *sw_pidtype; /* "daemon" or "process group" */ int sw_runcmd; /* run command or send PID to signal */ char sw_fname[1]; /* file the PID was read from or shell cmd */ }; struct zipwork_entry { SLIST_ENTRY(zipwork_entry) zw_nextp; const struct conf_entry *zw_conf; /* for chown/perm/flag info */ const struct sigwork_entry *zw_swork; /* to know success of signal */ int zw_fsize; /* size of the file to compress */ char zw_fname[1]; /* the file to compress */ }; struct include_entry { STAILQ_ENTRY(include_entry) inc_nextp; const char *file; /* Name of file to process */ }; struct oldlog_entry { char *fname; /* Filename of the log file */ time_t t; /* Parsed timestamp of the logfile */ }; typedef enum { FREE_ENT, KEEP_ENT } fk_entry; STAILQ_HEAD(cflist, conf_entry); static SLIST_HEAD(swlisthead, sigwork_entry) swhead = SLIST_HEAD_INITIALIZER(swhead); static SLIST_HEAD(zwlisthead, zipwork_entry) zwhead = SLIST_HEAD_INITIALIZER(zwhead); STAILQ_HEAD(ilist, include_entry); int dbg_at_times; /* -D Show details of 'trim_at' code */ static int archtodir = 0; /* Archive old logfiles to other directory */ static int createlogs; /* Create (non-GLOB) logfiles which do not */ /* already exist. 1=='for entries with */ /* C flag', 2=='for all entries'. */ int verbose = 0; /* Print out what's going on */ static int needroot = 1; /* Root privs are necessary */ int noaction = 0; /* Don't do anything, just show it */ static int norotate = 0; /* Don't rotate */ static int nosignal; /* Do not send any signals */ static int enforcepid = 0; /* If PID file does not exist or empty, do nothing */ static int force = 0; /* Force the trim no matter what */ static int rotatereq = 0; /* -R = Always rotate the file(s) as given */ /* on the command (this also requires */ /* that a list of files *are* given on */ /* the run command). */ static char *requestor; /* The name given on a -R request */ static char *timefnamefmt = NULL;/* Use time based filenames instead of .0 */ static char *archdirname; /* Directory path to old logfiles archive */ static char *destdir = NULL; /* Directory to treat at root for logs */ static const char *conf; /* Configuration file to use */ struct ptime_data *dbg_timenow; /* A "timenow" value set via -D option */ static struct ptime_data *timenow; /* The time to use for checking at-fields */ #define DAYTIME_LEN 16 static char daytime[DAYTIME_LEN];/* The current time in human readable form, * used for rotation-tracking messages. */ static char hostname[MAXHOSTNAMELEN]; /* hostname */ static const char *path_syslogpid = _PATH_SYSLOGPID; static struct cflist *get_worklist(char **files); static void parse_file(FILE *cf, struct cflist *work_p, struct cflist *glob_p, struct conf_entry *defconf_p, struct ilist *inclist); static void add_to_queue(const char *fname, struct ilist *inclist); static char *sob(char *p); static char *son(char *p); static int isnumberstr(const char *); static int isglobstr(const char *); static char *missing_field(char *p, char *errline); static void change_attrs(const char *, const struct conf_entry *); static const char *get_logfile_suffix(const char *logfile); static fk_entry do_entry(struct conf_entry *); static fk_entry do_rotate(const struct conf_entry *); static void do_sigwork(struct sigwork_entry *); static void do_zipwork(struct zipwork_entry *); static struct sigwork_entry * save_sigwork(const struct conf_entry *); static struct zipwork_entry * save_zipwork(const struct conf_entry *, const struct sigwork_entry *, int, const char *); static void set_swpid(struct sigwork_entry *, const struct conf_entry *); static int sizefile(const char *); static void expand_globs(struct cflist *work_p, struct cflist *glob_p); static void free_clist(struct cflist *list); static void free_entry(struct conf_entry *ent); static struct conf_entry *init_entry(const char *fname, struct conf_entry *src_entry); static void parse_args(int argc, char **argv); static int parse_doption(const char *doption); static void usage(void); static int log_trim(const char *logname, const struct conf_entry *log_ent); static int age_old_log(const char *file); static void savelog(char *from, char *to); static void createdir(const struct conf_entry *ent, char *dirpart); static void createlog(const struct conf_entry *ent); static int parse_signal(const char *str); /* * All the following take a parameter of 'int', but expect values in the * range of unsigned char. Define wrappers which take values of type 'char', * whether signed or unsigned, and ensure they end up in the right range. */ #define isdigitch(Anychar) isdigit((u_char)(Anychar)) #define isprintch(Anychar) isprint((u_char)(Anychar)) #define isspacech(Anychar) isspace((u_char)(Anychar)) #define tolowerch(Anychar) tolower((u_char)(Anychar)) int main(int argc, char **argv) { struct cflist *worklist; struct conf_entry *p; struct sigwork_entry *stmp; struct zipwork_entry *ztmp; SLIST_INIT(&swhead); SLIST_INIT(&zwhead); parse_args(argc, argv); argc -= optind; argv += optind; if (needroot && getuid() && geteuid()) errx(1, "must have root privs"); worklist = get_worklist(argv); /* * Rotate all the files which need to be rotated. Note that * some users have *hundreds* of entries in newsyslog.conf! */ while (!STAILQ_EMPTY(worklist)) { p = STAILQ_FIRST(worklist); STAILQ_REMOVE_HEAD(worklist, cf_nextp); if (do_entry(p) == FREE_ENT) free_entry(p); } /* * Send signals to any processes which need a signal to tell * them to close and re-open the log file(s) we have rotated. * Note that zipwork_entries include pointers to these * sigwork_entry's, so we can not free the entries here. */ if (!SLIST_EMPTY(&swhead)) { if (noaction || verbose) printf("Signal all daemon process(es)...\n"); SLIST_FOREACH(stmp, &swhead, sw_nextp) do_sigwork(stmp); if (!(rotatereq && nosignal)) { if (noaction) printf("\tsleep 10\n"); else { if (verbose) printf("Pause 10 seconds to allow " "daemon(s) to close log file(s)\n"); sleep(10); } } } /* * Compress all files that we're expected to compress, now * that all processes should have closed the files which * have been rotated. */ if (!SLIST_EMPTY(&zwhead)) { if (noaction || verbose) printf("Compress all rotated log file(s)...\n"); while (!SLIST_EMPTY(&zwhead)) { ztmp = SLIST_FIRST(&zwhead); do_zipwork(ztmp); SLIST_REMOVE_HEAD(&zwhead, zw_nextp); free(ztmp); } } /* Now free all the sigwork entries. */ while (!SLIST_EMPTY(&swhead)) { stmp = SLIST_FIRST(&swhead); SLIST_REMOVE_HEAD(&swhead, sw_nextp); free(stmp); } while (wait(NULL) > 0 || errno == EINTR) ; return (0); } static struct conf_entry * init_entry(const char *fname, struct conf_entry *src_entry) { struct conf_entry *tempwork; if (verbose > 4) printf("\t--> [creating entry for %s]\n", fname); tempwork = malloc(sizeof(struct conf_entry)); if (tempwork == NULL) err(1, "malloc of conf_entry for %s", fname); if (destdir == NULL || fname[0] != '/') tempwork->log = strdup(fname); else asprintf(&tempwork->log, "%s%s", destdir, fname); if (tempwork->log == NULL) err(1, "strdup for %s", fname); if (src_entry != NULL) { tempwork->pid_cmd_file = NULL; if (src_entry->pid_cmd_file) tempwork->pid_cmd_file = strdup(src_entry->pid_cmd_file); tempwork->r_reason = NULL; tempwork->firstcreate = 0; tempwork->rotate = 0; tempwork->fsize = -1; tempwork->uid = src_entry->uid; tempwork->gid = src_entry->gid; tempwork->numlogs = src_entry->numlogs; tempwork->trsize = src_entry->trsize; tempwork->hours = src_entry->hours; tempwork->trim_at = NULL; if (src_entry->trim_at != NULL) tempwork->trim_at = ptime_init(src_entry->trim_at); tempwork->permissions = src_entry->permissions; tempwork->flags = src_entry->flags; tempwork->compress = src_entry->compress; tempwork->sig = src_entry->sig; tempwork->def_cfg = src_entry->def_cfg; } else { /* Initialize as a "do-nothing" entry */ tempwork->pid_cmd_file = NULL; tempwork->r_reason = NULL; tempwork->firstcreate = 0; tempwork->rotate = 0; tempwork->fsize = -1; tempwork->uid = (uid_t)-1; tempwork->gid = (gid_t)-1; tempwork->numlogs = 1; tempwork->trsize = -1; tempwork->hours = -1; tempwork->trim_at = NULL; tempwork->permissions = 0; tempwork->flags = 0; tempwork->compress = COMPRESS_NONE; tempwork->sig = SIGHUP; tempwork->def_cfg = 0; } return (tempwork); } static void free_entry(struct conf_entry *ent) { if (ent == NULL) return; if (ent->log != NULL) { if (verbose > 4) printf("\t--> [freeing entry for %s]\n", ent->log); free(ent->log); ent->log = NULL; } if (ent->pid_cmd_file != NULL) { free(ent->pid_cmd_file); ent->pid_cmd_file = NULL; } if (ent->r_reason != NULL) { free(ent->r_reason); ent->r_reason = NULL; } if (ent->trim_at != NULL) { ptime_free(ent->trim_at); ent->trim_at = NULL; } free(ent); } static void free_clist(struct cflist *list) { struct conf_entry *ent; while (!STAILQ_EMPTY(list)) { ent = STAILQ_FIRST(list); STAILQ_REMOVE_HEAD(list, cf_nextp); free_entry(ent); } free(list); list = NULL; } static fk_entry do_entry(struct conf_entry * ent) { #define REASON_MAX 80 int modtime; fk_entry free_or_keep; double diffsecs; char temp_reason[REASON_MAX]; int oversized; free_or_keep = FREE_ENT; if (verbose) printf("%s <%d%s>: ", ent->log, ent->numlogs, compress_type[ent->compress].flag); ent->fsize = sizefile(ent->log); oversized = ((ent->trsize > 0) && (ent->fsize >= ent->trsize)); modtime = age_old_log(ent->log); ent->rotate = 0; ent->firstcreate = 0; if (ent->fsize < 0) { /* * If either the C flag or the -C option was specified, * and if we won't be creating the file, then have the * verbose message include a hint as to why the file * will not be created. */ temp_reason[0] = '\0'; if (createlogs > 1) ent->firstcreate = 1; else if ((ent->flags & CE_CREATE) && createlogs) ent->firstcreate = 1; else if (ent->flags & CE_CREATE) strlcpy(temp_reason, " (no -C option)", REASON_MAX); else if (createlogs) strlcpy(temp_reason, " (no C flag)", REASON_MAX); if (ent->firstcreate) { if (verbose) printf("does not exist -> will create.\n"); createlog(ent); } else if (verbose) { printf("does not exist, skipped%s.\n", temp_reason); } } else { if (ent->flags & CE_TRIMAT && !force && !rotatereq && !oversized) { diffsecs = ptimeget_diff(timenow, ent->trim_at); if (diffsecs < 0.0) { /* trim_at is some time in the future. */ if (verbose) { ptime_adjust4dst(ent->trim_at, timenow); printf("--> will trim at %s", ptimeget_ctime(ent->trim_at)); } return (free_or_keep); } else if (diffsecs >= 3600.0) { /* * trim_at is more than an hour in the past, * so find the next valid trim_at time, and * tell the user what that will be. */ if (verbose && dbg_at_times) printf("\n\t--> prev trim at %s\t", ptimeget_ctime(ent->trim_at)); if (verbose) { ptimeset_nxtime(ent->trim_at); printf("--> will trim at %s", ptimeget_ctime(ent->trim_at)); } return (free_or_keep); } else if (verbose && noaction && dbg_at_times) { /* * If we are just debugging at-times, then * a detailed message is helpful. Also * skip "doing" any commands, since they * would all be turned off by no-action. */ printf("\n\t--> timematch at %s", ptimeget_ctime(ent->trim_at)); return (free_or_keep); } else if (verbose && ent->hours <= 0) { printf("--> time is up\n"); } } if (verbose && (ent->trsize > 0)) printf("size (Kb): %d [%d] ", ent->fsize, ent->trsize); if (verbose && (ent->hours > 0)) printf(" age (hr): %d [%d] ", modtime, ent->hours); /* * Figure out if this logfile needs to be rotated. */ temp_reason[0] = '\0'; if (rotatereq) { ent->rotate = 1; snprintf(temp_reason, REASON_MAX, " due to -R from %s", requestor); } else if (force) { ent->rotate = 1; snprintf(temp_reason, REASON_MAX, " due to -F request"); } else if (oversized) { ent->rotate = 1; snprintf(temp_reason, REASON_MAX, " due to size>%dK", ent->trsize); } else if (ent->hours <= 0 && (ent->flags & CE_TRIMAT)) { ent->rotate = 1; } else if ((ent->hours > 0) && ((modtime >= ent->hours) || (modtime < 0))) { ent->rotate = 1; } /* * If the file needs to be rotated, then rotate it. */ if (ent->rotate && !norotate) { if (temp_reason[0] != '\0') ent->r_reason = strdup(temp_reason); if (verbose) printf("--> trimming log....\n"); if (noaction && !verbose) printf("%s <%d%s>: trimming\n", ent->log, ent->numlogs, compress_type[ent->compress].flag); free_or_keep = do_rotate(ent); } else { if (verbose) printf("--> skipping\n"); } } return (free_or_keep); #undef REASON_MAX } static void parse_args(int argc, char **argv) { int ch; char *p; timenow = ptime_init(NULL); ptimeset_time(timenow, time(NULL)); strlcpy(daytime, ptimeget_ctime(timenow) + 4, DAYTIME_LEN); /* Let's get our hostname */ (void)gethostname(hostname, sizeof(hostname)); /* Truncate domain */ if ((p = strchr(hostname, '.')) != NULL) *p = '\0'; /* Parse command line options. */ while ((ch = getopt(argc, argv, "a:d:f:nrst:vCD:FNPR:S:")) != -1) switch (ch) { case 'a': archtodir++; archdirname = optarg; break; case 'd': destdir = optarg; break; case 'f': conf = optarg; break; case 'n': noaction++; /* FALLTHROUGH */ case 'r': needroot = 0; break; case 's': nosignal = 1; break; case 't': if (optarg[0] == '\0' || strcmp(optarg, "DEFAULT") == 0) timefnamefmt = strdup(DEFAULT_TIMEFNAME_FMT); else timefnamefmt = strdup(optarg); break; case 'v': verbose++; break; case 'C': /* Useful for things like rc.diskless... */ createlogs++; break; case 'D': /* * Set some debugging option. The specific option * depends on the value of optarg. These options * may come and go without notice or documentation. */ if (parse_doption(optarg)) break; usage(); /* NOTREACHED */ case 'F': force++; break; case 'N': norotate++; break; case 'P': enforcepid++; break; case 'R': rotatereq++; requestor = strdup(optarg); break; case 'S': path_syslogpid = optarg; break; case 'm': /* Used by OpenBSD for "monitor mode" */ default: usage(); /* NOTREACHED */ } if (force && norotate) { warnx("Only one of -F and -N may be specified."); usage(); /* NOTREACHED */ } if (rotatereq) { if (optind == argc) { warnx("At least one filename must be given when -R is specified."); usage(); /* NOTREACHED */ } /* Make sure "requestor" value is safe for a syslog message. */ for (p = requestor; *p != '\0'; p++) { if (!isprintch(*p) && (*p != '\t')) *p = '.'; } } if (dbg_timenow) { /* * Note that the 'daytime' variable is not changed. * That is only used in messages that track when a * logfile is rotated, and if a file *is* rotated, * then it will still rotated at the "real now" time. */ ptime_free(timenow); timenow = dbg_timenow; fprintf(stderr, "Debug: Running as if TimeNow is %s", ptimeget_ctime(dbg_timenow)); } } /* * These debugging options are mainly meant for developer use, such * as writing regression-tests. They would not be needed by users * during normal operation of newsyslog... */ static int parse_doption(const char *doption) { const char TN[] = "TN="; int res; if (strncmp(doption, TN, sizeof(TN) - 1) == 0) { /* * The "TimeNow" debugging option. This might be off * by an hour when crossing a timezone change. */ dbg_timenow = ptime_init(NULL); res = ptime_relparse(dbg_timenow, PTM_PARSE_ISO8601, time(NULL), doption + sizeof(TN) - 1); if (res == -2) { warnx("Non-existent time specified on -D %s", doption); return (0); /* failure */ } else if (res < 0) { warnx("Malformed time given on -D %s", doption); return (0); /* failure */ } return (1); /* successfully parsed */ } if (strcmp(doption, "ats") == 0) { dbg_at_times++; return (1); /* successfully parsed */ } /* XXX - This check could probably be dropped. */ if ((strcmp(doption, "neworder") == 0) || (strcmp(doption, "oldorder") == 0)) { warnx("NOTE: newsyslog always uses 'neworder'."); return (1); /* successfully parsed */ } warnx("Unknown -D (debug) option: '%s'", doption); return (0); /* failure */ } static void usage(void) { fprintf(stderr, "usage: newsyslog [-CFNPnrsv] [-a directory] [-d directory] [-f config_file]\n" " [-S pidfile] [-t timefmt] [[-R tagname] file ...]\n"); exit(1); } /* * Parse a configuration file and return a linked list of all the logs * which should be processed. */ static struct cflist * get_worklist(char **files) { FILE *f; char **given; struct cflist *cmdlist, *filelist, *globlist; struct conf_entry *defconf, *dupent, *ent; struct ilist inclist; struct include_entry *inc; int gmatch, fnres; defconf = NULL; STAILQ_INIT(&inclist); filelist = malloc(sizeof(struct cflist)); if (filelist == NULL) err(1, "malloc of filelist"); STAILQ_INIT(filelist); globlist = malloc(sizeof(struct cflist)); if (globlist == NULL) err(1, "malloc of globlist"); STAILQ_INIT(globlist); inc = malloc(sizeof(struct include_entry)); if (inc == NULL) err(1, "malloc of inc"); inc->file = conf; if (inc->file == NULL) inc->file = _PATH_CONF; STAILQ_INSERT_TAIL(&inclist, inc, inc_nextp); STAILQ_FOREACH(inc, &inclist, inc_nextp) { if (strcmp(inc->file, "-") != 0) f = fopen(inc->file, "r"); else { f = stdin; inc->file = ""; } if (!f) err(1, "%s", inc->file); if (verbose) printf("Processing %s\n", inc->file); parse_file(f, filelist, globlist, defconf, &inclist); (void) fclose(f); } /* * All config-file information has been read in and turned into * a filelist and a globlist. If there were no specific files * given on the run command, then the only thing left to do is to * call a routine which finds all files matched by the globlist * and adds them to the filelist. Then return the worklist. */ if (*files == NULL) { expand_globs(filelist, globlist); free_clist(globlist); if (defconf != NULL) free_entry(defconf); return (filelist); /* NOTREACHED */ } /* * If newsyslog was given a specific list of files to process, * it may be that some of those files were not listed in any * config file. Those unlisted files should get the default * rotation action. First, create the default-rotation action * if none was found in a system config file. */ if (defconf == NULL) { defconf = init_entry(DEFAULT_MARKER, NULL); defconf->numlogs = 3; defconf->trsize = 50; defconf->permissions = S_IRUSR|S_IWUSR; } /* * If newsyslog was run with a list of specific filenames, * then create a new worklist which has only those files in * it, picking up the rotation-rules for those files from * the original filelist. * * XXX - Note that this will copy multiple rules for a single * logfile, if multiple entries are an exact match for * that file. That matches the historic behavior, but do * we want to continue to allow it? If so, it should * probably be handled more intelligently. */ cmdlist = malloc(sizeof(struct cflist)); if (cmdlist == NULL) err(1, "malloc of cmdlist"); STAILQ_INIT(cmdlist); for (given = files; *given; ++given) { /* * First try to find exact-matches for this given file. */ gmatch = 0; STAILQ_FOREACH(ent, filelist, cf_nextp) { if (strcmp(ent->log, *given) == 0) { gmatch++; dupent = init_entry(*given, ent); STAILQ_INSERT_TAIL(cmdlist, dupent, cf_nextp); } } if (gmatch) { if (verbose > 2) printf("\t+ Matched entry %s\n", *given); continue; } /* * There was no exact-match for this given file, so look * for a "glob" entry which does match. */ gmatch = 0; if (verbose > 2 && globlist != NULL) printf("\t+ Checking globs for %s\n", *given); STAILQ_FOREACH(ent, globlist, cf_nextp) { fnres = fnmatch(ent->log, *given, FNM_PATHNAME); if (verbose > 2) printf("\t+ = %d for pattern %s\n", fnres, ent->log); if (fnres == 0) { gmatch++; dupent = init_entry(*given, ent); /* This new entry is not a glob! */ dupent->flags &= ~CE_GLOB; STAILQ_INSERT_TAIL(cmdlist, dupent, cf_nextp); /* Only allow a match to one glob-entry */ break; } } if (gmatch) { if (verbose > 2) printf("\t+ Matched %s via %s\n", *given, ent->log); continue; } /* * This given file was not found in any config file, so * add a worklist item based on the default entry. */ if (verbose > 2) printf("\t+ No entry matched %s (will use %s)\n", *given, DEFAULT_MARKER); dupent = init_entry(*given, defconf); /* Mark that it was *not* found in a config file */ dupent->def_cfg = 1; STAILQ_INSERT_TAIL(cmdlist, dupent, cf_nextp); } /* * Free all the entries in the original work list, the list of * glob entries, and the default entry. */ free_clist(filelist); free_clist(globlist); free_entry(defconf); /* And finally, return a worklist which matches the given files. */ return (cmdlist); } /* * Expand the list of entries with filename patterns, and add all files * which match those glob-entries onto the worklist. */ static void expand_globs(struct cflist *work_p, struct cflist *glob_p) { int gmatch, gres; size_t i; char *mfname; struct conf_entry *dupent, *ent, *globent; glob_t pglob; struct stat st_fm; /* * The worklist contains all fully-specified (non-GLOB) names. * * Now expand the list of filename-pattern (GLOB) entries into * a second list, which (by definition) will only match files * that already exist. Do not add a glob-related entry for any * file which already exists in the fully-specified list. */ STAILQ_FOREACH(globent, glob_p, cf_nextp) { gres = glob(globent->log, GLOB_NOCHECK, NULL, &pglob); if (gres != 0) { warn("cannot expand pattern (%d): %s", gres, globent->log); continue; } if (verbose > 2) printf("\t+ Expanding pattern %s\n", globent->log); for (i = 0; i < pglob.gl_matchc; i++) { mfname = pglob.gl_pathv[i]; /* See if this file already has a specific entry. */ gmatch = 0; STAILQ_FOREACH(ent, work_p, cf_nextp) { if (strcmp(mfname, ent->log) == 0) { gmatch++; break; } } if (gmatch) continue; /* Make sure the named matched is a file. */ gres = lstat(mfname, &st_fm); if (gres != 0) { /* Error on a file that glob() matched?!? */ warn("Skipping %s - lstat() error", mfname); continue; } if (!S_ISREG(st_fm.st_mode)) { /* We only rotate files! */ if (verbose > 2) printf("\t+ . skipping %s (!file)\n", mfname); continue; } if (verbose > 2) printf("\t+ . add file %s\n", mfname); dupent = init_entry(mfname, globent); /* This new entry is not a glob! */ dupent->flags &= ~CE_GLOB; /* Add to the worklist. */ STAILQ_INSERT_TAIL(work_p, dupent, cf_nextp); } globfree(&pglob); if (verbose > 2) printf("\t+ Done with pattern %s\n", globent->log); } } /* * Parse a configuration file and update a linked list of all the logs to * process. */ static void parse_file(FILE *cf, struct cflist *work_p, struct cflist *glob_p, struct conf_entry *defconf_p, struct ilist *inclist) { char line[BUFSIZ], *parse, *q; char *cp, *errline, *group; struct conf_entry *working; struct passwd *pwd; struct group *grp; glob_t pglob; int eol, ptm_opts, res, special; size_t i; errline = NULL; while (fgets(line, BUFSIZ, cf)) { if ((line[0] == '\n') || (line[0] == '#') || (strlen(line) == 0)) continue; if (errline != NULL) free(errline); errline = strdup(line); for (cp = line + 1; *cp != '\0'; cp++) { if (*cp != '#') continue; if (*(cp - 1) == '\\') { strcpy(cp - 1, cp); cp--; continue; } *cp = '\0'; break; } q = parse = missing_field(sob(line), errline); parse = son(line); if (!*parse) errx(1, "malformed line (missing fields):\n%s", errline); *parse = '\0'; /* * Allow people to set debug options via the config file. * (NOTE: debug options are undocumented, and may disappear * at any time, etc). */ if (strcasecmp(DEBUG_MARKER, q) == 0) { q = parse = missing_field(sob(parse + 1), errline); parse = son(parse); if (!*parse) warnx("debug line specifies no option:\n%s", errline); else { *parse = '\0'; parse_doption(q); } continue; } else if (strcasecmp(INCLUDE_MARKER, q) == 0) { if (verbose) printf("Found: %s", errline); q = parse = missing_field(sob(parse + 1), errline); parse = son(parse); if (!*parse) { warnx("include line missing argument:\n%s", errline); continue; } *parse = '\0'; if (isglobstr(q)) { res = glob(q, GLOB_NOCHECK, NULL, &pglob); if (res != 0) { warn("cannot expand pattern (%d): %s", res, q); continue; } if (verbose > 2) printf("\t+ Expanding pattern %s\n", q); for (i = 0; i < pglob.gl_matchc; i++) add_to_queue(pglob.gl_pathv[i], inclist); globfree(&pglob); } else add_to_queue(q, inclist); continue; } special = 0; working = init_entry(q, NULL); if (strcasecmp(DEFAULT_MARKER, q) == 0) { special = 1; if (defconf_p != NULL) { warnx("Ignoring duplicate entry for %s!", q); free_entry(working); continue; } defconf_p = working; } q = parse = missing_field(sob(parse + 1), errline); parse = son(parse); if (!*parse) errx(1, "malformed line (missing fields):\n%s", errline); *parse = '\0'; if ((group = strchr(q, ':')) != NULL || (group = strrchr(q, '.')) != NULL) { *group++ = '\0'; if (*q) { if (!(isnumberstr(q))) { if ((pwd = getpwnam(q)) == NULL) errx(1, "error in config file; unknown user:\n%s", errline); working->uid = pwd->pw_uid; } else working->uid = atoi(q); } else working->uid = (uid_t)-1; q = group; if (*q) { if (!(isnumberstr(q))) { if ((grp = getgrnam(q)) == NULL) errx(1, "error in config file; unknown group:\n%s", errline); working->gid = grp->gr_gid; } else working->gid = atoi(q); } else working->gid = (gid_t)-1; q = parse = missing_field(sob(parse + 1), errline); parse = son(parse); if (!*parse) errx(1, "malformed line (missing fields):\n%s", errline); *parse = '\0'; } else { working->uid = (uid_t)-1; working->gid = (gid_t)-1; } if (!sscanf(q, "%o", &working->permissions)) errx(1, "error in config file; bad permissions:\n%s", errline); q = parse = missing_field(sob(parse + 1), errline); parse = son(parse); if (!*parse) errx(1, "malformed line (missing fields):\n%s", errline); *parse = '\0'; if (!sscanf(q, "%d", &working->numlogs) || working->numlogs < 0) errx(1, "error in config file; bad value for count of logs to save:\n%s", errline); q = parse = missing_field(sob(parse + 1), errline); parse = son(parse); if (!*parse) errx(1, "malformed line (missing fields):\n%s", errline); *parse = '\0'; if (isdigitch(*q)) working->trsize = atoi(q); else if (strcmp(q, "*") == 0) working->trsize = -1; else { warnx("Invalid value of '%s' for 'size' in line:\n%s", q, errline); working->trsize = -1; } working->flags = 0; working->compress = COMPRESS_NONE; q = parse = missing_field(sob(parse + 1), errline); parse = son(parse); eol = !*parse; *parse = '\0'; { char *ep; u_long ul; ul = strtoul(q, &ep, 10); if (ep == q) working->hours = 0; else if (*ep == '*') working->hours = -1; else if (ul > INT_MAX) errx(1, "interval is too large:\n%s", errline); else working->hours = ul; if (*ep == '\0' || strcmp(ep, "*") == 0) goto no_trimat; if (*ep != '@' && *ep != '$') errx(1, "malformed interval/at:\n%s", errline); working->flags |= CE_TRIMAT; working->trim_at = ptime_init(NULL); ptm_opts = PTM_PARSE_ISO8601; if (*ep == '$') ptm_opts = PTM_PARSE_DWM; ptm_opts |= PTM_PARSE_MATCHDOM; res = ptime_relparse(working->trim_at, ptm_opts, ptimeget_secs(timenow), ep + 1); if (res == -2) errx(1, "nonexistent time for 'at' value:\n%s", errline); else if (res < 0) errx(1, "malformed 'at' value:\n%s", errline); } no_trimat: if (eol) q = NULL; else { q = parse = sob(parse + 1); /* Optional field */ parse = son(parse); if (!*parse) eol = 1; *parse = '\0'; } for (; q && *q && !isspacech(*q); q++) { switch (tolowerch(*q)) { case 'b': working->flags |= CE_BINARY; break; case 'c': working->flags |= CE_CREATE; break; case 'd': working->flags |= CE_NODUMP; break; case 'g': working->flags |= CE_GLOB; break; case 'j': working->compress = COMPRESS_BZIP2; break; case 'n': working->flags |= CE_NOSIGNAL; break; case 'r': working->flags |= CE_PID2CMD; break; case 'u': working->flags |= CE_SIGNALGROUP; break; case 'w': /* Depreciated flag - keep for compatibility purposes */ break; case 'x': working->compress = COMPRESS_XZ; break; case 'z': working->compress = COMPRESS_GZIP; break; case '-': break; case 'f': /* Used by OpenBSD for "CE_FOLLOW" */ case 'm': /* Used by OpenBSD for "CE_MONITOR" */ case 'p': /* Used by NetBSD for "CE_PLAIN0" */ default: errx(1, "illegal flag in config file -- %c", *q); } } if (eol) q = NULL; else { q = parse = sob(parse + 1); /* Optional field */ parse = son(parse); if (!*parse) eol = 1; *parse = '\0'; } working->pid_cmd_file = NULL; if (q && *q) { if (*q == '/') working->pid_cmd_file = strdup(q); else if (isalnum(*q)) goto got_sig; else { errx(1, "illegal pid file or signal in config file:\n%s", errline); } } if (eol) q = NULL; else { q = parse = sob(parse + 1); /* Optional field */ *(parse = son(parse)) = '\0'; } working->sig = SIGHUP; if (q && *q) { got_sig: working->sig = parse_signal(q); if (working->sig < 1 || working->sig >= sys_nsig) { errx(1, "illegal signal in config file:\n%s", errline); } } /* * Finish figuring out what pid-file to use (if any) in * later processing if this logfile needs to be rotated. */ if ((working->flags & CE_NOSIGNAL) == CE_NOSIGNAL) { /* * This config-entry specified 'n' for nosignal, * see if it also specified an explicit pid_cmd_file. * This would be a pretty pointless combination. */ if (working->pid_cmd_file != NULL) { warnx("Ignoring '%s' because flag 'n' was specified in line:\n%s", working->pid_cmd_file, errline); free(working->pid_cmd_file); working->pid_cmd_file = NULL; } } else if (working->pid_cmd_file == NULL) { /* * This entry did not specify the 'n' flag, which * means it should signal syslogd unless it had * specified some other pid-file (and obviously the * syslog pid-file will not be for a process-group). * Also, we should only try to notify syslog if we * are root. */ if (working->flags & CE_SIGNALGROUP) { warnx("Ignoring flag 'U' in line:\n%s", errline); working->flags &= ~CE_SIGNALGROUP; } if (needroot) working->pid_cmd_file = strdup(path_syslogpid); } /* * Add this entry to the appropriate list of entries, unless * it was some kind of special entry (eg: ). */ if (special) { ; /* Do not add to any list */ } else if (working->flags & CE_GLOB) { STAILQ_INSERT_TAIL(glob_p, working, cf_nextp); } else { STAILQ_INSERT_TAIL(work_p, working, cf_nextp); } } if (errline != NULL) free(errline); } static char * missing_field(char *p, char *errline) { if (!p || !*p) errx(1, "missing field in config file:\n%s", errline); return (p); } /* * In our sort we return it in the reverse of what qsort normally * would do, as we want the newest files first. If we have two * entries with the same time we don't really care about order. * * Support function for qsort() in delete_oldest_timelog(). */ static int oldlog_entry_compare(const void *a, const void *b) { const struct oldlog_entry *ola = a, *olb = b; if (ola->t > olb->t) return (-1); else if (ola->t < olb->t) return (1); else return (0); } /* * Check whether the file corresponding to dp is an archive of the logfile * logfname, based on the timefnamefmt format string. Return true and fill out * tm if this is the case; otherwise return false. */ static int validate_old_timelog(int fd, const struct dirent *dp, const char *logfname, struct tm *tm) { struct stat sb; size_t logfname_len; char *s; int c; logfname_len = strlen(logfname); if (dp->d_type != DT_REG) { /* * Some filesystems (e.g. NFS) don't fill out the d_type field * and leave it set to DT_UNKNOWN; in this case we must obtain * the file type ourselves. */ if (dp->d_type != DT_UNKNOWN || fstatat(fd, dp->d_name, &sb, AT_SYMLINK_NOFOLLOW) != 0 || !S_ISREG(sb.st_mode)) return (0); } /* Ignore everything but files with our logfile prefix. */ if (strncmp(dp->d_name, logfname, logfname_len) != 0) return (0); /* Ignore the actual non-rotated logfile. */ if (dp->d_namlen == logfname_len) return (0); /* * Make sure we created have found a logfile, so the * postfix is valid, IE format is: '.