Index: user/alc/PQ_LAUNDRY/sbin/pfctl/parse.y
===================================================================
--- user/alc/PQ_LAUNDRY/sbin/pfctl/parse.y	(revision 303666)
+++ user/alc/PQ_LAUNDRY/sbin/pfctl/parse.y	(revision 303667)
@@ -1,6259 +1,6259 @@
/*	$OpenBSD: parse.y,v 1.554 2008/10/17 12:59:53 henning Exp $	*/

/*
 * Copyright (c) 2001 Markus Friedl.  All rights reserved.
 * Copyright (c) 2001 Daniel Hartmeier.  All rights reserved.
 * Copyright (c) 2001 Theo de Raadt.  All rights reserved.
 * Copyright (c) 2002,2003 Henning Brauer.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

%{
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/stat.h>
#ifdef __FreeBSD__
#include <sys/sysctl.h>
#endif
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/icmp6.h>
#include <net/pfvar.h>
#include <arpa/inet.h>
#include <altq/altq.h>
#include <altq/altq_cbq.h>
#include <altq/altq_codel.h>
#include <altq/altq_priq.h>
#include <altq/altq_hfsc.h>
#include <altq/altq_fairq.h>

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <netdb.h>
#include <stdarg.h>
#include <errno.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
#include <err.h>
#include <limits.h>
#include <pwd.h>
#include <grp.h>
#include <md5.h>

#include "pfctl_parser.h"
#include "pfctl.h"

static struct pfctl	*pf = NULL;
static int		 debug = 0;
static int		 rulestate = 0;
static u_int16_t	 returnicmpdefault =
			    (ICMP_UNREACH << 8) | ICMP_UNREACH_PORT;
static u_int16_t	 returnicmp6default =
			    (ICMP6_DST_UNREACH << 8) | ICMP6_DST_UNREACH_NOPORT;
static int		 blockpolicy = PFRULE_DROP;
static int		 require_order = 1;
static int		 default_statelock;

TAILQ_HEAD(files, file)	 files = TAILQ_HEAD_INITIALIZER(files);
static struct file {
	TAILQ_ENTRY(file)	 entry;
	FILE			*stream;
	char			*name;
	int			 lineno;
	int			 errors;
} *file;
struct file	*pushfile(const char *, int);
int		 popfile(void);
int		 check_file_secrecy(int, const char *);
int		 yyparse(void);
int		 yylex(void);
int		 yyerror(const char *, ...);
int		 kw_cmp(const void *, const void *);
int		 lookup(char *);
int		 lgetc(int);
int		 lungetc(int);
int		 findeol(void);

TAILQ_HEAD(symhead, sym)	 symhead = TAILQ_HEAD_INITIALIZER(symhead);
struct sym {
	TAILQ_ENTRY(sym)	 entry;
	int			 used;
	int			 persist;
	char			*nam;
	char			*val;
};
int		 symset(const char *, const char *, int);
char		*symget(const char *);

int		 atoul(char *, u_long *);

enum {
	PFCTL_STATE_NONE,
	PFCTL_STATE_OPTION,
	PFCTL_STATE_SCRUB,
	PFCTL_STATE_QUEUE,
	PFCTL_STATE_NAT,
	PFCTL_STATE_FILTER
};

struct node_proto {
	u_int8_t		 proto;
	struct node_proto	*next;
	struct node_proto	*tail;
};

struct node_port {
	u_int16_t		 port[2];
	u_int8_t		 op;
	struct node_port	*next;
	struct node_port	*tail;
};

struct node_uid {
	uid_t			 uid[2];
	u_int8_t		 op;
	struct node_uid		*next;
	struct node_uid		*tail;
};

struct node_gid {
	gid_t			 gid[2];
	u_int8_t		 op;
	struct node_gid		*next;
	struct node_gid		*tail;
};

struct node_icmp {
	u_int8_t		 code;
	u_int8_t		 type;
	u_int8_t		 proto;
	struct node_icmp	*next;
	struct node_icmp	*tail;
};

enum	{ PF_STATE_OPT_MAX, PF_STATE_OPT_NOSYNC, PF_STATE_OPT_SRCTRACK,
	    PF_STATE_OPT_MAX_SRC_STATES, PF_STATE_OPT_MAX_SRC_CONN,
	    PF_STATE_OPT_MAX_SRC_CONN_RATE, PF_STATE_OPT_MAX_SRC_NODES,
	    PF_STATE_OPT_OVERLOAD, PF_STATE_OPT_STATELOCK,
	    PF_STATE_OPT_TIMEOUT, PF_STATE_OPT_SLOPPY, };

enum	{ PF_SRCTRACK_NONE, PF_SRCTRACK, PF_SRCTRACK_GLOBAL, PF_SRCTRACK_RULE };

struct node_state_opt {
	int			 type;
	union {
		u_int32_t	 max_states;
		u_int32_t	 max_src_states;
		u_int32_t	 max_src_conn;
		struct {
			u_int32_t	limit;
			u_int32_t	seconds;
		}		 max_src_conn_rate;
		struct {
			u_int8_t	flush;
			char		tblname[PF_TABLE_NAME_SIZE];
		}		 overload;
		u_int32_t	 max_src_nodes;
		u_int8_t	 src_track;
		u_int32_t	 statelock;
		struct {
			int		number;
			u_int32_t	seconds;
		}		 timeout;
	}			 data;
	struct node_state_opt	*next;
	struct node_state_opt	*tail;
};

struct peer {
	struct node_host	*host;
	struct node_port	*port;
};

struct node_queue {
	char			 queue[PF_QNAME_SIZE];
	char			 parent[PF_QNAME_SIZE];
	char			 ifname[IFNAMSIZ];
	int			 scheduler;
	struct node_queue	*next;
	struct node_queue	*tail;
}	*queues = NULL;

struct node_qassign {
	char		*qname;
	char		*pqname;
};

struct filter_opts {
	int			 marker;
#define FOM_FLAGS	0x01
#define FOM_ICMP	0x02
#define FOM_TOS		0x04
#define FOM_KEEP	0x08
#define FOM_SRCTRACK	0x10
#define FOM_SETPRIO	0x0400
#define FOM_PRIO	0x2000
	struct node_uid		*uid;
	struct node_gid		*gid;
	struct {
		u_int8_t	 b1;
		u_int8_t	 b2;
		u_int16_t	 w;
		u_int16_t	 w2;
	} flags;
	struct node_icmp	*icmpspec;
	u_int32_t		 tos;
	u_int32_t		 prob;
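	/*
	 * The FOM_* bits in "marker" record which one-shot options have
	 * already been parsed for the rule being built, so duplicates can
	 * be rejected.  An illustrative pf.conf line (example only, not
	 * from this file) that trips such a check:
	 *
	 *	pass in proto tcp to port 22 prio 3 prio 5
	 *	-> "prio cannot be redefined"
	 */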
	struct {
		int			 action;
		struct node_state_opt	*options;
	} keep;
	int			 fragment;
	int			 allowopts;
	char			*label;
	struct node_qassign	 queues;
	char			*tag;
	char			*match_tag;
	u_int8_t		 match_tag_not;
	u_int			 rtableid;
	u_int8_t		 prio;
	u_int8_t		 set_prio[2];
	struct {
		struct node_host	*addr;
		u_int16_t		 port;
	}			 divert;
} filter_opts;

struct antispoof_opts {
	char			*label;
	u_int			 rtableid;
} antispoof_opts;

struct scrub_opts {
	int			 marker;
#define SOM_MINTTL	0x01
#define SOM_MAXMSS	0x02
#define SOM_FRAGCACHE	0x04
#define SOM_SETTOS	0x08
	int			 nodf;
	int			 minttl;
	int			 maxmss;
	int			 settos;
	int			 fragcache;
	int			 randomid;
	int			 reassemble_tcp;
	char			*match_tag;
	u_int8_t		 match_tag_not;
	u_int			 rtableid;
} scrub_opts;

struct queue_opts {
	int			 marker;
#define QOM_BWSPEC	0x01
#define QOM_SCHEDULER	0x02
#define QOM_PRIORITY	0x04
#define QOM_TBRSIZE	0x08
#define QOM_QLIMIT	0x10
	struct node_queue_bw	 queue_bwspec;
	struct node_queue_opt	 scheduler;
	int			 priority;
	int			 tbrsize;
	int			 qlimit;
} queue_opts;

struct table_opts {
	int			 flags;
	int			 init_addr;
	struct node_tinithead	 init_nodes;
} table_opts;

struct pool_opts {
	int			 marker;
#define POM_TYPE		0x01
#define POM_STICKYADDRESS	0x02
	u_int8_t		 opts;
	int			 type;
	int			 staticport;
	struct pf_poolhashkey	*key;
} pool_opts;

struct codel_opts	 codel_opts;
struct node_hfsc_opts	 hfsc_opts;
struct node_fairq_opts	 fairq_opts;
struct node_state_opt	*keep_state_defaults = NULL;

int	 disallow_table(struct node_host *, const char *);
int	 disallow_urpf_failed(struct node_host *, const char *);
int	 disallow_alias(struct node_host *, const char *);
int	 rule_consistent(struct pf_rule *, int);
int	 filter_consistent(struct pf_rule *, int);
int	 nat_consistent(struct pf_rule *);
int	 rdr_consistent(struct pf_rule *);
int	 process_tabledef(char *, struct table_opts *);
void	 expand_label_str(char *, size_t, const char *, const char *);
void	 expand_label_if(const char *, char *, size_t, const char *);
void	 expand_label_addr(const char *, char *, size_t, u_int8_t,
	    struct node_host *);
void	 expand_label_port(const char *, char *, size_t, struct node_port *);
void	 expand_label_proto(const char *, char *, size_t, u_int8_t);
void	 expand_label_nr(const char *, char *, size_t);
void	 expand_label(char *, size_t, const char *, u_int8_t,
	    struct node_host *, struct node_port *, struct node_host *,
	    struct node_port *, u_int8_t);
void	 expand_rule(struct pf_rule *, struct node_if *, struct node_host *,
	    struct node_proto *, struct node_os *, struct node_host *,
	    struct node_port *, struct node_host *, struct node_port *,
	    struct node_uid *, struct node_gid *, struct node_icmp *,
	    const char *);
int	 expand_altq(struct pf_altq *, struct node_if *, struct node_queue *,
	    struct node_queue_bw bwspec, struct node_queue_opt *);
int	 expand_queue(struct pf_altq *, struct node_if *, struct node_queue *,
	    struct node_queue_bw, struct node_queue_opt *);
int	 expand_skip_interface(struct node_if *);

int	 check_rulestate(int);
int	 getservice(char *);
int	 rule_label(struct pf_rule *, char *);
int	 rt_tableid_max(void);

void	 mv_rules(struct pf_ruleset *, struct pf_ruleset *);
void	 decide_address_family(struct node_host *, sa_family_t *);
void	 remove_invalid_hosts(struct node_host **, sa_family_t *);
int	 invalid_redirect(struct node_host *, sa_family_t);
u_int16_t parseicmpspec(char *, sa_family_t);

TAILQ_HEAD(loadanchorshead, loadanchors)
    loadanchorshead = TAILQ_HEAD_INITIALIZER(loadanchorshead);

struct loadanchors {
	TAILQ_ENTRY(loadanchors)	 entries;
	char				*anchorname;
	char				*filename;
};

typedef struct {
	union {
		int64_t			 number;
		double			 probability;
		int			 i;
		char			*string;
		u_int			 rtableid;
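		/*
		 * Each grammar symbol carries its semantic value in exactly
		 * one member of this union; the %type declarations that
		 * follow the %token list bind symbols to members, so e.g.
		 * the "interface" nonterminal yields a struct node_if list.
		 */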
		struct {
			u_int8_t		 b1;
			u_int8_t		 b2;
			u_int16_t		 w;
			u_int16_t		 w2;
		}			 b;
		struct range {
			int			 a;
			int			 b;
			int			 t;
		}			 range;
		struct node_if		*interface;
		struct node_proto	*proto;
		struct node_icmp	*icmp;
		struct node_host	*host;
		struct node_os		*os;
		struct node_port	*port;
		struct node_uid		*uid;
		struct node_gid		*gid;
		struct node_state_opt	*state_opt;
		struct peer		 peer;
		struct {
			struct peer	 src, dst;
			struct node_os	*src_os;
		}			 fromto;
		struct {
			struct node_host	*host;
			u_int8_t		 rt;
			u_int8_t		 pool_opts;
			sa_family_t		 af;
			struct pf_poolhashkey	*key;
		}			 route;
		struct redirection {
			struct node_host	*host;
			struct range		 rport;
		}			*redirection;
		struct {
			int			 action;
			struct node_state_opt	*options;
		}			 keep_state;
		struct {
			u_int8_t	 log;
			u_int8_t	 logif;
			u_int8_t	 quick;
		}			 logquick;
		struct {
			int		 neg;
			char		*name;
		}			 tagged;
		struct pf_poolhashkey	*hashkey;
		struct node_queue	*queue;
		struct node_queue_opt	 queue_options;
		struct node_queue_bw	 queue_bwspec;
		struct node_qassign	 qassign;
		struct filter_opts	 filter_opts;
		struct antispoof_opts	 antispoof_opts;
		struct queue_opts	 queue_opts;
		struct scrub_opts	 scrub_opts;
		struct table_opts	 table_opts;
		struct pool_opts	 pool_opts;
		struct node_hfsc_opts	 hfsc_opts;
		struct node_fairq_opts	 fairq_opts;
		struct codel_opts	 codel_opts;
	} v;
	int lineno;
} YYSTYPE;

#define PPORT_RANGE	1
#define PPORT_STAR	2
int	parseport(char *, struct range *r, int);

#define DYNIF_MULTIADDR(addr) ((addr).type == PF_ADDR_DYNIFTL && \
	(!((addr).iflags & PFI_AFLAG_NOALIAS) ||		 \
	!isdigit((addr).v.ifname[strlen((addr).v.ifname)-1])))

%}

%token	PASS BLOCK SCRUB RETURN IN OS OUT LOG QUICK ON FROM TO FLAGS
%token	RETURNRST RETURNICMP RETURNICMP6 PROTO INET INET6 ALL ANY ICMPTYPE
%token	ICMP6TYPE CODE KEEP MODULATE STATE PORT RDR NAT BINAT ARROW NODF
%token	MINTTL ERROR ALLOWOPTS FASTROUTE FILENAME ROUTETO DUPTO REPLYTO NO LABEL
%token	NOROUTE URPFFAILED FRAGMENT USER GROUP MAXMSS MAXIMUM TTL TOS DROP TABLE
%token	REASSEMBLE FRAGDROP FRAGCROP ANCHOR NATANCHOR RDRANCHOR BINATANCHOR
%token	SET OPTIMIZATION TIMEOUT LIMIT LOGINTERFACE BLOCKPOLICY RANDOMID
%token	REQUIREORDER SYNPROXY FINGERPRINTS NOSYNC DEBUG SKIP HOSTID
%token	ANTISPOOF FOR INCLUDE
%token	BITMASK RANDOM SOURCEHASH ROUNDROBIN STATICPORT PROBABILITY
%token	ALTQ CBQ CODEL PRIQ HFSC FAIRQ BANDWIDTH TBRSIZE LINKSHARE REALTIME
%token	UPPERLIMIT QUEUE PRIORITY QLIMIT HOGS BUCKETS RTABLE TARGET INTERVAL
%token	LOAD RULESET_OPTIMIZATION PRIO
%token	STICKYADDRESS MAXSRCSTATES MAXSRCNODES SOURCETRACK GLOBAL RULE
%token	MAXSRCCONN MAXSRCCONNRATE OVERLOAD FLUSH SLOPPY
%token	TAGGED TAG IFBOUND FLOATING STATEPOLICY STATEDEFAULTS ROUTE SETTOS
%token	DIVERTTO DIVERTREPLY
%token	<v.string>		STRING
%token	<v.number>		NUMBER
%token	<v.i>			PORTBINARY
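/*
 * Value-carrying tokens (STRING, NUMBER, PORTBINARY) and all typed
 * nonterminals are bound to YYSTYPE members here.  List-valued symbols
 * such as host_list keep a tail pointer so productions can append in
 * constant time ($1->tail->next = $3; $1->tail = $3).
 */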
%type	<v.interface>		interface if_list if_item_not if_item
%type	<v.number>		number icmptype icmp6type uid gid
%type	<v.number>		tos not yesno
%type	<v.probability>		probability
%type	<v.i>			no dir af fragcache optimizer
%type	<v.i>			sourcetrack flush unaryop statelock
%type	<v.b>			action nataction natpasslog scrubaction
%type	<v.b>			flags flag blockspec prio
%type	<v.range>		portplain portstar portrange
%type	<v.hashkey>		hashkey
%type	<v.proto>		proto proto_list proto_item
%type	<v.number>		protoval
%type	<v.icmp>		icmpspec
%type	<v.icmp>		icmp_list icmp_item
%type	<v.icmp>		icmp6_list icmp6_item
%type	<v.number>		reticmpspec reticmp6spec
%type	<v.fromto>		fromto
%type	<v.peer>		ipportspec from to
%type	<v.host>		ipspec toipspec xhost host dynaddr host_list
%type	<v.host>		redir_host_list redirspec
%type	<v.host>		route_host route_host_list routespec
%type	<v.os>			os xos os_list
%type	<v.port>		portspec port_list port_item
%type	<v.uid>			uids uid_list uid_item
%type	<v.gid>			gids gid_list gid_item
%type	<v.route>		route
%type	<v.redirection>		redirection redirpool
%type	<v.string>		label stringall tag anchorname
%type	<v.string>		string varstring numberstring
%type	<v.keep_state>		keep
%type	<v.state_opt>		state_opt_spec state_opt_list state_opt_item
%type	<v.logquick>		logquick quick log logopts logopt
%type	<v.interface>		antispoof_ifspc antispoof_iflst antispoof_if
%type	<v.qassign>		qname
%type	<v.queue>		qassign qassign_list qassign_item
%type	<v.queue_options>	scheduler
%type	<v.number>		cbqflags_list cbqflags_item
%type	<v.number>		priqflags_list priqflags_item
%type	<v.hfsc_opts>		hfscopts_list hfscopts_item hfsc_opts
%type	<v.fairq_opts>		fairqopts_list fairqopts_item fairq_opts
%type	<v.codel_opts>		codelopts_list codelopts_item codel_opts
%type	<v.queue_bwspec>	bandwidth
%type	<v.filter_opts>		filter_opts filter_opt filter_opts_l
%type	<v.filter_opts>		filter_sets filter_set filter_sets_l
%type	<v.antispoof_opts>	antispoof_opts antispoof_opt antispoof_opts_l
%type	<v.queue_opts>		queue_opts queue_opt queue_opts_l
%type	<v.scrub_opts>		scrub_opts scrub_opt scrub_opts_l
%type	<v.table_opts>		table_opts table_opt table_opts_l
%type	<v.pool_opts>		pool_opts pool_opt pool_opts_l
%type	<v.tagged>		tagged
%type	<v.rtableid>		rtable
%%

ruleset		: /* empty */
		| ruleset include '\n'
		| ruleset '\n'
		| ruleset option '\n'
		| ruleset scrubrule '\n'
		| ruleset natrule '\n'
		| ruleset binatrule '\n'
		| ruleset pfrule '\n'
		| ruleset anchorrule '\n'
		| ruleset loadrule '\n'
		| ruleset altqif '\n'
		| ruleset queuespec '\n'
		| ruleset varset '\n'
		| ruleset antispoof '\n'
		| ruleset tabledef '\n'
		| '{' fakeanchor '}' '\n';
		| ruleset error '\n'		{ file->errors++; }
		;

include		: INCLUDE STRING		{
			struct file	*nfile;

			if ((nfile = pushfile($2, 0)) == NULL) {
				yyerror("failed to include file %s", $2);
				free($2);
				YYERROR;
			}
			free($2);

			file = nfile;
			lungetc('\n');
		}
		;

/*
 * apply to previously specified rule: must be careful to note
 * what that is: pf or nat or binat or rdr
 */
fakeanchor	: fakeanchor '\n'
		| fakeanchor anchorrule '\n'
		| fakeanchor binatrule '\n'
		| fakeanchor natrule '\n'
		| fakeanchor pfrule '\n'
		| fakeanchor error '\n'
		;

optimizer	: string	{
			if (!strcmp($1, "none"))
				$$ = 0;
			else if (!strcmp($1, "basic"))
				$$ = PF_OPTIMIZE_BASIC;
			else if (!strcmp($1, "profile"))
				$$ = PF_OPTIMIZE_BASIC | PF_OPTIMIZE_PROFILE;
			else {
				yyerror("unknown ruleset-optimization %s", $1);
				YYERROR;
			}
		}
		;

option		: SET OPTIMIZATION STRING		{
			if (check_rulestate(PFCTL_STATE_OPTION)) {
				free($3);
				YYERROR;
			}
			if (pfctl_set_optimization(pf, $3) != 0) {
				yyerror("unknown optimization %s", $3);
				free($3);
				YYERROR;
			}
			free($3);
		}
		| SET RULESET_OPTIMIZATION optimizer {
			if (!(pf->opts & PF_OPT_OPTIMIZE)) {
				pf->opts |= PF_OPT_OPTIMIZE;
				pf->optimize = $3;
			}
		}
		| SET TIMEOUT timeout_spec
		| SET TIMEOUT '{' optnl timeout_list '}'
		| SET LIMIT limit_spec
		| SET LIMIT '{' optnl limit_list '}'
		| SET LOGINTERFACE stringall		{
			if (check_rulestate(PFCTL_STATE_OPTION)) {
				free($3);
				YYERROR;
			}
			if (pfctl_set_logif(pf, $3) != 0) {
				yyerror("error setting loginterface %s", $3);
				free($3);
				YYERROR;
			}
			free($3);
		}
		| SET HOSTID number {
			if ($3 == 0 || $3 > UINT_MAX) {
				yyerror("hostid must be non-zero");
				YYERROR;
			}
			if (pfctl_set_hostid(pf, $3) != 0) {
				yyerror("error setting hostid %08x", $3);
				YYERROR;
			}
		}
		| SET BLOCKPOLICY DROP	{
			if (pf->opts & PF_OPT_VERBOSE)
				printf("set block-policy drop\n");
			if (check_rulestate(PFCTL_STATE_OPTION))
				YYERROR;
			blockpolicy = PFRULE_DROP;
		}
		| SET BLOCKPOLICY RETURN {
			if (pf->opts & PF_OPT_VERBOSE)
				printf("set block-policy return\n");
			if (check_rulestate(PFCTL_STATE_OPTION))
				YYERROR;
			blockpolicy = PFRULE_RETURN;
		}
		| SET REQUIREORDER yesno {
			if (pf->opts & PF_OPT_VERBOSE)
				printf("set require-order %s\n", $3 == 1 ?
"yes" : "no"); require_order = $3; } | SET FINGERPRINTS STRING { if (pf->opts & PF_OPT_VERBOSE) printf("set fingerprints \"%s\"\n", $3); if (check_rulestate(PFCTL_STATE_OPTION)) { free($3); YYERROR; } if (!pf->anchor->name[0]) { if (pfctl_file_fingerprints(pf->dev, pf->opts, $3)) { yyerror("error loading " "fingerprints %s", $3); free($3); YYERROR; } } free($3); } | SET STATEPOLICY statelock { if (pf->opts & PF_OPT_VERBOSE) switch ($3) { case 0: printf("set state-policy floating\n"); break; case PFRULE_IFBOUND: printf("set state-policy if-bound\n"); break; } default_statelock = $3; } | SET DEBUG STRING { if (check_rulestate(PFCTL_STATE_OPTION)) { free($3); YYERROR; } if (pfctl_set_debug(pf, $3) != 0) { yyerror("error setting debuglevel %s", $3); free($3); YYERROR; } free($3); } | SET SKIP interface { if (expand_skip_interface($3) != 0) { yyerror("error setting skip interface(s)"); YYERROR; } } | SET STATEDEFAULTS state_opt_list { if (keep_state_defaults != NULL) { yyerror("cannot redefine state-defaults"); YYERROR; } keep_state_defaults = $3; } ; stringall : STRING { $$ = $1; } | ALL { if (($$ = strdup("all")) == NULL) { err(1, "stringall: strdup"); } } ; string : STRING string { if (asprintf(&$$, "%s %s", $1, $2) == -1) err(1, "string: asprintf"); free($1); free($2); } | STRING ; varstring : numberstring varstring { if (asprintf(&$$, "%s %s", $1, $2) == -1) err(1, "string: asprintf"); free($1); free($2); } | numberstring ; numberstring : NUMBER { char *s; if (asprintf(&s, "%lld", (long long)$1) == -1) { yyerror("string: asprintf"); YYERROR; } $$ = s; } | STRING ; varset : STRING '=' varstring { if (pf->opts & PF_OPT_VERBOSE) printf("%s = \"%s\"\n", $1, $3); if (symset($1, $3, 0) == -1) err(1, "cannot store variable %s", $1); free($1); free($3); } ; anchorname : STRING { $$ = $1; } | /* empty */ { $$ = NULL; } ; pfa_anchorlist : /* empty */ | pfa_anchorlist '\n' | pfa_anchorlist pfrule '\n' | pfa_anchorlist anchorrule '\n' ; pfa_anchor : '{' { char ta[PF_ANCHOR_NAME_SIZE]; struct pf_ruleset *rs; /* steping into a brace anchor */ pf->asd++; pf->bn++; pf->brace = 1; /* create a holding ruleset in the root */ snprintf(ta, PF_ANCHOR_NAME_SIZE, "_%d", pf->bn); rs = pf_find_or_create_ruleset(ta); if (rs == NULL) err(1, "pfa_anchor: pf_find_or_create_ruleset"); pf->astack[pf->asd] = rs->anchor; pf->anchor = rs->anchor; } '\n' pfa_anchorlist '}' { pf->alast = pf->anchor; pf->asd--; pf->anchor = pf->astack[pf->asd]; } | /* empty */ ; anchorrule : ANCHOR anchorname dir quick interface af proto fromto filter_opts pfa_anchor { struct pf_rule r; struct node_proto *proto; if (check_rulestate(PFCTL_STATE_FILTER)) { if ($2) free($2); YYERROR; } if ($2 && ($2[0] == '_' || strstr($2, "/_") != NULL)) { free($2); yyerror("anchor names beginning with '_' " "are reserved for internal use"); YYERROR; } memset(&r, 0, sizeof(r)); if (pf->astack[pf->asd + 1]) { /* move inline rules into relative location */ pf_anchor_setup(&r, &pf->astack[pf->asd]->ruleset, $2 ? 
$2 : pf->alast->name); if (r.anchor == NULL) err(1, "anchorrule: unable to " "create ruleset"); if (pf->alast != r.anchor) { if (r.anchor->match) { yyerror("inline anchor '%s' " "already exists", r.anchor->name); YYERROR; } mv_rules(&pf->alast->ruleset, &r.anchor->ruleset); } pf_remove_if_empty_ruleset(&pf->alast->ruleset); pf->alast = r.anchor; } else { if (!$2) { yyerror("anchors without explicit " "rules must specify a name"); YYERROR; } } r.direction = $3; r.quick = $4.quick; r.af = $6; r.prob = $9.prob; r.rtableid = $9.rtableid; if ($9.tag) if (strlcpy(r.tagname, $9.tag, PF_TAG_NAME_SIZE) >= PF_TAG_NAME_SIZE) { yyerror("tag too long, max %u chars", PF_TAG_NAME_SIZE - 1); YYERROR; } if ($9.match_tag) if (strlcpy(r.match_tagname, $9.match_tag, PF_TAG_NAME_SIZE) >= PF_TAG_NAME_SIZE) { yyerror("tag too long, max %u chars", PF_TAG_NAME_SIZE - 1); YYERROR; } r.match_tag_not = $9.match_tag_not; if (rule_label(&r, $9.label)) YYERROR; free($9.label); r.flags = $9.flags.b1; r.flagset = $9.flags.b2; if (($9.flags.b1 & $9.flags.b2) != $9.flags.b1) { yyerror("flags always false"); YYERROR; } if ($9.flags.b1 || $9.flags.b2 || $8.src_os) { for (proto = $7; proto != NULL && proto->proto != IPPROTO_TCP; proto = proto->next) ; /* nothing */ if (proto == NULL && $7 != NULL) { if ($9.flags.b1 || $9.flags.b2) yyerror( "flags only apply to tcp"); if ($8.src_os) yyerror( "OS fingerprinting only " "applies to tcp"); YYERROR; } } r.tos = $9.tos; if ($9.keep.action) { yyerror("cannot specify state handling " "on anchors"); YYERROR; } if ($9.match_tag) if (strlcpy(r.match_tagname, $9.match_tag, PF_TAG_NAME_SIZE) >= PF_TAG_NAME_SIZE) { yyerror("tag too long, max %u chars", PF_TAG_NAME_SIZE - 1); YYERROR; } r.match_tag_not = $9.match_tag_not; if ($9.marker & FOM_PRIO) { if ($9.prio == 0) r.prio = PF_PRIO_ZERO; else r.prio = $9.prio; } if ($9.marker & FOM_SETPRIO) { r.set_prio[0] = $9.set_prio[0]; r.set_prio[1] = $9.set_prio[1]; r.scrub_flags |= PFSTATE_SETPRIO; } decide_address_family($8.src.host, &r.af); decide_address_family($8.dst.host, &r.af); expand_rule(&r, $5, NULL, $7, $8.src_os, $8.src.host, $8.src.port, $8.dst.host, $8.dst.port, $9.uid, $9.gid, $9.icmpspec, pf->astack[pf->asd + 1] ? 
pf->alast->name : $2); free($2); pf->astack[pf->asd + 1] = NULL; } | NATANCHOR string interface af proto fromto rtable { struct pf_rule r; if (check_rulestate(PFCTL_STATE_NAT)) { free($2); YYERROR; } memset(&r, 0, sizeof(r)); r.action = PF_NAT; r.af = $4; r.rtableid = $7; decide_address_family($6.src.host, &r.af); decide_address_family($6.dst.host, &r.af); expand_rule(&r, $3, NULL, $5, $6.src_os, $6.src.host, $6.src.port, $6.dst.host, $6.dst.port, 0, 0, 0, $2); free($2); } | RDRANCHOR string interface af proto fromto rtable { struct pf_rule r; if (check_rulestate(PFCTL_STATE_NAT)) { free($2); YYERROR; } memset(&r, 0, sizeof(r)); r.action = PF_RDR; r.af = $4; r.rtableid = $7; decide_address_family($6.src.host, &r.af); decide_address_family($6.dst.host, &r.af); if ($6.src.port != NULL) { yyerror("source port parameter not supported" " in rdr-anchor"); YYERROR; } if ($6.dst.port != NULL) { if ($6.dst.port->next != NULL) { yyerror("destination port list " "expansion not supported in " "rdr-anchor"); YYERROR; } else if ($6.dst.port->op != PF_OP_EQ) { yyerror("destination port operators" " not supported in rdr-anchor"); YYERROR; } r.dst.port[0] = $6.dst.port->port[0]; r.dst.port[1] = $6.dst.port->port[1]; r.dst.port_op = $6.dst.port->op; } expand_rule(&r, $3, NULL, $5, $6.src_os, $6.src.host, $6.src.port, $6.dst.host, $6.dst.port, 0, 0, 0, $2); free($2); } | BINATANCHOR string interface af proto fromto rtable { struct pf_rule r; if (check_rulestate(PFCTL_STATE_NAT)) { free($2); YYERROR; } memset(&r, 0, sizeof(r)); r.action = PF_BINAT; r.af = $4; r.rtableid = $7; if ($5 != NULL) { if ($5->next != NULL) { yyerror("proto list expansion" " not supported in binat-anchor"); YYERROR; } r.proto = $5->proto; free($5); } if ($6.src.host != NULL || $6.src.port != NULL || $6.dst.host != NULL || $6.dst.port != NULL) { yyerror("fromto parameter not supported" " in binat-anchor"); YYERROR; } decide_address_family($6.src.host, &r.af); decide_address_family($6.dst.host, &r.af); pfctl_add_rule(pf, &r, $2); free($2); } ; loadrule : LOAD ANCHOR string FROM string { struct loadanchors *loadanchor; if (strlen(pf->anchor->name) + 1 + strlen($3) >= MAXPATHLEN) { yyerror("anchorname %s too long, max %u\n", $3, MAXPATHLEN - 1); free($3); YYERROR; } loadanchor = calloc(1, sizeof(struct loadanchors)); if (loadanchor == NULL) err(1, "loadrule: calloc"); if ((loadanchor->anchorname = malloc(MAXPATHLEN)) == NULL) err(1, "loadrule: malloc"); if (pf->anchor->name[0]) snprintf(loadanchor->anchorname, MAXPATHLEN, "%s/%s", pf->anchor->name, $3); else strlcpy(loadanchor->anchorname, $3, MAXPATHLEN); if ((loadanchor->filename = strdup($5)) == NULL) err(1, "loadrule: strdup"); TAILQ_INSERT_TAIL(&loadanchorshead, loadanchor, entries); free($3); free($5); }; scrubaction : no SCRUB { $$.b2 = $$.w = 0; if ($1) $$.b1 = PF_NOSCRUB; else $$.b1 = PF_SCRUB; } ; scrubrule : scrubaction dir logquick interface af proto fromto scrub_opts { struct pf_rule r; if (check_rulestate(PFCTL_STATE_SCRUB)) YYERROR; memset(&r, 0, sizeof(r)); r.action = $1.b1; r.direction = $2; r.log = $3.log; r.logif = $3.logif; if ($3.quick) { yyerror("scrub rules do not support 'quick'"); YYERROR; } r.af = $5; if ($8.nodf) r.rule_flag |= PFRULE_NODF; if ($8.randomid) r.rule_flag |= PFRULE_RANDOMID; if ($8.reassemble_tcp) { if (r.direction != PF_INOUT) { yyerror("reassemble tcp rules can not " "specify direction"); YYERROR; } r.rule_flag |= PFRULE_REASSEMBLE_TCP; } if ($8.minttl) r.min_ttl = $8.minttl; if ($8.maxmss) r.max_mss = $8.maxmss; if ($8.marker & SOM_SETTOS) { 
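			/*
			 * Illustrative pf.conf scrub rule exercising the
			 * options handled here (example only, not from
			 * this file):
			 *
			 *	scrub in on em0 all fragment reassemble
			 *	    min-ttl 15 max-mss 1440 random-id
			 */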
r.rule_flag |= PFRULE_SET_TOS; r.set_tos = $8.settos; } if ($8.fragcache) r.rule_flag |= $8.fragcache; if ($8.match_tag) if (strlcpy(r.match_tagname, $8.match_tag, PF_TAG_NAME_SIZE) >= PF_TAG_NAME_SIZE) { yyerror("tag too long, max %u chars", PF_TAG_NAME_SIZE - 1); YYERROR; } r.match_tag_not = $8.match_tag_not; r.rtableid = $8.rtableid; expand_rule(&r, $4, NULL, $6, $7.src_os, $7.src.host, $7.src.port, $7.dst.host, $7.dst.port, NULL, NULL, NULL, ""); } ; scrub_opts : { bzero(&scrub_opts, sizeof scrub_opts); scrub_opts.rtableid = -1; } scrub_opts_l { $$ = scrub_opts; } | /* empty */ { bzero(&scrub_opts, sizeof scrub_opts); scrub_opts.rtableid = -1; $$ = scrub_opts; } ; scrub_opts_l : scrub_opts_l scrub_opt | scrub_opt ; scrub_opt : NODF { if (scrub_opts.nodf) { yyerror("no-df cannot be respecified"); YYERROR; } scrub_opts.nodf = 1; } | MINTTL NUMBER { if (scrub_opts.marker & SOM_MINTTL) { yyerror("min-ttl cannot be respecified"); YYERROR; } if ($2 < 0 || $2 > 255) { yyerror("illegal min-ttl value %d", $2); YYERROR; } scrub_opts.marker |= SOM_MINTTL; scrub_opts.minttl = $2; } | MAXMSS NUMBER { if (scrub_opts.marker & SOM_MAXMSS) { yyerror("max-mss cannot be respecified"); YYERROR; } if ($2 < 0 || $2 > 65535) { yyerror("illegal max-mss value %d", $2); YYERROR; } scrub_opts.marker |= SOM_MAXMSS; scrub_opts.maxmss = $2; } | SETTOS tos { if (scrub_opts.marker & SOM_SETTOS) { yyerror("set-tos cannot be respecified"); YYERROR; } scrub_opts.marker |= SOM_SETTOS; scrub_opts.settos = $2; } | fragcache { if (scrub_opts.marker & SOM_FRAGCACHE) { yyerror("fragcache cannot be respecified"); YYERROR; } scrub_opts.marker |= SOM_FRAGCACHE; scrub_opts.fragcache = $1; } | REASSEMBLE STRING { if (strcasecmp($2, "tcp") != 0) { yyerror("scrub reassemble supports only tcp, " "not '%s'", $2); free($2); YYERROR; } free($2); if (scrub_opts.reassemble_tcp) { yyerror("reassemble tcp cannot be respecified"); YYERROR; } scrub_opts.reassemble_tcp = 1; } | RANDOMID { if (scrub_opts.randomid) { yyerror("random-id cannot be respecified"); YYERROR; } scrub_opts.randomid = 1; } | RTABLE NUMBER { if ($2 < 0 || $2 > rt_tableid_max()) { yyerror("invalid rtable id"); YYERROR; } scrub_opts.rtableid = $2; } | not TAGGED string { scrub_opts.match_tag = $3; scrub_opts.match_tag_not = $1; } ; fragcache : FRAGMENT REASSEMBLE { $$ = 0; /* default */ } | FRAGMENT FRAGCROP { $$ = 0; } | FRAGMENT FRAGDROP { $$ = 0; } ; antispoof : ANTISPOOF logquick antispoof_ifspc af antispoof_opts { struct pf_rule r; struct node_host *h = NULL, *hh; struct node_if *i, *j; if (check_rulestate(PFCTL_STATE_FILTER)) YYERROR; for (i = $3; i; i = i->next) { bzero(&r, sizeof(r)); r.action = PF_DROP; r.direction = PF_IN; r.log = $2.log; r.logif = $2.logif; r.quick = $2.quick; r.af = $4; if (rule_label(&r, $5.label)) YYERROR; r.rtableid = $5.rtableid; j = calloc(1, sizeof(struct node_if)); if (j == NULL) err(1, "antispoof: calloc"); if (strlcpy(j->ifname, i->ifname, sizeof(j->ifname)) >= sizeof(j->ifname)) { free(j); yyerror("interface name too long"); YYERROR; } j->not = 1; if (i->dynamic) { h = calloc(1, sizeof(*h)); if (h == NULL) err(1, "address: calloc"); h->addr.type = PF_ADDR_DYNIFTL; set_ipmask(h, 128); if (strlcpy(h->addr.v.ifname, i->ifname, sizeof(h->addr.v.ifname)) >= sizeof(h->addr.v.ifname)) { free(h); yyerror( "interface name too long"); YYERROR; } hh = malloc(sizeof(*hh)); if (hh == NULL) err(1, "address: malloc"); bcopy(h, hh, sizeof(*hh)); h->addr.iflags = PFI_AFLAG_NETWORK; } else { h = ifa_lookup(j->ifname, PFI_AFLAG_NETWORK); hh = NULL; } if 
(h != NULL) expand_rule(&r, j, NULL, NULL, NULL, h, NULL, NULL, NULL, NULL, NULL, NULL, ""); if ((i->ifa_flags & IFF_LOOPBACK) == 0) { bzero(&r, sizeof(r)); r.action = PF_DROP; r.direction = PF_IN; r.log = $2.log; r.logif = $2.logif; r.quick = $2.quick; r.af = $4; if (rule_label(&r, $5.label)) YYERROR; r.rtableid = $5.rtableid; if (hh != NULL) h = hh; else h = ifa_lookup(i->ifname, 0); if (h != NULL) expand_rule(&r, NULL, NULL, NULL, NULL, h, NULL, NULL, NULL, NULL, NULL, NULL, ""); } else free(hh); } free($5.label); } ; antispoof_ifspc : FOR antispoof_if { $$ = $2; } | FOR '{' optnl antispoof_iflst '}' { $$ = $4; } ; antispoof_iflst : antispoof_if optnl { $$ = $1; } | antispoof_iflst comma antispoof_if optnl { $1->tail->next = $3; $1->tail = $3; $$ = $1; } ; antispoof_if : if_item { $$ = $1; } | '(' if_item ')' { $2->dynamic = 1; $$ = $2; } ; antispoof_opts : { bzero(&antispoof_opts, sizeof antispoof_opts); antispoof_opts.rtableid = -1; } antispoof_opts_l { $$ = antispoof_opts; } | /* empty */ { bzero(&antispoof_opts, sizeof antispoof_opts); antispoof_opts.rtableid = -1; $$ = antispoof_opts; } ; antispoof_opts_l : antispoof_opts_l antispoof_opt | antispoof_opt ; antispoof_opt : label { if (antispoof_opts.label) { yyerror("label cannot be redefined"); YYERROR; } antispoof_opts.label = $1; } | RTABLE NUMBER { if ($2 < 0 || $2 > rt_tableid_max()) { yyerror("invalid rtable id"); YYERROR; } antispoof_opts.rtableid = $2; } ; not : '!' { $$ = 1; } | /* empty */ { $$ = 0; } ; tabledef : TABLE '<' STRING '>' table_opts { struct node_host *h, *nh; struct node_tinit *ti, *nti; if (strlen($3) >= PF_TABLE_NAME_SIZE) { yyerror("table name too long, max %d chars", PF_TABLE_NAME_SIZE - 1); free($3); YYERROR; } if (pf->loadopt & PFCTL_FLAG_TABLE) if (process_tabledef($3, &$5)) { free($3); YYERROR; } free($3); for (ti = SIMPLEQ_FIRST(&$5.init_nodes); ti != SIMPLEQ_END(&$5.init_nodes); ti = nti) { if (ti->file) free(ti->file); for (h = ti->host; h != NULL; h = nh) { nh = h->next; free(h); } nti = SIMPLEQ_NEXT(ti, entries); free(ti); } } ; table_opts : { bzero(&table_opts, sizeof table_opts); SIMPLEQ_INIT(&table_opts.init_nodes); } table_opts_l { $$ = table_opts; } | /* empty */ { bzero(&table_opts, sizeof table_opts); SIMPLEQ_INIT(&table_opts.init_nodes); $$ = table_opts; } ; table_opts_l : table_opts_l table_opt | table_opt ; table_opt : STRING { if (!strcmp($1, "const")) table_opts.flags |= PFR_TFLAG_CONST; else if (!strcmp($1, "persist")) table_opts.flags |= PFR_TFLAG_PERSIST; else if (!strcmp($1, "counters")) table_opts.flags |= PFR_TFLAG_COUNTERS; else { yyerror("invalid table option '%s'", $1); free($1); YYERROR; } free($1); } | '{' optnl '}' { table_opts.init_addr = 1; } | '{' optnl host_list '}' { struct node_host *n; struct node_tinit *ti; for (n = $3; n != NULL; n = n->next) { switch (n->addr.type) { case PF_ADDR_ADDRMASK: continue; /* ok */ case PF_ADDR_RANGE: yyerror("address ranges are not " "permitted inside tables"); break; case PF_ADDR_DYNIFTL: yyerror("dynamic addresses are not " "permitted inside tables"); break; case PF_ADDR_TABLE: yyerror("tables cannot contain tables"); break; case PF_ADDR_NOROUTE: yyerror("\"no-route\" is not permitted " "inside tables"); break; case PF_ADDR_URPFFAILED: yyerror("\"urpf-failed\" is not " "permitted inside tables"); break; default: yyerror("unknown address type %d", n->addr.type); } YYERROR; } if (!(ti = calloc(1, sizeof(*ti)))) err(1, "table_opt: calloc"); ti->host = $3; SIMPLEQ_INSERT_TAIL(&table_opts.init_nodes, ti, entries); table_opts.init_addr = 1; 
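			/*
			 * Only plain address/mask entries reach this point;
			 * ranges, dynamic (ifname) addresses and nested
			 * tables were rejected above.  Illustrative pf.conf
			 * (example only, table name made up):
			 *
			 *	table <goodguys> persist counters \
			 *	    { 192.0.2.0/24, !192.0.2.13 }
			 */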
} | FILENAME STRING { struct node_tinit *ti; if (!(ti = calloc(1, sizeof(*ti)))) err(1, "table_opt: calloc"); ti->file = $2; SIMPLEQ_INSERT_TAIL(&table_opts.init_nodes, ti, entries); table_opts.init_addr = 1; } ; altqif : ALTQ interface queue_opts QUEUE qassign { struct pf_altq a; if (check_rulestate(PFCTL_STATE_QUEUE)) YYERROR; memset(&a, 0, sizeof(a)); if ($3.scheduler.qtype == ALTQT_NONE) { yyerror("no scheduler specified!"); YYERROR; } a.scheduler = $3.scheduler.qtype; a.qlimit = $3.qlimit; a.tbrsize = $3.tbrsize; if ($5 == NULL && $3.scheduler.qtype != ALTQT_CODEL) { yyerror("no child queues specified"); YYERROR; } if (expand_altq(&a, $2, $5, $3.queue_bwspec, &$3.scheduler)) YYERROR; } ; queuespec : QUEUE STRING interface queue_opts qassign { struct pf_altq a; if (check_rulestate(PFCTL_STATE_QUEUE)) { free($2); YYERROR; } memset(&a, 0, sizeof(a)); if (strlcpy(a.qname, $2, sizeof(a.qname)) >= sizeof(a.qname)) { yyerror("queue name too long (max " "%d chars)", PF_QNAME_SIZE-1); free($2); YYERROR; } free($2); if ($4.tbrsize) { yyerror("cannot specify tbrsize for queue"); YYERROR; } if ($4.priority > 255) { yyerror("priority out of range: max 255"); YYERROR; } a.priority = $4.priority; a.qlimit = $4.qlimit; a.scheduler = $4.scheduler.qtype; if (expand_queue(&a, $3, $5, $4.queue_bwspec, &$4.scheduler)) { yyerror("errors in queue definition"); YYERROR; } } ; queue_opts : { bzero(&queue_opts, sizeof queue_opts); queue_opts.priority = DEFAULT_PRIORITY; queue_opts.qlimit = DEFAULT_QLIMIT; queue_opts.scheduler.qtype = ALTQT_NONE; queue_opts.queue_bwspec.bw_percent = 100; } queue_opts_l { $$ = queue_opts; } | /* empty */ { bzero(&queue_opts, sizeof queue_opts); queue_opts.priority = DEFAULT_PRIORITY; queue_opts.qlimit = DEFAULT_QLIMIT; queue_opts.scheduler.qtype = ALTQT_NONE; queue_opts.queue_bwspec.bw_percent = 100; $$ = queue_opts; } ; queue_opts_l : queue_opts_l queue_opt | queue_opt ; queue_opt : BANDWIDTH bandwidth { if (queue_opts.marker & QOM_BWSPEC) { yyerror("bandwidth cannot be respecified"); YYERROR; } queue_opts.marker |= QOM_BWSPEC; queue_opts.queue_bwspec = $2; } | PRIORITY NUMBER { if (queue_opts.marker & QOM_PRIORITY) { yyerror("priority cannot be respecified"); YYERROR; } if ($2 < 0 || $2 > 255) { yyerror("priority out of range: max 255"); YYERROR; } queue_opts.marker |= QOM_PRIORITY; queue_opts.priority = $2; } | QLIMIT NUMBER { if (queue_opts.marker & QOM_QLIMIT) { yyerror("qlimit cannot be respecified"); YYERROR; } if ($2 < 0 || $2 > 65535) { yyerror("qlimit out of range: max 65535"); YYERROR; } queue_opts.marker |= QOM_QLIMIT; queue_opts.qlimit = $2; } | scheduler { if (queue_opts.marker & QOM_SCHEDULER) { yyerror("scheduler cannot be respecified"); YYERROR; } queue_opts.marker |= QOM_SCHEDULER; queue_opts.scheduler = $1; } | TBRSIZE NUMBER { if (queue_opts.marker & QOM_TBRSIZE) { yyerror("tbrsize cannot be respecified"); YYERROR; } if ($2 < 0 || $2 > 65535) { yyerror("tbrsize too big: max 65535"); YYERROR; } queue_opts.marker |= QOM_TBRSIZE; queue_opts.tbrsize = $2; } ; bandwidth : STRING { double bps; char *cp; $$.bw_percent = 0; bps = strtod($1, &cp); if (cp != NULL) { if (strlen(cp) > 1) { char *cu = cp + 1; if (!strcmp(cu, "Bit") || !strcmp(cu, "B") || !strcmp(cu, "bit") || !strcmp(cu, "b")) { *cu = 0; } } if (!strcmp(cp, "b")) ; /* nothing */ else if (!strcmp(cp, "K")) bps *= 1000; else if (!strcmp(cp, "M")) bps *= 1000 * 1000; else if (!strcmp(cp, "G")) bps *= 1000 * 1000 * 1000; else if (!strcmp(cp, "%")) { if (bps < 0 || bps > 100) { yyerror("bandwidth spec " "out 
of range"); free($1); YYERROR; } $$.bw_percent = bps; bps = 0; } else { yyerror("unknown unit %s", cp); free($1); YYERROR; } } free($1); $$.bw_absolute = (u_int32_t)bps; } | NUMBER { if ($1 < 0 || $1 > UINT_MAX) { yyerror("bandwidth number too big"); YYERROR; } $$.bw_percent = 0; $$.bw_absolute = $1; } ; scheduler : CBQ { $$.qtype = ALTQT_CBQ; $$.data.cbq_opts.flags = 0; } | CBQ '(' cbqflags_list ')' { $$.qtype = ALTQT_CBQ; $$.data.cbq_opts.flags = $3; } | PRIQ { $$.qtype = ALTQT_PRIQ; $$.data.priq_opts.flags = 0; } | PRIQ '(' priqflags_list ')' { $$.qtype = ALTQT_PRIQ; $$.data.priq_opts.flags = $3; } | HFSC { $$.qtype = ALTQT_HFSC; bzero(&$$.data.hfsc_opts, sizeof(struct node_hfsc_opts)); } | HFSC '(' hfsc_opts ')' { $$.qtype = ALTQT_HFSC; $$.data.hfsc_opts = $3; } | FAIRQ { $$.qtype = ALTQT_FAIRQ; bzero(&$$.data.fairq_opts, sizeof(struct node_fairq_opts)); } | FAIRQ '(' fairq_opts ')' { $$.qtype = ALTQT_FAIRQ; $$.data.fairq_opts = $3; } | CODEL { $$.qtype = ALTQT_CODEL; bzero(&$$.data.codel_opts, sizeof(struct codel_opts)); } | CODEL '(' codel_opts ')' { $$.qtype = ALTQT_CODEL; $$.data.codel_opts = $3; } ; cbqflags_list : cbqflags_item { $$ |= $1; } | cbqflags_list comma cbqflags_item { $$ |= $3; } ; cbqflags_item : STRING { if (!strcmp($1, "default")) $$ = CBQCLF_DEFCLASS; else if (!strcmp($1, "borrow")) $$ = CBQCLF_BORROW; else if (!strcmp($1, "red")) $$ = CBQCLF_RED; else if (!strcmp($1, "ecn")) $$ = CBQCLF_RED|CBQCLF_ECN; else if (!strcmp($1, "rio")) $$ = CBQCLF_RIO; else if (!strcmp($1, "codel")) $$ = CBQCLF_CODEL; else { yyerror("unknown cbq flag \"%s\"", $1); free($1); YYERROR; } free($1); } ; priqflags_list : priqflags_item { $$ |= $1; } | priqflags_list comma priqflags_item { $$ |= $3; } ; priqflags_item : STRING { if (!strcmp($1, "default")) $$ = PRCF_DEFAULTCLASS; else if (!strcmp($1, "red")) $$ = PRCF_RED; else if (!strcmp($1, "ecn")) $$ = PRCF_RED|PRCF_ECN; else if (!strcmp($1, "rio")) $$ = PRCF_RIO; else if (!strcmp($1, "codel")) $$ = PRCF_CODEL; else { yyerror("unknown priq flag \"%s\"", $1); free($1); YYERROR; } free($1); } ; hfsc_opts : { bzero(&hfsc_opts, sizeof(struct node_hfsc_opts)); } hfscopts_list { $$ = hfsc_opts; } ; hfscopts_list : hfscopts_item | hfscopts_list comma hfscopts_item ; hfscopts_item : LINKSHARE bandwidth { if (hfsc_opts.linkshare.used) { yyerror("linkshare already specified"); YYERROR; } hfsc_opts.linkshare.m2 = $2; hfsc_opts.linkshare.used = 1; } | LINKSHARE '(' bandwidth comma NUMBER comma bandwidth ')' { if ($5 < 0 || $5 > INT_MAX) { yyerror("timing in curve out of range"); YYERROR; } if (hfsc_opts.linkshare.used) { yyerror("linkshare already specified"); YYERROR; } hfsc_opts.linkshare.m1 = $3; hfsc_opts.linkshare.d = $5; hfsc_opts.linkshare.m2 = $7; hfsc_opts.linkshare.used = 1; } | REALTIME bandwidth { if (hfsc_opts.realtime.used) { yyerror("realtime already specified"); YYERROR; } hfsc_opts.realtime.m2 = $2; hfsc_opts.realtime.used = 1; } | REALTIME '(' bandwidth comma NUMBER comma bandwidth ')' { if ($5 < 0 || $5 > INT_MAX) { yyerror("timing in curve out of range"); YYERROR; } if (hfsc_opts.realtime.used) { yyerror("realtime already specified"); YYERROR; } hfsc_opts.realtime.m1 = $3; hfsc_opts.realtime.d = $5; hfsc_opts.realtime.m2 = $7; hfsc_opts.realtime.used = 1; } | UPPERLIMIT bandwidth { if (hfsc_opts.upperlimit.used) { yyerror("upperlimit already specified"); YYERROR; } hfsc_opts.upperlimit.m2 = $2; hfsc_opts.upperlimit.used = 1; } | UPPERLIMIT '(' bandwidth comma NUMBER comma bandwidth ')' { if ($5 < 0 || $5 > INT_MAX) { 
yyerror("timing in curve out of range"); YYERROR; } if (hfsc_opts.upperlimit.used) { yyerror("upperlimit already specified"); YYERROR; } hfsc_opts.upperlimit.m1 = $3; hfsc_opts.upperlimit.d = $5; hfsc_opts.upperlimit.m2 = $7; hfsc_opts.upperlimit.used = 1; } | STRING { if (!strcmp($1, "default")) hfsc_opts.flags |= HFCF_DEFAULTCLASS; else if (!strcmp($1, "red")) hfsc_opts.flags |= HFCF_RED; else if (!strcmp($1, "ecn")) hfsc_opts.flags |= HFCF_RED|HFCF_ECN; else if (!strcmp($1, "rio")) hfsc_opts.flags |= HFCF_RIO; else if (!strcmp($1, "codel")) hfsc_opts.flags |= HFCF_CODEL; else { yyerror("unknown hfsc flag \"%s\"", $1); free($1); YYERROR; } free($1); } ; fairq_opts : { bzero(&fairq_opts, sizeof(struct node_fairq_opts)); } fairqopts_list { $$ = fairq_opts; } ; fairqopts_list : fairqopts_item | fairqopts_list comma fairqopts_item ; fairqopts_item : LINKSHARE bandwidth { if (fairq_opts.linkshare.used) { yyerror("linkshare already specified"); YYERROR; } fairq_opts.linkshare.m2 = $2; fairq_opts.linkshare.used = 1; } | LINKSHARE '(' bandwidth number bandwidth ')' { if (fairq_opts.linkshare.used) { yyerror("linkshare already specified"); YYERROR; } fairq_opts.linkshare.m1 = $3; fairq_opts.linkshare.d = $4; fairq_opts.linkshare.m2 = $5; fairq_opts.linkshare.used = 1; } | HOGS bandwidth { fairq_opts.hogs_bw = $2; } | BUCKETS number { fairq_opts.nbuckets = $2; } | STRING { if (!strcmp($1, "default")) fairq_opts.flags |= FARF_DEFAULTCLASS; else if (!strcmp($1, "red")) fairq_opts.flags |= FARF_RED; else if (!strcmp($1, "ecn")) fairq_opts.flags |= FARF_RED|FARF_ECN; else if (!strcmp($1, "rio")) fairq_opts.flags |= FARF_RIO; else if (!strcmp($1, "codel")) fairq_opts.flags |= FARF_CODEL; else { yyerror("unknown fairq flag \"%s\"", $1); free($1); YYERROR; } free($1); } ; codel_opts : { bzero(&codel_opts, sizeof(struct codel_opts)); } codelopts_list { $$ = codel_opts; } ; codelopts_list : codelopts_item | codelopts_list comma codelopts_item ; codelopts_item : INTERVAL number { if (codel_opts.interval) { yyerror("interval already specified"); YYERROR; } codel_opts.interval = $2; } | TARGET number { if (codel_opts.target) { yyerror("target already specified"); YYERROR; } codel_opts.target = $2; } | STRING { if (!strcmp($1, "ecn")) codel_opts.ecn = 1; else { yyerror("unknown codel option \"%s\"", $1); free($1); YYERROR; } free($1); } ; qassign : /* empty */ { $$ = NULL; } | qassign_item { $$ = $1; } | '{' optnl qassign_list '}' { $$ = $3; } ; qassign_list : qassign_item optnl { $$ = $1; } | qassign_list comma qassign_item optnl { $1->tail->next = $3; $1->tail = $3; $$ = $1; } ; qassign_item : STRING { $$ = calloc(1, sizeof(struct node_queue)); if ($$ == NULL) err(1, "qassign_item: calloc"); if (strlcpy($$->queue, $1, sizeof($$->queue)) >= sizeof($$->queue)) { yyerror("queue name '%s' too long (max " "%d chars)", $1, sizeof($$->queue)-1); free($1); free($$); YYERROR; } free($1); $$->next = NULL; $$->tail = $$; } ; pfrule : action dir logquick interface route af proto fromto filter_opts { struct pf_rule r; struct node_state_opt *o; struct node_proto *proto; int srctrack = 0; int statelock = 0; int adaptive = 0; int defaults = 0; if (check_rulestate(PFCTL_STATE_FILTER)) YYERROR; memset(&r, 0, sizeof(r)); r.action = $1.b1; switch ($1.b2) { case PFRULE_RETURNRST: r.rule_flag |= PFRULE_RETURNRST; r.return_ttl = $1.w; break; case PFRULE_RETURNICMP: r.rule_flag |= PFRULE_RETURNICMP; r.return_icmp = $1.w; r.return_icmp6 = $1.w2; break; case PFRULE_RETURN: r.rule_flag |= PFRULE_RETURN; r.return_icmp = $1.w; 
r.return_icmp6 = $1.w2; break; } r.direction = $2; r.log = $3.log; r.logif = $3.logif; r.quick = $3.quick; r.prob = $9.prob; r.rtableid = $9.rtableid; if ($9.marker & FOM_PRIO) { if ($9.prio == 0) r.prio = PF_PRIO_ZERO; else r.prio = $9.prio; } if ($9.marker & FOM_SETPRIO) { r.set_prio[0] = $9.set_prio[0]; r.set_prio[1] = $9.set_prio[1]; r.scrub_flags |= PFSTATE_SETPRIO; } r.af = $6; if ($9.tag) if (strlcpy(r.tagname, $9.tag, PF_TAG_NAME_SIZE) >= PF_TAG_NAME_SIZE) { yyerror("tag too long, max %u chars", PF_TAG_NAME_SIZE - 1); YYERROR; } if ($9.match_tag) if (strlcpy(r.match_tagname, $9.match_tag, PF_TAG_NAME_SIZE) >= PF_TAG_NAME_SIZE) { yyerror("tag too long, max %u chars", PF_TAG_NAME_SIZE - 1); YYERROR; } r.match_tag_not = $9.match_tag_not; if (rule_label(&r, $9.label)) YYERROR; free($9.label); r.flags = $9.flags.b1; r.flagset = $9.flags.b2; if (($9.flags.b1 & $9.flags.b2) != $9.flags.b1) { yyerror("flags always false"); YYERROR; } if ($9.flags.b1 || $9.flags.b2 || $8.src_os) { for (proto = $7; proto != NULL && proto->proto != IPPROTO_TCP; proto = proto->next) ; /* nothing */ if (proto == NULL && $7 != NULL) { if ($9.flags.b1 || $9.flags.b2) yyerror( "flags only apply to tcp"); if ($8.src_os) yyerror( "OS fingerprinting only " "apply to tcp"); YYERROR; } #if 0 if (($9.flags.b1 & parse_flags("S")) == 0 && $8.src_os) { yyerror("OS fingerprinting requires " "the SYN TCP flag (flags S/SA)"); YYERROR; } #endif } r.tos = $9.tos; r.keep_state = $9.keep.action; o = $9.keep.options; /* 'keep state' by default on pass rules. */ if (!r.keep_state && !r.action && !($9.marker & FOM_KEEP)) { r.keep_state = PF_STATE_NORMAL; o = keep_state_defaults; defaults = 1; } while (o) { struct node_state_opt *p = o; switch (o->type) { case PF_STATE_OPT_MAX: if (r.max_states) { yyerror("state option 'max' " "multiple definitions"); YYERROR; } r.max_states = o->data.max_states; break; case PF_STATE_OPT_NOSYNC: if (r.rule_flag & PFRULE_NOSYNC) { yyerror("state option 'sync' " "multiple definitions"); YYERROR; } r.rule_flag |= PFRULE_NOSYNC; break; case PF_STATE_OPT_SRCTRACK: if (srctrack) { yyerror("state option " "'source-track' " "multiple definitions"); YYERROR; } srctrack = o->data.src_track; r.rule_flag |= PFRULE_SRCTRACK; break; case PF_STATE_OPT_MAX_SRC_STATES: if (r.max_src_states) { yyerror("state option " "'max-src-states' " "multiple definitions"); YYERROR; } if (o->data.max_src_states == 0) { yyerror("'max-src-states' must " "be > 0"); YYERROR; } r.max_src_states = o->data.max_src_states; r.rule_flag |= PFRULE_SRCTRACK; break; case PF_STATE_OPT_OVERLOAD: if (r.overload_tblname[0]) { yyerror("multiple 'overload' " "table definitions"); YYERROR; } if (strlcpy(r.overload_tblname, o->data.overload.tblname, PF_TABLE_NAME_SIZE) >= PF_TABLE_NAME_SIZE) { yyerror("state option: " "strlcpy"); YYERROR; } r.flush = o->data.overload.flush; break; case PF_STATE_OPT_MAX_SRC_CONN: if (r.max_src_conn) { yyerror("state option " "'max-src-conn' " "multiple definitions"); YYERROR; } if (o->data.max_src_conn == 0) { yyerror("'max-src-conn' " "must be > 0"); YYERROR; } r.max_src_conn = o->data.max_src_conn; r.rule_flag |= PFRULE_SRCTRACK | PFRULE_RULESRCTRACK; break; case PF_STATE_OPT_MAX_SRC_CONN_RATE: if (r.max_src_conn_rate.limit) { yyerror("state option " "'max-src-conn-rate' " "multiple definitions"); YYERROR; } if (!o->data.max_src_conn_rate.limit || !o->data.max_src_conn_rate.seconds) { yyerror("'max-src-conn-rate' " "values must be > 0"); YYERROR; } if (o->data.max_src_conn_rate.limit > PF_THRESHOLD_MAX) { 
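				/*
				 * Illustrative pf.conf use of these state
				 * options (example only, table name made up):
				 *
				 *	pass in proto tcp to port ssh \
				 *	    keep state (max-src-conn 10, \
				 *	    max-src-conn-rate 3/60, \
				 *	    overload <bruteforce> flush global)
				 */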
yyerror("'max-src-conn-rate' " "maximum rate must be < %u", PF_THRESHOLD_MAX); YYERROR; } r.max_src_conn_rate.limit = o->data.max_src_conn_rate.limit; r.max_src_conn_rate.seconds = o->data.max_src_conn_rate.seconds; r.rule_flag |= PFRULE_SRCTRACK | PFRULE_RULESRCTRACK; break; case PF_STATE_OPT_MAX_SRC_NODES: if (r.max_src_nodes) { yyerror("state option " "'max-src-nodes' " "multiple definitions"); YYERROR; } if (o->data.max_src_nodes == 0) { yyerror("'max-src-nodes' must " "be > 0"); YYERROR; } r.max_src_nodes = o->data.max_src_nodes; r.rule_flag |= PFRULE_SRCTRACK | PFRULE_RULESRCTRACK; break; case PF_STATE_OPT_STATELOCK: if (statelock) { yyerror("state locking option: " "multiple definitions"); YYERROR; } statelock = 1; r.rule_flag |= o->data.statelock; break; case PF_STATE_OPT_SLOPPY: if (r.rule_flag & PFRULE_STATESLOPPY) { yyerror("state sloppy option: " "multiple definitions"); YYERROR; } r.rule_flag |= PFRULE_STATESLOPPY; break; case PF_STATE_OPT_TIMEOUT: if (o->data.timeout.number == PFTM_ADAPTIVE_START || o->data.timeout.number == PFTM_ADAPTIVE_END) adaptive = 1; if (r.timeout[o->data.timeout.number]) { yyerror("state timeout %s " "multiple definitions", pf_timeouts[o->data. timeout.number].name); YYERROR; } r.timeout[o->data.timeout.number] = o->data.timeout.seconds; } o = o->next; if (!defaults) free(p); } /* 'flags S/SA' by default on stateful rules */ if (!r.action && !r.flags && !r.flagset && !$9.fragment && !($9.marker & FOM_FLAGS) && r.keep_state) { r.flags = parse_flags("S"); r.flagset = parse_flags("SA"); } if (!adaptive && r.max_states) { r.timeout[PFTM_ADAPTIVE_START] = (r.max_states / 10) * 6; r.timeout[PFTM_ADAPTIVE_END] = (r.max_states / 10) * 12; } if (r.rule_flag & PFRULE_SRCTRACK) { if (srctrack == PF_SRCTRACK_GLOBAL && r.max_src_nodes) { yyerror("'max-src-nodes' is " "incompatible with " "'source-track global'"); YYERROR; } if (srctrack == PF_SRCTRACK_GLOBAL && r.max_src_conn) { yyerror("'max-src-conn' is " "incompatible with " "'source-track global'"); YYERROR; } if (srctrack == PF_SRCTRACK_GLOBAL && r.max_src_conn_rate.seconds) { yyerror("'max-src-conn-rate' is " "incompatible with " "'source-track global'"); YYERROR; } if (r.timeout[PFTM_SRC_NODE] < r.max_src_conn_rate.seconds) r.timeout[PFTM_SRC_NODE] = r.max_src_conn_rate.seconds; r.rule_flag |= PFRULE_SRCTRACK; if (srctrack == PF_SRCTRACK_RULE) r.rule_flag |= PFRULE_RULESRCTRACK; } if (r.keep_state && !statelock) r.rule_flag |= default_statelock; if ($9.fragment) r.rule_flag |= PFRULE_FRAGMENT; r.allow_opts = $9.allowopts; decide_address_family($8.src.host, &r.af); decide_address_family($8.dst.host, &r.af); if ($5.rt) { if (!r.direction) { yyerror("direction must be explicit " "with rules that specify routing"); YYERROR; } r.rt = $5.rt; r.rpool.opts = $5.pool_opts; if ($5.key != NULL) memcpy(&r.rpool.key, $5.key, sizeof(struct pf_poolhashkey)); } if (r.rt && r.rt != PF_FASTROUTE) { decide_address_family($5.host, &r.af); remove_invalid_hosts(&$5.host, &r.af); if ($5.host == NULL) { yyerror("no routing address with " "matching address family found."); YYERROR; } if ((r.rpool.opts & PF_POOL_TYPEMASK) == PF_POOL_NONE && ($5.host->next != NULL || $5.host->addr.type == PF_ADDR_TABLE || DYNIF_MULTIADDR($5.host->addr))) r.rpool.opts |= PF_POOL_ROUNDROBIN; if ((r.rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN && disallow_table($5.host, "tables are only " "supported in round-robin routing pools")) YYERROR; if ((r.rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN && disallow_alias($5.host, "interface (%s) " 
"is only supported in round-robin " "routing pools")) YYERROR; if ($5.host->next != NULL) { if ((r.rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN) { yyerror("r.rpool.opts must " "be PF_POOL_ROUNDROBIN"); YYERROR; } } } if ($9.queues.qname != NULL) { if (strlcpy(r.qname, $9.queues.qname, sizeof(r.qname)) >= sizeof(r.qname)) { yyerror("rule qname too long (max " "%d chars)", sizeof(r.qname)-1); YYERROR; } free($9.queues.qname); } if ($9.queues.pqname != NULL) { if (strlcpy(r.pqname, $9.queues.pqname, sizeof(r.pqname)) >= sizeof(r.pqname)) { yyerror("rule pqname too long (max " "%d chars)", sizeof(r.pqname)-1); YYERROR; } free($9.queues.pqname); } #ifdef __FreeBSD__ r.divert.port = $9.divert.port; #else if ((r.divert.port = $9.divert.port)) { if (r.direction == PF_OUT) { if ($9.divert.addr) { yyerror("address specified " "for outgoing divert"); YYERROR; } bzero(&r.divert.addr, sizeof(r.divert.addr)); } else { if (!$9.divert.addr) { yyerror("no address specified " "for incoming divert"); YYERROR; } if ($9.divert.addr->af != r.af) { yyerror("address family " "mismatch for divert"); YYERROR; } r.divert.addr = $9.divert.addr->addr.v.a.addr; } } #endif expand_rule(&r, $4, $5.host, $7, $8.src_os, $8.src.host, $8.src.port, $8.dst.host, $8.dst.port, $9.uid, $9.gid, $9.icmpspec, ""); } ; filter_opts : { bzero(&filter_opts, sizeof filter_opts); filter_opts.rtableid = -1; } filter_opts_l { $$ = filter_opts; } | /* empty */ { bzero(&filter_opts, sizeof filter_opts); filter_opts.rtableid = -1; $$ = filter_opts; } ; filter_opts_l : filter_opts_l filter_opt | filter_opt ; filter_opt : USER uids { if (filter_opts.uid) $2->tail->next = filter_opts.uid; filter_opts.uid = $2; } | GROUP gids { if (filter_opts.gid) $2->tail->next = filter_opts.gid; filter_opts.gid = $2; } | flags { if (filter_opts.marker & FOM_FLAGS) { yyerror("flags cannot be redefined"); YYERROR; } filter_opts.marker |= FOM_FLAGS; filter_opts.flags.b1 |= $1.b1; filter_opts.flags.b2 |= $1.b2; filter_opts.flags.w |= $1.w; filter_opts.flags.w2 |= $1.w2; } | icmpspec { if (filter_opts.marker & FOM_ICMP) { yyerror("icmp-type cannot be redefined"); YYERROR; } filter_opts.marker |= FOM_ICMP; filter_opts.icmpspec = $1; } | PRIO NUMBER { if (filter_opts.marker & FOM_PRIO) { yyerror("prio cannot be redefined"); YYERROR; } if ($2 < 0 || $2 > PF_PRIO_MAX) { yyerror("prio must be 0 - %u", PF_PRIO_MAX); YYERROR; } filter_opts.marker |= FOM_PRIO; filter_opts.prio = $2; } | TOS tos { if (filter_opts.marker & FOM_TOS) { yyerror("tos cannot be redefined"); YYERROR; } filter_opts.marker |= FOM_TOS; filter_opts.tos = $2; } | keep { if (filter_opts.marker & FOM_KEEP) { yyerror("modulate or keep cannot be redefined"); YYERROR; } filter_opts.marker |= FOM_KEEP; filter_opts.keep.action = $1.action; filter_opts.keep.options = $1.options; } | FRAGMENT { filter_opts.fragment = 1; } | ALLOWOPTS { filter_opts.allowopts = 1; } | label { if (filter_opts.label) { yyerror("label cannot be redefined"); YYERROR; } filter_opts.label = $1; } | qname { if (filter_opts.queues.qname) { yyerror("queue cannot be redefined"); YYERROR; } filter_opts.queues = $1; } | TAG string { filter_opts.tag = $2; } | not TAGGED string { filter_opts.match_tag = $3; filter_opts.match_tag_not = $1; } | PROBABILITY probability { double p; p = floor($2 * UINT_MAX + 0.5); if (p < 0.0 || p > UINT_MAX) { yyerror("invalid probability: %lf", p); YYERROR; } filter_opts.prob = (u_int32_t)p; if (filter_opts.prob == 0) filter_opts.prob = 1; } | RTABLE NUMBER { if ($2 < 0 || $2 > rt_tableid_max()) { 
yyerror("invalid rtable id"); YYERROR; } filter_opts.rtableid = $2; } | DIVERTTO portplain { #ifdef __FreeBSD__ filter_opts.divert.port = $2.a; if (!filter_opts.divert.port) { yyerror("invalid divert port: %u", ntohs($2.a)); YYERROR; } #endif } | DIVERTTO STRING PORT portplain { #ifndef __FreeBSD__ if ((filter_opts.divert.addr = host($2)) == NULL) { yyerror("could not parse divert address: %s", $2); free($2); YYERROR; } #else if ($2) #endif free($2); filter_opts.divert.port = $4.a; if (!filter_opts.divert.port) { yyerror("invalid divert port: %u", ntohs($4.a)); YYERROR; } } | DIVERTREPLY { #ifdef __FreeBSD__ yyerror("divert-reply has no meaning in FreeBSD pf(4)"); YYERROR; #else filter_opts.divert.port = 1; /* some random value */ #endif } | filter_sets ; filter_sets : SET '(' filter_sets_l ')' { $$ = filter_opts; } | SET filter_set { $$ = filter_opts; } ; filter_sets_l : filter_sets_l comma filter_set | filter_set ; filter_set : prio { if (filter_opts.marker & FOM_SETPRIO) { yyerror("prio cannot be redefined"); YYERROR; } filter_opts.marker |= FOM_SETPRIO; filter_opts.set_prio[0] = $1.b1; filter_opts.set_prio[1] = $1.b2; } prio : PRIO NUMBER { if ($2 < 0 || $2 > PF_PRIO_MAX) { yyerror("prio must be 0 - %u", PF_PRIO_MAX); YYERROR; } $$.b1 = $$.b2 = $2; } | PRIO '(' NUMBER comma NUMBER ')' { if ($3 < 0 || $3 > PF_PRIO_MAX || $5 < 0 || $5 > PF_PRIO_MAX) { yyerror("prio must be 0 - %u", PF_PRIO_MAX); YYERROR; } $$.b1 = $3; $$.b2 = $5; } ; probability : STRING { char *e; double p = strtod($1, &e); if (*e == '%') { p *= 0.01; e++; } if (*e) { yyerror("invalid probability: %s", $1); free($1); YYERROR; } free($1); $$ = p; } | NUMBER { $$ = (double)$1; } ; action : PASS { $$.b1 = PF_PASS; $$.b2 = $$.w = 0; } | BLOCK blockspec { $$ = $2; $$.b1 = PF_DROP; } ; blockspec : /* empty */ { $$.b2 = blockpolicy; $$.w = returnicmpdefault; $$.w2 = returnicmp6default; } | DROP { $$.b2 = PFRULE_DROP; $$.w = 0; $$.w2 = 0; } | RETURNRST { $$.b2 = PFRULE_RETURNRST; $$.w = 0; $$.w2 = 0; } | RETURNRST '(' TTL NUMBER ')' { if ($4 < 0 || $4 > 255) { yyerror("illegal ttl value %d", $4); YYERROR; } $$.b2 = PFRULE_RETURNRST; $$.w = $4; $$.w2 = 0; } | RETURNICMP { $$.b2 = PFRULE_RETURNICMP; $$.w = returnicmpdefault; $$.w2 = returnicmp6default; } | RETURNICMP6 { $$.b2 = PFRULE_RETURNICMP; $$.w = returnicmpdefault; $$.w2 = returnicmp6default; } | RETURNICMP '(' reticmpspec ')' { $$.b2 = PFRULE_RETURNICMP; $$.w = $3; $$.w2 = returnicmpdefault; } | RETURNICMP6 '(' reticmp6spec ')' { $$.b2 = PFRULE_RETURNICMP; $$.w = returnicmpdefault; $$.w2 = $3; } | RETURNICMP '(' reticmpspec comma reticmp6spec ')' { $$.b2 = PFRULE_RETURNICMP; $$.w = $3; $$.w2 = $5; } | RETURN { $$.b2 = PFRULE_RETURN; $$.w = returnicmpdefault; $$.w2 = returnicmp6default; } ; reticmpspec : STRING { if (!($$ = parseicmpspec($1, AF_INET))) { free($1); YYERROR; } free($1); } | NUMBER { u_int8_t icmptype; if ($1 < 0 || $1 > 255) { yyerror("invalid icmp code %lu", $1); YYERROR; } icmptype = returnicmpdefault >> 8; $$ = (icmptype << 8 | $1); } ; reticmp6spec : STRING { if (!($$ = parseicmpspec($1, AF_INET6))) { free($1); YYERROR; } free($1); } | NUMBER { u_int8_t icmptype; if ($1 < 0 || $1 > 255) { yyerror("invalid icmp code %lu", $1); YYERROR; } icmptype = returnicmp6default >> 8; $$ = (icmptype << 8 | $1); } ; dir : /* empty */ { $$ = PF_INOUT; } | IN { $$ = PF_IN; } | OUT { $$ = PF_OUT; } ; quick : /* empty */ { $$.quick = 0; } | QUICK { $$.quick = 1; } ; logquick : /* empty */ { $$.log = 0; $$.quick = 0; $$.logif = 0; } | log { $$ = $1; $$.quick = 0; } | QUICK 
{ $$.quick = 1; $$.log = 0; $$.logif = 0; } | log QUICK { $$ = $1; $$.quick = 1; } | QUICK log { $$ = $2; $$.quick = 1; } ; log : LOG { $$.log = PF_LOG; $$.logif = 0; } | LOG '(' logopts ')' { $$.log = PF_LOG | $3.log; $$.logif = $3.logif; } ; logopts : logopt { $$ = $1; } | logopts comma logopt { $$.log = $1.log | $3.log; $$.logif = $3.logif; if ($$.logif == 0) $$.logif = $1.logif; } ; logopt : ALL { $$.log = PF_LOG_ALL; $$.logif = 0; } | USER { $$.log = PF_LOG_SOCKET_LOOKUP; $$.logif = 0; } | GROUP { $$.log = PF_LOG_SOCKET_LOOKUP; $$.logif = 0; } | TO string { const char *errstr; u_int i; $$.log = 0; if (strncmp($2, "pflog", 5)) { yyerror("%s: should be a pflog interface", $2); free($2); YYERROR; } i = strtonum($2 + 5, 0, 255, &errstr); if (errstr) { yyerror("%s: %s", $2, errstr); free($2); YYERROR; } free($2); $$.logif = i; } ; interface : /* empty */ { $$ = NULL; } | ON if_item_not { $$ = $2; } | ON '{' optnl if_list '}' { $$ = $4; } ; if_list : if_item_not optnl { $$ = $1; } | if_list comma if_item_not optnl { $1->tail->next = $3; $1->tail = $3; $$ = $1; } ; if_item_not : not if_item { $$ = $2; $$->not = $1; } ; if_item : STRING { struct node_host *n; $$ = calloc(1, sizeof(struct node_if)); if ($$ == NULL) err(1, "if_item: calloc"); if (strlcpy($$->ifname, $1, sizeof($$->ifname)) >= sizeof($$->ifname)) { free($1); free($$); yyerror("interface name too long"); YYERROR; } if ((n = ifa_exists($1)) != NULL) $$->ifa_flags = n->ifa_flags; free($1); $$->not = 0; $$->next = NULL; $$->tail = $$; } ; af : /* empty */ { $$ = 0; } | INET { $$ = AF_INET; } | INET6 { $$ = AF_INET6; } ; proto : /* empty */ { $$ = NULL; } | PROTO proto_item { $$ = $2; } | PROTO '{' optnl proto_list '}' { $$ = $4; } ; proto_list : proto_item optnl { $$ = $1; } | proto_list comma proto_item optnl { $1->tail->next = $3; $1->tail = $3; $$ = $1; } ; proto_item : protoval { u_int8_t pr; pr = (u_int8_t)$1; if (pr == 0) { yyerror("proto 0 cannot be used"); YYERROR; } $$ = calloc(1, sizeof(struct node_proto)); if ($$ == NULL) err(1, "proto_item: calloc"); $$->proto = pr; $$->next = NULL; $$->tail = $$; } ; protoval : STRING { struct protoent *p; p = getprotobyname($1); if (p == NULL) { yyerror("unknown protocol %s", $1); free($1); YYERROR; } $$ = p->p_proto; free($1); } | NUMBER { if ($1 < 0 || $1 > 255) { yyerror("protocol outside range"); YYERROR; } } ; fromto : ALL { $$.src.host = NULL; $$.src.port = NULL; $$.dst.host = NULL; $$.dst.port = NULL; $$.src_os = NULL; } | from os to { $$.src = $1; $$.src_os = $2; $$.dst = $3; } ; os : /* empty */ { $$ = NULL; } | OS xos { $$ = $2; } | OS '{' optnl os_list '}' { $$ = $4; } ; xos : STRING { $$ = calloc(1, sizeof(struct node_os)); if ($$ == NULL) err(1, "os: calloc"); $$->os = $1; $$->tail = $$; } ; os_list : xos optnl { $$ = $1; } | os_list comma xos optnl { $1->tail->next = $3; $1->tail = $3; $$ = $1; } ; from : /* empty */ { $$.host = NULL; $$.port = NULL; } | FROM ipportspec { $$ = $2; } ; to : /* empty */ { $$.host = NULL; $$.port = NULL; } | TO ipportspec { if (disallow_urpf_failed($2.host, "\"urpf-failed\" is " "not permitted in a destination address")) YYERROR; $$ = $2; } ; ipportspec : ipspec { $$.host = $1; $$.port = NULL; } | ipspec PORT portspec { $$.host = $1; $$.port = $3; } | PORT portspec { $$.host = NULL; $$.port = $2; } ; optnl : '\n' optnl | ; ipspec : ANY { $$ = NULL; } | xhost { $$ = $1; } | '{' optnl host_list '}' { $$ = $3; } ; toipspec : TO ipspec { $$ = $2; } | /* empty */ { $$ = NULL; } ; host_list : ipspec optnl { $$ = $1; } | host_list comma ipspec 
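/*
 * Braced lists accept embedded newlines via optnl, so a rule can spread
 * its lists over several lines.  Illustrative pf.conf (example only):
 *
 *	pass in on { em0, em1 } proto { tcp, udp } \
 *	    from { 10.0.0.0/8, !10.1.2.3 } to port { 22, 80 }
 */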
optnl { if ($3 == NULL) $$ = $1; else if ($1 == NULL) $$ = $3; else { $1->tail->next = $3; $1->tail = $3->tail; $$ = $1; } } ; xhost : not host { struct node_host *n; for (n = $2; n != NULL; n = n->next) n->not = $1; $$ = $2; } | not NOROUTE { $$ = calloc(1, sizeof(struct node_host)); if ($$ == NULL) err(1, "xhost: calloc"); $$->addr.type = PF_ADDR_NOROUTE; $$->next = NULL; $$->not = $1; $$->tail = $$; } | not URPFFAILED { $$ = calloc(1, sizeof(struct node_host)); if ($$ == NULL) err(1, "xhost: calloc"); $$->addr.type = PF_ADDR_URPFFAILED; $$->next = NULL; $$->not = $1; $$->tail = $$; } ; host : STRING { if (($$ = host($1)) == NULL) { /* error. "any" is handled elsewhere */ free($1); yyerror("could not parse host specification"); YYERROR; } free($1); } | STRING '-' STRING { struct node_host *b, *e; if ((b = host($1)) == NULL || (e = host($3)) == NULL) { free($1); free($3); yyerror("could not parse host specification"); YYERROR; } if (b->af != e->af || b->addr.type != PF_ADDR_ADDRMASK || e->addr.type != PF_ADDR_ADDRMASK || unmask(&b->addr.v.a.mask, b->af) != (b->af == AF_INET ? 32 : 128) || unmask(&e->addr.v.a.mask, e->af) != (e->af == AF_INET ? 32 : 128) || b->next != NULL || b->not || e->next != NULL || e->not) { free(b); free(e); free($1); free($3); yyerror("invalid address range"); YYERROR; } memcpy(&b->addr.v.a.mask, &e->addr.v.a.addr, sizeof(b->addr.v.a.mask)); b->addr.type = PF_ADDR_RANGE; $$ = b; free(e); free($1); free($3); } | STRING '/' NUMBER { char *buf; if (asprintf(&buf, "%s/%lld", $1, (long long)$3) == -1) err(1, "host: asprintf"); free($1); if (($$ = host(buf)) == NULL) { /* error. "any" is handled elsewhere */ free(buf); yyerror("could not parse host specification"); YYERROR; } free(buf); } | NUMBER '/' NUMBER { char *buf; /* ie. for 10/8 parsing */ #ifdef __FreeBSD__ if (asprintf(&buf, "%lld/%lld", (long long)$1, (long long)$3) == -1) #else if (asprintf(&buf, "%lld/%lld", $1, $3) == -1) #endif err(1, "host: asprintf"); if (($$ = host(buf)) == NULL) { /* error. 
"any" is handled elsewhere */ free(buf); yyerror("could not parse host specification"); YYERROR; } free(buf); } | dynaddr | dynaddr '/' NUMBER { struct node_host *n; if ($3 < 0 || $3 > 128) { yyerror("bit number too big"); YYERROR; } $$ = $1; for (n = $1; n != NULL; n = n->next) set_ipmask(n, $3); } | '<' STRING '>' { if (strlen($2) >= PF_TABLE_NAME_SIZE) { yyerror("table name '%s' too long", $2); free($2); YYERROR; } $$ = calloc(1, sizeof(struct node_host)); if ($$ == NULL) err(1, "host: calloc"); $$->addr.type = PF_ADDR_TABLE; if (strlcpy($$->addr.v.tblname, $2, sizeof($$->addr.v.tblname)) >= sizeof($$->addr.v.tblname)) errx(1, "host: strlcpy"); free($2); $$->next = NULL; $$->tail = $$; } ; number : NUMBER | STRING { u_long ulval; if (atoul($1, &ulval) == -1) { yyerror("%s is not a number", $1); free($1); YYERROR; } else $$ = ulval; free($1); } ; dynaddr : '(' STRING ')' { int flags = 0; char *p, *op; op = $2; if (!isalpha(op[0])) { yyerror("invalid interface name '%s'", op); free(op); YYERROR; } while ((p = strrchr($2, ':')) != NULL) { if (!strcmp(p+1, "network")) flags |= PFI_AFLAG_NETWORK; else if (!strcmp(p+1, "broadcast")) flags |= PFI_AFLAG_BROADCAST; else if (!strcmp(p+1, "peer")) flags |= PFI_AFLAG_PEER; else if (!strcmp(p+1, "0")) flags |= PFI_AFLAG_NOALIAS; else { yyerror("interface %s has bad modifier", $2); free(op); YYERROR; } *p = '\0'; } if (flags & (flags - 1) & PFI_AFLAG_MODEMASK) { free(op); yyerror("illegal combination of " "interface modifiers"); YYERROR; } $$ = calloc(1, sizeof(struct node_host)); if ($$ == NULL) err(1, "address: calloc"); $$->af = 0; set_ipmask($$, 128); $$->addr.type = PF_ADDR_DYNIFTL; $$->addr.iflags = flags; if (strlcpy($$->addr.v.ifname, $2, sizeof($$->addr.v.ifname)) >= sizeof($$->addr.v.ifname)) { free(op); free($$); yyerror("interface name too long"); YYERROR; } free(op); $$->next = NULL; $$->tail = $$; } ; portspec : port_item { $$ = $1; } | '{' optnl port_list '}' { $$ = $3; } ; port_list : port_item optnl { $$ = $1; } | port_list comma port_item optnl { $1->tail->next = $3; $1->tail = $3; $$ = $1; } ; port_item : portrange { $$ = calloc(1, sizeof(struct node_port)); if ($$ == NULL) err(1, "port_item: calloc"); $$->port[0] = $1.a; $$->port[1] = $1.b; if ($1.t) $$->op = PF_OP_RRG; else $$->op = PF_OP_EQ; $$->next = NULL; $$->tail = $$; } | unaryop portrange { if ($2.t) { yyerror("':' cannot be used with an other " "port operator"); YYERROR; } $$ = calloc(1, sizeof(struct node_port)); if ($$ == NULL) err(1, "port_item: calloc"); $$->port[0] = $2.a; $$->port[1] = $2.b; $$->op = $1; $$->next = NULL; $$->tail = $$; } | portrange PORTBINARY portrange { if ($1.t || $3.t) { yyerror("':' cannot be used with an other " "port operator"); YYERROR; } $$ = calloc(1, sizeof(struct node_port)); if ($$ == NULL) err(1, "port_item: calloc"); $$->port[0] = $1.a; $$->port[1] = $3.a; $$->op = $2; $$->next = NULL; $$->tail = $$; } ; portplain : numberstring { if (parseport($1, &$$, 0) == -1) { free($1); YYERROR; } free($1); } ; portrange : numberstring { if (parseport($1, &$$, PPORT_RANGE) == -1) { free($1); YYERROR; } free($1); } ; uids : uid_item { $$ = $1; } | '{' optnl uid_list '}' { $$ = $3; } ; uid_list : uid_item optnl { $$ = $1; } | uid_list comma uid_item optnl { $1->tail->next = $3; $1->tail = $3; $$ = $1; } ; uid_item : uid { $$ = calloc(1, sizeof(struct node_uid)); if ($$ == NULL) err(1, "uid_item: calloc"); $$->uid[0] = $1; $$->uid[1] = $1; $$->op = PF_OP_EQ; $$->next = NULL; $$->tail = $$; } | unaryop uid { if ($2 == UID_MAX && $1 != PF_OP_EQ && $1 
!= PF_OP_NE) { yyerror("user unknown requires operator = or " "!="); YYERROR; } $$ = calloc(1, sizeof(struct node_uid)); if ($$ == NULL) err(1, "uid_item: calloc"); $$->uid[0] = $2; $$->uid[1] = $2; $$->op = $1; $$->next = NULL; $$->tail = $$; } | uid PORTBINARY uid { if ($1 == UID_MAX || $3 == UID_MAX) { yyerror("user unknown requires operator = or " "!="); YYERROR; } $$ = calloc(1, sizeof(struct node_uid)); if ($$ == NULL) err(1, "uid_item: calloc"); $$->uid[0] = $1; $$->uid[1] = $3; $$->op = $2; $$->next = NULL; $$->tail = $$; } ; uid : STRING { if (!strcmp($1, "unknown")) $$ = UID_MAX; else { struct passwd *pw; if ((pw = getpwnam($1)) == NULL) { yyerror("unknown user %s", $1); free($1); YYERROR; } $$ = pw->pw_uid; } free($1); } | NUMBER { if ($1 < 0 || $1 >= UID_MAX) { yyerror("illegal uid value %lu", $1); YYERROR; } $$ = $1; } ; gids : gid_item { $$ = $1; } | '{' optnl gid_list '}' { $$ = $3; } ; gid_list : gid_item optnl { $$ = $1; } | gid_list comma gid_item optnl { $1->tail->next = $3; $1->tail = $3; $$ = $1; } ; gid_item : gid { $$ = calloc(1, sizeof(struct node_gid)); if ($$ == NULL) err(1, "gid_item: calloc"); $$->gid[0] = $1; $$->gid[1] = $1; $$->op = PF_OP_EQ; $$->next = NULL; $$->tail = $$; } | unaryop gid { if ($2 == GID_MAX && $1 != PF_OP_EQ && $1 != PF_OP_NE) { yyerror("group unknown requires operator = or " "!="); YYERROR; } $$ = calloc(1, sizeof(struct node_gid)); if ($$ == NULL) err(1, "gid_item: calloc"); $$->gid[0] = $2; $$->gid[1] = $2; $$->op = $1; $$->next = NULL; $$->tail = $$; } | gid PORTBINARY gid { if ($1 == GID_MAX || $3 == GID_MAX) { yyerror("group unknown requires operator = or " "!="); YYERROR; } $$ = calloc(1, sizeof(struct node_gid)); if ($$ == NULL) err(1, "gid_item: calloc"); $$->gid[0] = $1; $$->gid[1] = $3; $$->op = $2; $$->next = NULL; $$->tail = $$; } ; gid : STRING { if (!strcmp($1, "unknown")) $$ = GID_MAX; else { struct group *grp; if ((grp = getgrnam($1)) == NULL) { yyerror("unknown group %s", $1); free($1); YYERROR; } $$ = grp->gr_gid; } free($1); } | NUMBER { if ($1 < 0 || $1 >= GID_MAX) { yyerror("illegal gid value %lu", $1); YYERROR; } $$ = $1; } ; flag : STRING { int f; if ((f = parse_flags($1)) < 0) { yyerror("bad flags %s", $1); free($1); YYERROR; } free($1); $$.b1 = f; } ; flags : FLAGS flag '/' flag { $$.b1 = $2.b1; $$.b2 = $4.b1; } | FLAGS '/' flag { $$.b1 = 0; $$.b2 = $3.b1; } | FLAGS ANY { $$.b1 = 0; $$.b2 = 0; } ; icmpspec : ICMPTYPE icmp_item { $$ = $2; } | ICMPTYPE '{' optnl icmp_list '}' { $$ = $4; } | ICMP6TYPE icmp6_item { $$ = $2; } | ICMP6TYPE '{' optnl icmp6_list '}' { $$ = $4; } ; icmp_list : icmp_item optnl { $$ = $1; } | icmp_list comma icmp_item optnl { $1->tail->next = $3; $1->tail = $3; $$ = $1; } ; icmp6_list : icmp6_item optnl { $$ = $1; } | icmp6_list comma icmp6_item optnl { $1->tail->next = $3; $1->tail = $3; $$ = $1; } ; icmp_item : icmptype { $$ = calloc(1, sizeof(struct node_icmp)); if ($$ == NULL) err(1, "icmp_item: calloc"); $$->type = $1; $$->code = 0; $$->proto = IPPROTO_ICMP; $$->next = NULL; $$->tail = $$; } | icmptype CODE STRING { const struct icmpcodeent *p; if ((p = geticmpcodebyname($1-1, $3, AF_INET)) == NULL) { yyerror("unknown icmp-code %s", $3); free($3); YYERROR; } free($3); $$ = calloc(1, sizeof(struct node_icmp)); if ($$ == NULL) err(1, "icmp_item: calloc"); $$->type = $1; $$->code = p->code + 1; $$->proto = IPPROTO_ICMP; $$->next = NULL; $$->tail = $$; } | icmptype CODE NUMBER { if ($3 < 0 || $3 > 255) { yyerror("illegal icmp-code %lu", $3); YYERROR; } $$ = calloc(1, sizeof(struct 
node_icmp)); if ($$ == NULL) err(1, "icmp_item: calloc"); $$->type = $1; $$->code = $3 + 1; $$->proto = IPPROTO_ICMP; $$->next = NULL; $$->tail = $$; } ; icmp6_item : icmp6type { $$ = calloc(1, sizeof(struct node_icmp)); if ($$ == NULL) err(1, "icmp_item: calloc"); $$->type = $1; $$->code = 0; $$->proto = IPPROTO_ICMPV6; $$->next = NULL; $$->tail = $$; } | icmp6type CODE STRING { const struct icmpcodeent *p; if ((p = geticmpcodebyname($1-1, $3, AF_INET6)) == NULL) { yyerror("unknown icmp6-code %s", $3); free($3); YYERROR; } free($3); $$ = calloc(1, sizeof(struct node_icmp)); if ($$ == NULL) err(1, "icmp_item: calloc"); $$->type = $1; $$->code = p->code + 1; $$->proto = IPPROTO_ICMPV6; $$->next = NULL; $$->tail = $$; } | icmp6type CODE NUMBER { if ($3 < 0 || $3 > 255) { yyerror("illegal icmp-code %lu", $3); YYERROR; } $$ = calloc(1, sizeof(struct node_icmp)); if ($$ == NULL) err(1, "icmp_item: calloc"); $$->type = $1; $$->code = $3 + 1; $$->proto = IPPROTO_ICMPV6; $$->next = NULL; $$->tail = $$; } ; icmptype : STRING { const struct icmptypeent *p; if ((p = geticmptypebyname($1, AF_INET)) == NULL) { yyerror("unknown icmp-type %s", $1); free($1); YYERROR; } $$ = p->type + 1; free($1); } | NUMBER { if ($1 < 0 || $1 > 255) { yyerror("illegal icmp-type %lu", $1); YYERROR; } $$ = $1 + 1; } ; icmp6type : STRING { const struct icmptypeent *p; if ((p = geticmptypebyname($1, AF_INET6)) == NULL) { yyerror("unknown icmp6-type %s", $1); free($1); YYERROR; } $$ = p->type + 1; free($1); } | NUMBER { if ($1 < 0 || $1 > 255) { yyerror("illegal icmp6-type %lu", $1); YYERROR; } $$ = $1 + 1; } ; tos : STRING { if (!strcmp($1, "lowdelay")) $$ = IPTOS_LOWDELAY; else if (!strcmp($1, "throughput")) $$ = IPTOS_THROUGHPUT; else if (!strcmp($1, "reliability")) $$ = IPTOS_RELIABILITY; else if ($1[0] == '0' && $1[1] == 'x') $$ = strtoul($1, NULL, 16); else - $$ = 0; /* flag bad argument */ - if (!$$ || $$ > 255) { + $$ = 256; /* flag bad argument */ + if ($$ < 0 || $$ > 255) { yyerror("illegal tos value %s", $1); free($1); YYERROR; } free($1); } | NUMBER { $$ = $1; - if (!$$ || $$ > 255) { + if ($$ < 0 || $$ > 255) { yyerror("illegal tos value %s", $1); YYERROR; } } ; sourcetrack : SOURCETRACK { $$ = PF_SRCTRACK; } | SOURCETRACK GLOBAL { $$ = PF_SRCTRACK_GLOBAL; } | SOURCETRACK RULE { $$ = PF_SRCTRACK_RULE; } ; statelock : IFBOUND { $$ = PFRULE_IFBOUND; } | FLOATING { $$ = 0; } ; keep : NO STATE { $$.action = 0; $$.options = NULL; } | KEEP STATE state_opt_spec { $$.action = PF_STATE_NORMAL; $$.options = $3; } | MODULATE STATE state_opt_spec { $$.action = PF_STATE_MODULATE; $$.options = $3; } | SYNPROXY STATE state_opt_spec { $$.action = PF_STATE_SYNPROXY; $$.options = $3; } ; flush : /* empty */ { $$ = 0; } | FLUSH { $$ = PF_FLUSH; } | FLUSH GLOBAL { $$ = PF_FLUSH | PF_FLUSH_GLOBAL; } ; state_opt_spec : '(' state_opt_list ')' { $$ = $2; } | /* empty */ { $$ = NULL; } ; state_opt_list : state_opt_item { $$ = $1; } | state_opt_list comma state_opt_item { $1->tail->next = $3; $1->tail = $3; $$ = $1; } ; state_opt_item : MAXIMUM NUMBER { if ($2 < 0 || $2 > UINT_MAX) { yyerror("only positive values permitted"); YYERROR; } $$ = calloc(1, sizeof(struct node_state_opt)); if ($$ == NULL) err(1, "state_opt_item: calloc"); $$->type = PF_STATE_OPT_MAX; $$->data.max_states = $2; $$->next = NULL; $$->tail = $$; } | NOSYNC { $$ = calloc(1, sizeof(struct node_state_opt)); if ($$ == NULL) err(1, "state_opt_item: calloc"); $$->type = PF_STATE_OPT_NOSYNC; $$->next = NULL; $$->tail = $$; } | MAXSRCSTATES NUMBER { if ($2 < 0 || $2 > 
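/*
 * With the revised range check in the tos rule above, "tos 0" now
 * parses (0 is a legitimate TOS byte), while an unrecognized keyword
 * is flagged with the out-of-range sentinel 256 instead of the
 * previously ambiguous 0.  Illustrative uses of the rule:
 *
 *   pass out proto tcp to port 80 tos lowdelay
 *   pass out proto tcp tos 0
 */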
UINT_MAX) { yyerror("only positive values permitted"); YYERROR; } $$ = calloc(1, sizeof(struct node_state_opt)); if ($$ == NULL) err(1, "state_opt_item: calloc"); $$->type = PF_STATE_OPT_MAX_SRC_STATES; $$->data.max_src_states = $2; $$->next = NULL; $$->tail = $$; } | MAXSRCCONN NUMBER { if ($2 < 0 || $2 > UINT_MAX) { yyerror("only positive values permitted"); YYERROR; } $$ = calloc(1, sizeof(struct node_state_opt)); if ($$ == NULL) err(1, "state_opt_item: calloc"); $$->type = PF_STATE_OPT_MAX_SRC_CONN; $$->data.max_src_conn = $2; $$->next = NULL; $$->tail = $$; } | MAXSRCCONNRATE NUMBER '/' NUMBER { if ($2 < 0 || $2 > UINT_MAX || $4 < 0 || $4 > UINT_MAX) { yyerror("only positive values permitted"); YYERROR; } $$ = calloc(1, sizeof(struct node_state_opt)); if ($$ == NULL) err(1, "state_opt_item: calloc"); $$->type = PF_STATE_OPT_MAX_SRC_CONN_RATE; $$->data.max_src_conn_rate.limit = $2; $$->data.max_src_conn_rate.seconds = $4; $$->next = NULL; $$->tail = $$; } | OVERLOAD '<' STRING '>' flush { if (strlen($3) >= PF_TABLE_NAME_SIZE) { yyerror("table name '%s' too long", $3); free($3); YYERROR; } $$ = calloc(1, sizeof(struct node_state_opt)); if ($$ == NULL) err(1, "state_opt_item: calloc"); if (strlcpy($$->data.overload.tblname, $3, PF_TABLE_NAME_SIZE) >= PF_TABLE_NAME_SIZE) errx(1, "state_opt_item: strlcpy"); free($3); $$->type = PF_STATE_OPT_OVERLOAD; $$->data.overload.flush = $5; $$->next = NULL; $$->tail = $$; } | MAXSRCNODES NUMBER { if ($2 < 0 || $2 > UINT_MAX) { yyerror("only positive values permitted"); YYERROR; } $$ = calloc(1, sizeof(struct node_state_opt)); if ($$ == NULL) err(1, "state_opt_item: calloc"); $$->type = PF_STATE_OPT_MAX_SRC_NODES; $$->data.max_src_nodes = $2; $$->next = NULL; $$->tail = $$; } | sourcetrack { $$ = calloc(1, sizeof(struct node_state_opt)); if ($$ == NULL) err(1, "state_opt_item: calloc"); $$->type = PF_STATE_OPT_SRCTRACK; $$->data.src_track = $1; $$->next = NULL; $$->tail = $$; } | statelock { $$ = calloc(1, sizeof(struct node_state_opt)); if ($$ == NULL) err(1, "state_opt_item: calloc"); $$->type = PF_STATE_OPT_STATELOCK; $$->data.statelock = $1; $$->next = NULL; $$->tail = $$; } | SLOPPY { $$ = calloc(1, sizeof(struct node_state_opt)); if ($$ == NULL) err(1, "state_opt_item: calloc"); $$->type = PF_STATE_OPT_SLOPPY; $$->next = NULL; $$->tail = $$; } | STRING NUMBER { int i; if ($2 < 0 || $2 > UINT_MAX) { yyerror("only positive values permitted"); YYERROR; } for (i = 0; pf_timeouts[i].name && strcmp(pf_timeouts[i].name, $1); ++i) ; /* nothing */ if (!pf_timeouts[i].name) { yyerror("illegal timeout name %s", $1); free($1); YYERROR; } if (strchr(pf_timeouts[i].name, '.') == NULL) { yyerror("illegal state timeout %s", $1); free($1); YYERROR; } free($1); $$ = calloc(1, sizeof(struct node_state_opt)); if ($$ == NULL) err(1, "state_opt_item: calloc"); $$->type = PF_STATE_OPT_TIMEOUT; $$->data.timeout.number = pf_timeouts[i].timeout; $$->data.timeout.seconds = $2; $$->next = NULL; $$->tail = $$; } ; label : LABEL STRING { $$ = $2; } ; qname : QUEUE STRING { $$.qname = $2; $$.pqname = NULL; } | QUEUE '(' STRING ')' { $$.qname = $3; $$.pqname = NULL; } | QUEUE '(' STRING comma STRING ')' { $$.qname = $3; $$.pqname = $5; } ; no : /* empty */ { $$ = 0; } | NO { $$ = 1; } ; portstar : numberstring { if (parseport($1, &$$, PPORT_RANGE|PPORT_STAR) == -1) { free($1); YYERROR; } free($1); } ; redirspec : host { $$ = $1; } | '{' optnl redir_host_list '}' { $$ = $3; } ; redir_host_list : host optnl { $$ = $1; } | redir_host_list comma host optnl { $1->tail->next = $3; 
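/*
 * The state options parsed above compose as in (illustrative table
 * name and numbers):
 *
 *   pass in proto tcp to port 22 keep state \
 *       (max-src-conn 10, max-src-conn-rate 5/30, \
 *        overload <bruteforce> flush global, if-bound)
 *
 * Per-rule timeouts such as "tcp.established 3600" are also accepted
 * as state options; names without a dot are rejected by the
 * strchr(name, '.') check above.
 */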
$1->tail = $3->tail; $$ = $1; } ; redirpool : /* empty */ { $$ = NULL; } | ARROW redirspec { $$ = calloc(1, sizeof(struct redirection)); if ($$ == NULL) err(1, "redirection: calloc"); $$->host = $2; $$->rport.a = $$->rport.b = $$->rport.t = 0; } | ARROW redirspec PORT portstar { $$ = calloc(1, sizeof(struct redirection)); if ($$ == NULL) err(1, "redirection: calloc"); $$->host = $2; $$->rport = $4; } ; hashkey : /* empty */ { $$ = calloc(1, sizeof(struct pf_poolhashkey)); if ($$ == NULL) err(1, "hashkey: calloc"); $$->key32[0] = arc4random(); $$->key32[1] = arc4random(); $$->key32[2] = arc4random(); $$->key32[3] = arc4random(); } | string { if (!strncmp($1, "0x", 2)) { if (strlen($1) != 34) { free($1); yyerror("hex key must be 128 bits " "(32 hex digits) long"); YYERROR; } $$ = calloc(1, sizeof(struct pf_poolhashkey)); if ($$ == NULL) err(1, "hashkey: calloc"); if (sscanf($1, "0x%8x%8x%8x%8x", &$$->key32[0], &$$->key32[1], &$$->key32[2], &$$->key32[3]) != 4) { free($$); free($1); yyerror("invalid hex key"); YYERROR; } } else { MD5_CTX context; $$ = calloc(1, sizeof(struct pf_poolhashkey)); if ($$ == NULL) err(1, "hashkey: calloc"); MD5Init(&context); MD5Update(&context, (unsigned char *)$1, strlen($1)); MD5Final((unsigned char *)$$, &context); HTONL($$->key32[0]); HTONL($$->key32[1]); HTONL($$->key32[2]); HTONL($$->key32[3]); } free($1); } ; pool_opts : { bzero(&pool_opts, sizeof pool_opts); } pool_opts_l { $$ = pool_opts; } | /* empty */ { bzero(&pool_opts, sizeof pool_opts); $$ = pool_opts; } ; pool_opts_l : pool_opts_l pool_opt | pool_opt ; pool_opt : BITMASK { if (pool_opts.type) { yyerror("pool type cannot be redefined"); YYERROR; } pool_opts.type = PF_POOL_BITMASK; } | RANDOM { if (pool_opts.type) { yyerror("pool type cannot be redefined"); YYERROR; } pool_opts.type = PF_POOL_RANDOM; } | SOURCEHASH hashkey { if (pool_opts.type) { yyerror("pool type cannot be redefined"); YYERROR; } pool_opts.type = PF_POOL_SRCHASH; pool_opts.key = $2; } | ROUNDROBIN { if (pool_opts.type) { yyerror("pool type cannot be redefined"); YYERROR; } pool_opts.type = PF_POOL_ROUNDROBIN; } | STATICPORT { if (pool_opts.staticport) { yyerror("static-port cannot be redefined"); YYERROR; } pool_opts.staticport = 1; } | STICKYADDRESS { if (filter_opts.marker & POM_STICKYADDRESS) { yyerror("sticky-address cannot be redefined"); YYERROR; } pool_opts.marker |= POM_STICKYADDRESS; pool_opts.opts |= PF_POOL_STICKYADDR; } ; redirection : /* empty */ { $$ = NULL; } | ARROW host { $$ = calloc(1, sizeof(struct redirection)); if ($$ == NULL) err(1, "redirection: calloc"); $$->host = $2; $$->rport.a = $$->rport.b = $$->rport.t = 0; } | ARROW host PORT portstar { $$ = calloc(1, sizeof(struct redirection)); if ($$ == NULL) err(1, "redirection: calloc"); $$->host = $2; $$->rport = $4; } ; natpasslog : /* empty */ { $$.b1 = $$.b2 = 0; $$.w2 = 0; } | PASS { $$.b1 = 1; $$.b2 = 0; $$.w2 = 0; } | PASS log { $$.b1 = 1; $$.b2 = $2.log; $$.w2 = $2.logif; } | log { $$.b1 = 0; $$.b2 = $1.log; $$.w2 = $1.logif; } ; nataction : no NAT natpasslog { if ($1 && $3.b1) { yyerror("\"pass\" not valid with \"no\""); YYERROR; } if ($1) $$.b1 = PF_NONAT; else $$.b1 = PF_NAT; $$.b2 = $3.b1; $$.w = $3.b2; $$.w2 = $3.w2; } | no RDR natpasslog { if ($1 && $3.b1) { yyerror("\"pass\" not valid with \"no\""); YYERROR; } if ($1) $$.b1 = PF_NORDR; else $$.b1 = PF_RDR; $$.b2 = $3.b1; $$.w = $3.b2; $$.w2 = $3.w2; } ; natrule : nataction interface af proto fromto tag tagged rtable redirpool pool_opts { struct pf_rule r; if (check_rulestate(PFCTL_STATE_NAT)) 
YYERROR; memset(&r, 0, sizeof(r)); r.action = $1.b1; r.natpass = $1.b2; r.log = $1.w; r.logif = $1.w2; r.af = $3; if (!r.af) { if ($5.src.host && $5.src.host->af && !$5.src.host->ifindex) r.af = $5.src.host->af; else if ($5.dst.host && $5.dst.host->af && !$5.dst.host->ifindex) r.af = $5.dst.host->af; } if ($6 != NULL) if (strlcpy(r.tagname, $6, PF_TAG_NAME_SIZE) >= PF_TAG_NAME_SIZE) { yyerror("tag too long, max %u chars", PF_TAG_NAME_SIZE - 1); YYERROR; } if ($7.name) if (strlcpy(r.match_tagname, $7.name, PF_TAG_NAME_SIZE) >= PF_TAG_NAME_SIZE) { yyerror("tag too long, max %u chars", PF_TAG_NAME_SIZE - 1); YYERROR; } r.match_tag_not = $7.neg; r.rtableid = $8; if (r.action == PF_NONAT || r.action == PF_NORDR) { if ($9 != NULL) { yyerror("translation rule with 'no' " "does not need '->'"); YYERROR; } } else { if ($9 == NULL || $9->host == NULL) { yyerror("translation rule requires '-> " "address'"); YYERROR; } if (!r.af && ! $9->host->ifindex) r.af = $9->host->af; remove_invalid_hosts(&$9->host, &r.af); if (invalid_redirect($9->host, r.af)) YYERROR; if (check_netmask($9->host, r.af)) YYERROR; r.rpool.proxy_port[0] = ntohs($9->rport.a); switch (r.action) { case PF_RDR: if (!$9->rport.b && $9->rport.t && $5.dst.port != NULL) { r.rpool.proxy_port[1] = ntohs($9->rport.a) + (ntohs( $5.dst.port->port[1]) - ntohs( $5.dst.port->port[0])); } else r.rpool.proxy_port[1] = ntohs($9->rport.b); break; case PF_NAT: r.rpool.proxy_port[1] = ntohs($9->rport.b); if (!r.rpool.proxy_port[0] && !r.rpool.proxy_port[1]) { r.rpool.proxy_port[0] = PF_NAT_PROXY_PORT_LOW; r.rpool.proxy_port[1] = PF_NAT_PROXY_PORT_HIGH; } else if (!r.rpool.proxy_port[1]) r.rpool.proxy_port[1] = r.rpool.proxy_port[0]; break; default: break; } r.rpool.opts = $10.type; if ((r.rpool.opts & PF_POOL_TYPEMASK) == PF_POOL_NONE && ($9->host->next != NULL || $9->host->addr.type == PF_ADDR_TABLE || DYNIF_MULTIADDR($9->host->addr))) r.rpool.opts = PF_POOL_ROUNDROBIN; if ((r.rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN && disallow_table($9->host, "tables are only " "supported in round-robin redirection " "pools")) YYERROR; if ((r.rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN && disallow_alias($9->host, "interface (%s) " "is only supported in round-robin " "redirection pools")) YYERROR; if ($9->host->next != NULL) { if ((r.rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_ROUNDROBIN) { yyerror("only round-robin " "valid for multiple " "redirection addresses"); YYERROR; } } } if ($10.key != NULL) memcpy(&r.rpool.key, $10.key, sizeof(struct pf_poolhashkey)); if ($10.opts) r.rpool.opts |= $10.opts; if ($10.staticport) { if (r.action != PF_NAT) { yyerror("the 'static-port' option is " "only valid with nat rules"); YYERROR; } if (r.rpool.proxy_port[0] != PF_NAT_PROXY_PORT_LOW && r.rpool.proxy_port[1] != PF_NAT_PROXY_PORT_HIGH) { yyerror("the 'static-port' option can't" " be used when specifying a port" " range"); YYERROR; } r.rpool.proxy_port[0] = 0; r.rpool.proxy_port[1] = 0; } expand_rule(&r, $2, $9 == NULL ? 
NULL : $9->host, $4, $5.src_os, $5.src.host, $5.src.port, $5.dst.host, $5.dst.port, 0, 0, 0, ""); free($9); } ; binatrule : no BINAT natpasslog interface af proto FROM host toipspec tag tagged rtable redirection { struct pf_rule binat; struct pf_pooladdr *pa; if (check_rulestate(PFCTL_STATE_NAT)) YYERROR; if (disallow_urpf_failed($9, "\"urpf-failed\" is not " "permitted as a binat destination")) YYERROR; memset(&binat, 0, sizeof(binat)); if ($1 && $3.b1) { yyerror("\"pass\" not valid with \"no\""); YYERROR; } if ($1) binat.action = PF_NOBINAT; else binat.action = PF_BINAT; binat.natpass = $3.b1; binat.log = $3.b2; binat.logif = $3.w2; binat.af = $5; if (!binat.af && $8 != NULL && $8->af) binat.af = $8->af; if (!binat.af && $9 != NULL && $9->af) binat.af = $9->af; if (!binat.af && $13 != NULL && $13->host) binat.af = $13->host->af; if (!binat.af) { yyerror("address family (inet/inet6) " "undefined"); YYERROR; } if ($4 != NULL) { memcpy(binat.ifname, $4->ifname, sizeof(binat.ifname)); binat.ifnot = $4->not; free($4); } if ($10 != NULL) if (strlcpy(binat.tagname, $10, PF_TAG_NAME_SIZE) >= PF_TAG_NAME_SIZE) { yyerror("tag too long, max %u chars", PF_TAG_NAME_SIZE - 1); YYERROR; } if ($11.name) if (strlcpy(binat.match_tagname, $11.name, PF_TAG_NAME_SIZE) >= PF_TAG_NAME_SIZE) { yyerror("tag too long, max %u chars", PF_TAG_NAME_SIZE - 1); YYERROR; } binat.match_tag_not = $11.neg; binat.rtableid = $12; if ($6 != NULL) { binat.proto = $6->proto; free($6); } if ($8 != NULL && disallow_table($8, "invalid use of " "table <%s> as the source address of a binat rule")) YYERROR; if ($8 != NULL && disallow_alias($8, "invalid use of " "interface (%s) as the source address of a binat " "rule")) YYERROR; if ($13 != NULL && $13->host != NULL && disallow_table( $13->host, "invalid use of table <%s> as the " "redirect address of a binat rule")) YYERROR; if ($13 != NULL && $13->host != NULL && disallow_alias( $13->host, "invalid use of interface (%s) as the " "redirect address of a binat rule")) YYERROR; if ($8 != NULL) { if ($8->next) { yyerror("multiple binat ip addresses"); YYERROR; } if ($8->addr.type == PF_ADDR_DYNIFTL) $8->af = binat.af; if ($8->af != binat.af) { yyerror("binat ip versions must match"); YYERROR; } if (check_netmask($8, binat.af)) YYERROR; memcpy(&binat.src.addr, &$8->addr, sizeof(binat.src.addr)); free($8); } if ($9 != NULL) { if ($9->next) { yyerror("multiple binat ip addresses"); YYERROR; } if ($9->af != binat.af && $9->af) { yyerror("binat ip versions must match"); YYERROR; } if (check_netmask($9, binat.af)) YYERROR; memcpy(&binat.dst.addr, &$9->addr, sizeof(binat.dst.addr)); binat.dst.neg = $9->not; free($9); } if (binat.action == PF_NOBINAT) { if ($13 != NULL) { yyerror("'no binat' rule does not need" " '->'"); YYERROR; } } else { if ($13 == NULL || $13->host == NULL) { yyerror("'binat' rule requires" " '-> address'"); YYERROR; } remove_invalid_hosts(&$13->host, &binat.af); if (invalid_redirect($13->host, binat.af)) YYERROR; if ($13->host->next != NULL) { yyerror("binat rule must redirect to " "a single address"); YYERROR; } if (check_netmask($13->host, binat.af)) YYERROR; if (!PF_AZERO(&binat.src.addr.v.a.mask, binat.af) && !PF_AEQ(&binat.src.addr.v.a.mask, &$13->host->addr.v.a.mask, binat.af)) { yyerror("'binat' source mask and " "redirect mask must be the same"); YYERROR; } TAILQ_INIT(&binat.rpool.list); pa = calloc(1, sizeof(struct pf_pooladdr)); if (pa == NULL) err(1, "binat: calloc"); pa->addr = $13->host->addr; pa->ifname[0] = 0; TAILQ_INSERT_TAIL(&binat.rpool.list, pa, 
entries); free($13); } pfctl_add_rule(pf, &binat, ""); } ; tag : /* empty */ { $$ = NULL; } | TAG STRING { $$ = $2; } ; tagged : /* empty */ { $$.neg = 0; $$.name = NULL; } | not TAGGED string { $$.neg = $1; $$.name = $3; } ; rtable : /* empty */ { $$ = -1; } | RTABLE NUMBER { if ($2 < 0 || $2 > rt_tableid_max()) { yyerror("invalid rtable id"); YYERROR; } $$ = $2; } ; route_host : STRING { $$ = calloc(1, sizeof(struct node_host)); if ($$ == NULL) err(1, "route_host: calloc"); $$->ifname = $1; set_ipmask($$, 128); $$->next = NULL; $$->tail = $$; } | '(' STRING host ')' { $$ = $3; $$->ifname = $2; } ; route_host_list : route_host optnl { $$ = $1; } | route_host_list comma route_host optnl { if ($1->af == 0) $1->af = $3->af; if ($1->af != $3->af) { yyerror("all pool addresses must be in the " "same address family"); YYERROR; } $1->tail->next = $3; $1->tail = $3->tail; $$ = $1; } ; routespec : route_host { $$ = $1; } | '{' optnl route_host_list '}' { $$ = $3; } ; route : /* empty */ { $$.host = NULL; $$.rt = 0; $$.pool_opts = 0; } | FASTROUTE { $$.host = NULL; $$.rt = PF_FASTROUTE; $$.pool_opts = 0; } | ROUTETO routespec pool_opts { $$.host = $2; $$.rt = PF_ROUTETO; $$.pool_opts = $3.type | $3.opts; if ($3.key != NULL) $$.key = $3.key; } | REPLYTO routespec pool_opts { $$.host = $2; $$.rt = PF_REPLYTO; $$.pool_opts = $3.type | $3.opts; if ($3.key != NULL) $$.key = $3.key; } | DUPTO routespec pool_opts { $$.host = $2; $$.rt = PF_DUPTO; $$.pool_opts = $3.type | $3.opts; if ($3.key != NULL) $$.key = $3.key; } ; timeout_spec : STRING NUMBER { if (check_rulestate(PFCTL_STATE_OPTION)) { free($1); YYERROR; } if ($2 < 0 || $2 > UINT_MAX) { yyerror("only positive values permitted"); YYERROR; } if (pfctl_set_timeout(pf, $1, $2, 0) != 0) { yyerror("unknown timeout %s", $1); free($1); YYERROR; } free($1); } ; timeout_list : timeout_list comma timeout_spec optnl | timeout_spec optnl ; limit_spec : STRING NUMBER { if (check_rulestate(PFCTL_STATE_OPTION)) { free($1); YYERROR; } if ($2 < 0 || $2 > UINT_MAX) { yyerror("only positive values permitted"); YYERROR; } if (pfctl_set_limit(pf, $1, $2) != 0) { yyerror("unable to set limit %s %u", $1, $2); free($1); YYERROR; } free($1); } ; limit_list : limit_list comma limit_spec optnl | limit_spec optnl ; comma : ',' | /* empty */ ; yesno : NO { $$ = 0; } | STRING { if (!strcmp($1, "yes")) $$ = 1; else { yyerror("invalid value '%s', expected 'yes' " "or 'no'", $1); free($1); YYERROR; } free($1); } ; unaryop : '=' { $$ = PF_OP_EQ; } | '!' '=' { $$ = PF_OP_NE; } | '<' '=' { $$ = PF_OP_LE; } | '<' { $$ = PF_OP_LT; } | '>' '=' { $$ = PF_OP_GE; } | '>' { $$ = PF_OP_GT; } ; %% int yyerror(const char *fmt, ...) 
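/*
 * Policy routing and the option lists above, e.g. (illustrative):
 *
 *   pass in on em0 route-to (em1 192.0.2.1) from 10.0.0.0/8
 *   set timeout { tcp.established 3600, udp.first 30 }
 *   set limit states 20000
 *
 * The "(ifname host)" form of route_host pins the next hop to a
 * specific outgoing interface.
 */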
{ va_list ap; file->errors++; va_start(ap, fmt); fprintf(stderr, "%s:%d: ", file->name, yylval.lineno); vfprintf(stderr, fmt, ap); fprintf(stderr, "\n"); va_end(ap); return (0); } int disallow_table(struct node_host *h, const char *fmt) { for (; h != NULL; h = h->next) if (h->addr.type == PF_ADDR_TABLE) { yyerror(fmt, h->addr.v.tblname); return (1); } return (0); } int disallow_urpf_failed(struct node_host *h, const char *fmt) { for (; h != NULL; h = h->next) if (h->addr.type == PF_ADDR_URPFFAILED) { yyerror(fmt); return (1); } return (0); } int disallow_alias(struct node_host *h, const char *fmt) { for (; h != NULL; h = h->next) if (DYNIF_MULTIADDR(h->addr)) { yyerror(fmt, h->addr.v.tblname); return (1); } return (0); } int rule_consistent(struct pf_rule *r, int anchor_call) { int problems = 0; switch (r->action) { case PF_PASS: case PF_DROP: case PF_SCRUB: case PF_NOSCRUB: problems = filter_consistent(r, anchor_call); break; case PF_NAT: case PF_NONAT: problems = nat_consistent(r); break; case PF_RDR: case PF_NORDR: problems = rdr_consistent(r); break; case PF_BINAT: case PF_NOBINAT: default: break; } return (problems); } int filter_consistent(struct pf_rule *r, int anchor_call) { int problems = 0; if (r->proto != IPPROTO_TCP && r->proto != IPPROTO_UDP && (r->src.port_op || r->dst.port_op)) { yyerror("port only applies to tcp/udp"); problems++; } if (r->proto != IPPROTO_ICMP && r->proto != IPPROTO_ICMPV6 && (r->type || r->code)) { yyerror("icmp-type/code only applies to icmp"); problems++; } if (!r->af && (r->type || r->code)) { yyerror("must indicate address family with icmp-type/code"); problems++; } if (r->overload_tblname[0] && r->max_src_conn == 0 && r->max_src_conn_rate.seconds == 0) { yyerror("'overload' requires 'max-src-conn' " "or 'max-src-conn-rate'"); problems++; } if ((r->proto == IPPROTO_ICMP && r->af == AF_INET6) || (r->proto == IPPROTO_ICMPV6 && r->af == AF_INET)) { yyerror("proto %s doesn't match address family %s", r->proto == IPPROTO_ICMP ? "icmp" : "icmp6", r->af == AF_INET ? "inet" : "inet6"); problems++; } if (r->allow_opts && r->action != PF_PASS) { yyerror("allow-opts can only be specified for pass rules"); problems++; } if (r->rule_flag & PFRULE_FRAGMENT && (r->src.port_op || r->dst.port_op || r->flagset || r->type || r->code)) { yyerror("fragments can be filtered only on IP header fields"); problems++; } if (r->rule_flag & PFRULE_RETURNRST && r->proto != IPPROTO_TCP) { yyerror("return-rst can only be applied to TCP rules"); problems++; } if (r->max_src_nodes && !(r->rule_flag & PFRULE_RULESRCTRACK)) { yyerror("max-src-nodes requires 'source-track rule'"); problems++; } if (r->action == PF_DROP && r->keep_state) { yyerror("keep state on block rules doesn't make sense"); problems++; } if (r->rule_flag & PFRULE_STATESLOPPY && (r->keep_state == PF_STATE_MODULATE || r->keep_state == PF_STATE_SYNPROXY)) { yyerror("sloppy state matching cannot be used with " "synproxy state or modulate state"); problems++; } return (-problems); } int nat_consistent(struct pf_rule *r) { return (0); /* yeah! 
*/ } int rdr_consistent(struct pf_rule *r) { int problems = 0; if (r->proto != IPPROTO_TCP && r->proto != IPPROTO_UDP) { if (r->src.port_op) { yyerror("src port only applies to tcp/udp"); problems++; } if (r->dst.port_op) { yyerror("dst port only applies to tcp/udp"); problems++; } if (r->rpool.proxy_port[0]) { yyerror("rpool port only applies to tcp/udp"); problems++; } } if (r->dst.port_op && r->dst.port_op != PF_OP_EQ && r->dst.port_op != PF_OP_RRG) { yyerror("invalid port operator for rdr destination port"); problems++; } return (-problems); } int process_tabledef(char *name, struct table_opts *opts) { struct pfr_buffer ab; struct node_tinit *ti; bzero(&ab, sizeof(ab)); ab.pfrb_type = PFRB_ADDRS; SIMPLEQ_FOREACH(ti, &opts->init_nodes, entries) { if (ti->file) if (pfr_buf_load(&ab, ti->file, 0, append_addr)) { if (errno) yyerror("cannot load \"%s\": %s", ti->file, strerror(errno)); else yyerror("file \"%s\" contains bad data", ti->file); goto _error; } if (ti->host) if (append_addr_host(&ab, ti->host, 0, 0)) { yyerror("cannot create address buffer: %s", strerror(errno)); goto _error; } } if (pf->opts & PF_OPT_VERBOSE) print_tabledef(name, opts->flags, opts->init_addr, &opts->init_nodes); if (!(pf->opts & PF_OPT_NOACTION) && pfctl_define_table(name, opts->flags, opts->init_addr, pf->anchor->name, &ab, pf->anchor->ruleset.tticket)) { yyerror("cannot define table %s: %s", name, pfr_strerror(errno)); goto _error; } pf->tdirty = 1; pfr_buf_clear(&ab); return (0); _error: pfr_buf_clear(&ab); return (-1); } struct keywords { const char *k_name; int k_val; }; /* macro gore, but you should've seen the prior indentation nightmare... */ #define FREE_LIST(T,r) \ do { \ T *p, *node = r; \ while (node != NULL) { \ p = node; \ node = node->next; \ free(p); \ } \ } while (0) #define LOOP_THROUGH(T,n,r,C) \ do { \ T *n; \ if (r == NULL) { \ r = calloc(1, sizeof(T)); \ if (r == NULL) \ err(1, "LOOP: calloc"); \ r->next = NULL; \ } \ n = r; \ while (n != NULL) { \ do { \ C; \ } while (0); \ n = n->next; \ } \ } while (0) void expand_label_str(char *label, size_t len, const char *srch, const char *repl) { char *tmp; char *p, *q; if ((tmp = calloc(1, len)) == NULL) err(1, "expand_label_str: calloc"); p = q = label; while ((q = strstr(p, srch)) != NULL) { *q = '\0'; if ((strlcat(tmp, p, len) >= len) || (strlcat(tmp, repl, len) >= len)) errx(1, "expand_label: label too long"); q += strlen(srch); p = q; } if (strlcat(tmp, p, len) >= len) errx(1, "expand_label: label too long"); strlcpy(label, tmp, len); /* always fits */ free(tmp); } void expand_label_if(const char *name, char *label, size_t len, const char *ifname) { if (strstr(label, name) != NULL) { if (!*ifname) expand_label_str(label, len, name, "any"); else expand_label_str(label, len, name, ifname); } } void expand_label_addr(const char *name, char *label, size_t len, sa_family_t af, struct node_host *h) { char tmp[64], tmp_not[66]; if (strstr(label, name) != NULL) { switch (h->addr.type) { case PF_ADDR_DYNIFTL: snprintf(tmp, sizeof(tmp), "(%s)", h->addr.v.ifname); break; case PF_ADDR_TABLE: snprintf(tmp, sizeof(tmp), "<%s>", h->addr.v.tblname); break; case PF_ADDR_NOROUTE: snprintf(tmp, sizeof(tmp), "no-route"); break; case PF_ADDR_URPFFAILED: snprintf(tmp, sizeof(tmp), "urpf-failed"); break; case PF_ADDR_ADDRMASK: if (!af || (PF_AZERO(&h->addr.v.a.addr, af) && PF_AZERO(&h->addr.v.a.mask, af))) snprintf(tmp, sizeof(tmp), "any"); else { char a[48]; int bits; if (inet_ntop(af, &h->addr.v.a.addr, a, sizeof(a)) == NULL) snprintf(tmp, sizeof(tmp), "?"); 
else { bits = unmask(&h->addr.v.a.mask, af); if ((af == AF_INET && bits < 32) || (af == AF_INET6 && bits < 128)) snprintf(tmp, sizeof(tmp), "%s/%d", a, bits); else snprintf(tmp, sizeof(tmp), "%s", a); } } break; default: snprintf(tmp, sizeof(tmp), "?"); break; } if (h->not) { snprintf(tmp_not, sizeof(tmp_not), "! %s", tmp); expand_label_str(label, len, name, tmp_not); } else expand_label_str(label, len, name, tmp); } } void expand_label_port(const char *name, char *label, size_t len, struct node_port *port) { char a1[6], a2[6], op[13] = ""; if (strstr(label, name) != NULL) { snprintf(a1, sizeof(a1), "%u", ntohs(port->port[0])); snprintf(a2, sizeof(a2), "%u", ntohs(port->port[1])); if (!port->op) ; else if (port->op == PF_OP_IRG) snprintf(op, sizeof(op), "%s><%s", a1, a2); else if (port->op == PF_OP_XRG) snprintf(op, sizeof(op), "%s<>%s", a1, a2); else if (port->op == PF_OP_EQ) snprintf(op, sizeof(op), "%s", a1); else if (port->op == PF_OP_NE) snprintf(op, sizeof(op), "!=%s", a1); else if (port->op == PF_OP_LT) snprintf(op, sizeof(op), "<%s", a1); else if (port->op == PF_OP_LE) snprintf(op, sizeof(op), "<=%s", a1); else if (port->op == PF_OP_GT) snprintf(op, sizeof(op), ">%s", a1); else if (port->op == PF_OP_GE) snprintf(op, sizeof(op), ">=%s", a1); expand_label_str(label, len, name, op); } } void expand_label_proto(const char *name, char *label, size_t len, u_int8_t proto) { struct protoent *pe; char n[4]; if (strstr(label, name) != NULL) { pe = getprotobynumber(proto); if (pe != NULL) expand_label_str(label, len, name, pe->p_name); else { snprintf(n, sizeof(n), "%u", proto); expand_label_str(label, len, name, n); } } } void expand_label_nr(const char *name, char *label, size_t len) { char n[11]; if (strstr(label, name) != NULL) { snprintf(n, sizeof(n), "%u", pf->anchor->match); expand_label_str(label, len, name, n); } } void expand_label(char *label, size_t len, const char *ifname, sa_family_t af, struct node_host *src_host, struct node_port *src_port, struct node_host *dst_host, struct node_port *dst_port, u_int8_t proto) { expand_label_if("$if", label, len, ifname); expand_label_addr("$srcaddr", label, len, af, src_host); expand_label_addr("$dstaddr", label, len, af, dst_host); expand_label_port("$srcport", label, len, src_port); expand_label_port("$dstport", label, len, dst_port); expand_label_proto("$proto", label, len, proto); expand_label_nr("$nr", label, len); } int expand_altq(struct pf_altq *a, struct node_if *interfaces, struct node_queue *nqueues, struct node_queue_bw bwspec, struct node_queue_opt *opts) { struct pf_altq pa, pb; char qname[PF_QNAME_SIZE]; struct node_queue *n; struct node_queue_bw bw; int errs = 0; if ((pf->loadopt & PFCTL_FLAG_ALTQ) == 0) { FREE_LIST(struct node_if, interfaces); if (nqueues) FREE_LIST(struct node_queue, nqueues); return (0); } LOOP_THROUGH(struct node_if, interface, interfaces, memcpy(&pa, a, sizeof(struct pf_altq)); if (strlcpy(pa.ifname, interface->ifname, sizeof(pa.ifname)) >= sizeof(pa.ifname)) errx(1, "expand_altq: strlcpy"); if (interface->not) { yyerror("altq on ! 
is not supported"); errs++; } else { if (eval_pfaltq(pf, &pa, &bwspec, opts)) errs++; else if (pfctl_add_altq(pf, &pa)) errs++; if (pf->opts & PF_OPT_VERBOSE) { print_altq(&pf->paltq->altq, 0, &bwspec, opts); if (nqueues && nqueues->tail) { printf("queue { "); LOOP_THROUGH(struct node_queue, queue, nqueues, printf("%s ", queue->queue); ); printf("}"); } printf("\n"); } if (pa.scheduler == ALTQT_CBQ || pa.scheduler == ALTQT_HFSC) { /* now create a root queue */ memset(&pb, 0, sizeof(struct pf_altq)); if (strlcpy(qname, "root_", sizeof(qname)) >= sizeof(qname)) errx(1, "expand_altq: strlcpy"); if (strlcat(qname, interface->ifname, sizeof(qname)) >= sizeof(qname)) errx(1, "expand_altq: strlcat"); if (strlcpy(pb.qname, qname, sizeof(pb.qname)) >= sizeof(pb.qname)) errx(1, "expand_altq: strlcpy"); if (strlcpy(pb.ifname, interface->ifname, sizeof(pb.ifname)) >= sizeof(pb.ifname)) errx(1, "expand_altq: strlcpy"); pb.qlimit = pa.qlimit; pb.scheduler = pa.scheduler; bw.bw_absolute = pa.ifbandwidth; bw.bw_percent = 0; if (eval_pfqueue(pf, &pb, &bw, opts)) errs++; else if (pfctl_add_altq(pf, &pb)) errs++; } LOOP_THROUGH(struct node_queue, queue, nqueues, n = calloc(1, sizeof(struct node_queue)); if (n == NULL) err(1, "expand_altq: calloc"); if (pa.scheduler == ALTQT_CBQ || pa.scheduler == ALTQT_HFSC) if (strlcpy(n->parent, qname, sizeof(n->parent)) >= sizeof(n->parent)) errx(1, "expand_altq: strlcpy"); if (strlcpy(n->queue, queue->queue, sizeof(n->queue)) >= sizeof(n->queue)) errx(1, "expand_altq: strlcpy"); if (strlcpy(n->ifname, interface->ifname, sizeof(n->ifname)) >= sizeof(n->ifname)) errx(1, "expand_altq: strlcpy"); n->scheduler = pa.scheduler; n->next = NULL; n->tail = n; if (queues == NULL) queues = n; else { queues->tail->next = n; queues->tail = n; } ); } ); FREE_LIST(struct node_if, interfaces); if (nqueues) FREE_LIST(struct node_queue, nqueues); return (errs); } int expand_queue(struct pf_altq *a, struct node_if *interfaces, struct node_queue *nqueues, struct node_queue_bw bwspec, struct node_queue_opt *opts) { struct node_queue *n, *nq; struct pf_altq pa; u_int8_t found = 0; u_int8_t errs = 0; if ((pf->loadopt & PFCTL_FLAG_ALTQ) == 0) { FREE_LIST(struct node_queue, nqueues); return (0); } if (queues == NULL) { yyerror("queue %s has no parent", a->qname); FREE_LIST(struct node_queue, nqueues); return (1); } LOOP_THROUGH(struct node_if, interface, interfaces, LOOP_THROUGH(struct node_queue, tqueue, queues, if (!strncmp(a->qname, tqueue->queue, PF_QNAME_SIZE) && (interface->ifname[0] == 0 || (!interface->not && !strncmp(interface->ifname, tqueue->ifname, IFNAMSIZ)) || (interface->not && strncmp(interface->ifname, tqueue->ifname, IFNAMSIZ)))) { /* found ourself in queues */ found++; memcpy(&pa, a, sizeof(struct pf_altq)); if (pa.scheduler != ALTQT_NONE && pa.scheduler != tqueue->scheduler) { yyerror("exactly one scheduler type " "per interface allowed"); return (1); } pa.scheduler = tqueue->scheduler; /* scheduler dependent error checking */ switch (pa.scheduler) { case ALTQT_PRIQ: if (nqueues != NULL) { yyerror("priq queues cannot " "have child queues"); return (1); } if (bwspec.bw_absolute > 0 || bwspec.bw_percent < 100) { yyerror("priq doesn't take " "bandwidth"); return (1); } break; default: break; } if (strlcpy(pa.ifname, tqueue->ifname, sizeof(pa.ifname)) >= sizeof(pa.ifname)) errx(1, "expand_queue: strlcpy"); if (strlcpy(pa.parent, tqueue->parent, sizeof(pa.parent)) >= sizeof(pa.parent)) errx(1, "expand_queue: strlcpy"); if (eval_pfqueue(pf, &pa, &bwspec, opts)) errs++; else if 
(pfctl_add_altq(pf, &pa)) errs++; for (nq = nqueues; nq != NULL; nq = nq->next) { if (!strcmp(a->qname, nq->queue)) { yyerror("queue cannot have " "itself as child"); errs++; continue; } n = calloc(1, sizeof(struct node_queue)); if (n == NULL) err(1, "expand_queue: calloc"); if (strlcpy(n->parent, a->qname, sizeof(n->parent)) >= sizeof(n->parent)) errx(1, "expand_queue strlcpy"); if (strlcpy(n->queue, nq->queue, sizeof(n->queue)) >= sizeof(n->queue)) errx(1, "expand_queue strlcpy"); if (strlcpy(n->ifname, tqueue->ifname, sizeof(n->ifname)) >= sizeof(n->ifname)) errx(1, "expand_queue strlcpy"); n->scheduler = tqueue->scheduler; n->next = NULL; n->tail = n; if (queues == NULL) queues = n; else { queues->tail->next = n; queues->tail = n; } } if ((pf->opts & PF_OPT_VERBOSE) && ( (found == 1 && interface->ifname[0] == 0) || (found > 0 && interface->ifname[0] != 0))) { print_queue(&pf->paltq->altq, 0, &bwspec, interface->ifname[0] != 0, opts); if (nqueues && nqueues->tail) { printf("{ "); LOOP_THROUGH(struct node_queue, queue, nqueues, printf("%s ", queue->queue); ); printf("}"); } printf("\n"); } } ); ); FREE_LIST(struct node_queue, nqueues); FREE_LIST(struct node_if, interfaces); if (!found) { yyerror("queue %s has no parent", a->qname); errs++; } if (errs) return (1); else return (0); } void expand_rule(struct pf_rule *r, struct node_if *interfaces, struct node_host *rpool_hosts, struct node_proto *protos, struct node_os *src_oses, struct node_host *src_hosts, struct node_port *src_ports, struct node_host *dst_hosts, struct node_port *dst_ports, struct node_uid *uids, struct node_gid *gids, struct node_icmp *icmp_types, const char *anchor_call) { sa_family_t af = r->af; int added = 0, error = 0; char ifname[IF_NAMESIZE]; char label[PF_RULE_LABEL_SIZE]; char tagname[PF_TAG_NAME_SIZE]; char match_tagname[PF_TAG_NAME_SIZE]; struct pf_pooladdr *pa; struct node_host *h; u_int8_t flags, flagset, keep_state; if (strlcpy(label, r->label, sizeof(label)) >= sizeof(label)) errx(1, "expand_rule: strlcpy"); if (strlcpy(tagname, r->tagname, sizeof(tagname)) >= sizeof(tagname)) errx(1, "expand_rule: strlcpy"); if (strlcpy(match_tagname, r->match_tagname, sizeof(match_tagname)) >= sizeof(match_tagname)) errx(1, "expand_rule: strlcpy"); flags = r->flags; flagset = r->flagset; keep_state = r->keep_state; LOOP_THROUGH(struct node_if, interface, interfaces, LOOP_THROUGH(struct node_proto, proto, protos, LOOP_THROUGH(struct node_icmp, icmp_type, icmp_types, LOOP_THROUGH(struct node_host, src_host, src_hosts, LOOP_THROUGH(struct node_port, src_port, src_ports, LOOP_THROUGH(struct node_os, src_os, src_oses, LOOP_THROUGH(struct node_host, dst_host, dst_hosts, LOOP_THROUGH(struct node_port, dst_port, dst_ports, LOOP_THROUGH(struct node_uid, uid, uids, LOOP_THROUGH(struct node_gid, gid, gids, r->af = af; /* for link-local IPv6 address, interface must match up */ if ((r->af && src_host->af && r->af != src_host->af) || (r->af && dst_host->af && r->af != dst_host->af) || (src_host->af && dst_host->af && src_host->af != dst_host->af) || (src_host->ifindex && dst_host->ifindex && src_host->ifindex != dst_host->ifindex) || (src_host->ifindex && *interface->ifname && src_host->ifindex != if_nametoindex(interface->ifname)) || (dst_host->ifindex && *interface->ifname && dst_host->ifindex != if_nametoindex(interface->ifname))) continue; if (!r->af && src_host->af) r->af = src_host->af; else if (!r->af && dst_host->af) r->af = dst_host->af; if (*interface->ifname) strlcpy(r->ifname, interface->ifname, sizeof(r->ifname)); else if 
(if_indextoname(src_host->ifindex, ifname)) strlcpy(r->ifname, ifname, sizeof(r->ifname)); else if (if_indextoname(dst_host->ifindex, ifname)) strlcpy(r->ifname, ifname, sizeof(r->ifname)); else memset(r->ifname, '\0', sizeof(r->ifname)); if (strlcpy(r->label, label, sizeof(r->label)) >= sizeof(r->label)) errx(1, "expand_rule: strlcpy"); if (strlcpy(r->tagname, tagname, sizeof(r->tagname)) >= sizeof(r->tagname)) errx(1, "expand_rule: strlcpy"); if (strlcpy(r->match_tagname, match_tagname, sizeof(r->match_tagname)) >= sizeof(r->match_tagname)) errx(1, "expand_rule: strlcpy"); expand_label(r->label, PF_RULE_LABEL_SIZE, r->ifname, r->af, src_host, src_port, dst_host, dst_port, proto->proto); expand_label(r->tagname, PF_TAG_NAME_SIZE, r->ifname, r->af, src_host, src_port, dst_host, dst_port, proto->proto); expand_label(r->match_tagname, PF_TAG_NAME_SIZE, r->ifname, r->af, src_host, src_port, dst_host, dst_port, proto->proto); error += check_netmask(src_host, r->af); error += check_netmask(dst_host, r->af); r->ifnot = interface->not; r->proto = proto->proto; r->src.addr = src_host->addr; r->src.neg = src_host->not; r->src.port[0] = src_port->port[0]; r->src.port[1] = src_port->port[1]; r->src.port_op = src_port->op; r->dst.addr = dst_host->addr; r->dst.neg = dst_host->not; r->dst.port[0] = dst_port->port[0]; r->dst.port[1] = dst_port->port[1]; r->dst.port_op = dst_port->op; r->uid.op = uid->op; r->uid.uid[0] = uid->uid[0]; r->uid.uid[1] = uid->uid[1]; r->gid.op = gid->op; r->gid.gid[0] = gid->gid[0]; r->gid.gid[1] = gid->gid[1]; r->type = icmp_type->type; r->code = icmp_type->code; if ((keep_state == PF_STATE_MODULATE || keep_state == PF_STATE_SYNPROXY) && r->proto && r->proto != IPPROTO_TCP) r->keep_state = PF_STATE_NORMAL; else r->keep_state = keep_state; if (r->proto && r->proto != IPPROTO_TCP) { r->flags = 0; r->flagset = 0; } else { r->flags = flags; r->flagset = flagset; } if (icmp_type->proto && r->proto != icmp_type->proto) { yyerror("icmp-type mismatch"); error++; } if (src_os && src_os->os) { r->os_fingerprint = pfctl_get_fingerprint(src_os->os); if ((pf->opts & PF_OPT_VERBOSE2) && r->os_fingerprint == PF_OSFP_NOMATCH) fprintf(stderr, "warning: unknown '%s' OS fingerprint\n", src_os->os); } else { r->os_fingerprint = PF_OSFP_ANY; } TAILQ_INIT(&r->rpool.list); for (h = rpool_hosts; h != NULL; h = h->next) { pa = calloc(1, sizeof(struct pf_pooladdr)); if (pa == NULL) err(1, "expand_rule: calloc"); pa->addr = h->addr; if (h->ifname != NULL) { if (strlcpy(pa->ifname, h->ifname, sizeof(pa->ifname)) >= sizeof(pa->ifname)) errx(1, "expand_rule: strlcpy"); } else pa->ifname[0] = 0; TAILQ_INSERT_TAIL(&r->rpool.list, pa, entries); } if (rule_consistent(r, anchor_call[0]) < 0 || error) yyerror("skipping rule due to errors"); else { r->nr = pf->astack[pf->asd]->match++; pfctl_add_rule(pf, r, anchor_call); added++; } )))))))))); FREE_LIST(struct node_if, interfaces); FREE_LIST(struct node_proto, protos); FREE_LIST(struct node_host, src_hosts); FREE_LIST(struct node_port, src_ports); FREE_LIST(struct node_os, src_oses); FREE_LIST(struct node_host, dst_hosts); FREE_LIST(struct node_port, dst_ports); FREE_LIST(struct node_uid, uids); FREE_LIST(struct node_gid, gids); FREE_LIST(struct node_icmp, icmp_types); FREE_LIST(struct node_host, rpool_hosts); if (!added) yyerror("rule expands to no valid combination"); } int expand_skip_interface(struct node_if *interfaces) { int errs = 0; if (!interfaces || (!interfaces->next && !interfaces->not && !strcmp(interfaces->ifname, "none"))) { if (pf->opts & 
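/*
 * expand_rule() above emits the cross product of every list
 * dimension; a single line such as (illustrative)
 *
 *   pass in on { em0, em1 } proto { tcp, udp } to port { 22, 80 }
 *
 * expands to 2 * 2 * 2 = 8 kernel rules, one per
 * interface/protocol/port combination, via the nested LOOP_THROUGH
 * invocations.
 */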
PF_OPT_VERBOSE) printf("set skip on none\n"); errs = pfctl_set_interface_flags(pf, "", PFI_IFLAG_SKIP, 0); return (errs); } if (pf->opts & PF_OPT_VERBOSE) printf("set skip on {"); LOOP_THROUGH(struct node_if, interface, interfaces, if (pf->opts & PF_OPT_VERBOSE) printf(" %s", interface->ifname); if (interface->not) { yyerror("skip on ! is not supported"); errs++; } else errs += pfctl_set_interface_flags(pf, interface->ifname, PFI_IFLAG_SKIP, 1); ); if (pf->opts & PF_OPT_VERBOSE) printf(" }\n"); FREE_LIST(struct node_if, interfaces); if (errs) return (1); else return (0); } #undef FREE_LIST #undef LOOP_THROUGH int check_rulestate(int desired_state) { if (require_order && (rulestate > desired_state)) { yyerror("Rules must be in order: options, normalization, " "queueing, translation, filtering"); return (1); } rulestate = desired_state; return (0); } int kw_cmp(const void *k, const void *e) { return (strcmp(k, ((const struct keywords *)e)->k_name)); } int lookup(char *s) { /* this has to be sorted always */ static const struct keywords keywords[] = { { "all", ALL}, { "allow-opts", ALLOWOPTS}, { "altq", ALTQ}, { "anchor", ANCHOR}, { "antispoof", ANTISPOOF}, { "any", ANY}, { "bandwidth", BANDWIDTH}, { "binat", BINAT}, { "binat-anchor", BINATANCHOR}, { "bitmask", BITMASK}, { "block", BLOCK}, { "block-policy", BLOCKPOLICY}, { "buckets", BUCKETS}, { "cbq", CBQ}, { "code", CODE}, { "codelq", CODEL}, { "crop", FRAGCROP}, { "debug", DEBUG}, { "divert-reply", DIVERTREPLY}, { "divert-to", DIVERTTO}, { "drop", DROP}, { "drop-ovl", FRAGDROP}, { "dup-to", DUPTO}, { "fairq", FAIRQ}, { "fastroute", FASTROUTE}, { "file", FILENAME}, { "fingerprints", FINGERPRINTS}, { "flags", FLAGS}, { "floating", FLOATING}, { "flush", FLUSH}, { "for", FOR}, { "fragment", FRAGMENT}, { "from", FROM}, { "global", GLOBAL}, { "group", GROUP}, { "hfsc", HFSC}, { "hogs", HOGS}, { "hostid", HOSTID}, { "icmp-type", ICMPTYPE}, { "icmp6-type", ICMP6TYPE}, { "if-bound", IFBOUND}, { "in", IN}, { "include", INCLUDE}, { "inet", INET}, { "inet6", INET6}, { "interval", INTERVAL}, { "keep", KEEP}, { "label", LABEL}, { "limit", LIMIT}, { "linkshare", LINKSHARE}, { "load", LOAD}, { "log", LOG}, { "loginterface", LOGINTERFACE}, { "max", MAXIMUM}, { "max-mss", MAXMSS}, { "max-src-conn", MAXSRCCONN}, { "max-src-conn-rate", MAXSRCCONNRATE}, { "max-src-nodes", MAXSRCNODES}, { "max-src-states", MAXSRCSTATES}, { "min-ttl", MINTTL}, { "modulate", MODULATE}, { "nat", NAT}, { "nat-anchor", NATANCHOR}, { "no", NO}, { "no-df", NODF}, { "no-route", NOROUTE}, { "no-sync", NOSYNC}, { "on", ON}, { "optimization", OPTIMIZATION}, { "os", OS}, { "out", OUT}, { "overload", OVERLOAD}, { "pass", PASS}, { "port", PORT}, { "prio", PRIO}, { "priority", PRIORITY}, { "priq", PRIQ}, { "probability", PROBABILITY}, { "proto", PROTO}, { "qlimit", QLIMIT}, { "queue", QUEUE}, { "quick", QUICK}, { "random", RANDOM}, { "random-id", RANDOMID}, { "rdr", RDR}, { "rdr-anchor", RDRANCHOR}, { "realtime", REALTIME}, { "reassemble", REASSEMBLE}, { "reply-to", REPLYTO}, { "require-order", REQUIREORDER}, { "return", RETURN}, { "return-icmp", RETURNICMP}, { "return-icmp6", RETURNICMP6}, { "return-rst", RETURNRST}, { "round-robin", ROUNDROBIN}, { "route", ROUTE}, { "route-to", ROUTETO}, { "rtable", RTABLE}, { "rule", RULE}, { "ruleset-optimization", RULESET_OPTIMIZATION}, { "scrub", SCRUB}, { "set", SET}, { "set-tos", SETTOS}, { "skip", SKIP}, { "sloppy", SLOPPY}, { "source-hash", SOURCEHASH}, { "source-track", SOURCETRACK}, { "state", STATE}, { "state-defaults", STATEDEFAULTS}, { 
"state-policy", STATEPOLICY}, { "static-port", STATICPORT}, { "sticky-address", STICKYADDRESS}, { "synproxy", SYNPROXY}, { "table", TABLE}, { "tag", TAG}, { "tagged", TAGGED}, { "target", TARGET}, { "tbrsize", TBRSIZE}, { "timeout", TIMEOUT}, { "to", TO}, { "tos", TOS}, { "ttl", TTL}, { "upperlimit", UPPERLIMIT}, { "urpf-failed", URPFFAILED}, { "user", USER}, }; const struct keywords *p; p = bsearch(s, keywords, sizeof(keywords)/sizeof(keywords[0]), sizeof(keywords[0]), kw_cmp); if (p) { if (debug > 1) fprintf(stderr, "%s: %d\n", s, p->k_val); return (p->k_val); } else { if (debug > 1) fprintf(stderr, "string: %s\n", s); return (STRING); } } #define MAXPUSHBACK 128 char *parsebuf; int parseindex; char pushback_buffer[MAXPUSHBACK]; int pushback_index = 0; int lgetc(int quotec) { int c, next; if (parsebuf) { /* Read character from the parsebuffer instead of input. */ if (parseindex >= 0) { c = parsebuf[parseindex++]; if (c != '\0') return (c); parsebuf = NULL; } else parseindex++; } if (pushback_index) return (pushback_buffer[--pushback_index]); if (quotec) { if ((c = getc(file->stream)) == EOF) { yyerror("reached end of file while parsing quoted string"); if (popfile() == EOF) return (EOF); return (quotec); } return (c); } while ((c = getc(file->stream)) == '\\') { next = getc(file->stream); if (next != '\n') { c = next; break; } yylval.lineno = file->lineno; file->lineno++; } while (c == EOF) { if (popfile() == EOF) return (EOF); c = getc(file->stream); } return (c); } int lungetc(int c) { if (c == EOF) return (EOF); if (parsebuf) { parseindex--; if (parseindex >= 0) return (c); } if (pushback_index < MAXPUSHBACK-1) return (pushback_buffer[pushback_index++] = c); else return (EOF); } int findeol(void) { int c; parsebuf = NULL; /* skip to either EOF or the first real EOL */ while (1) { if (pushback_index) c = pushback_buffer[--pushback_index]; else c = lgetc(0); if (c == '\n') { file->lineno++; break; } if (c == EOF) break; } return (ERROR); } int yylex(void) { char buf[8096]; char *p, *val; int quotec, next, c; int token; top: p = buf; while ((c = lgetc(0)) == ' ' || c == '\t') ; /* nothing */ yylval.lineno = file->lineno; if (c == '#') while ((c = lgetc(0)) != '\n' && c != EOF) ; /* nothing */ if (c == '$' && parsebuf == NULL) { while (1) { if ((c = lgetc(0)) == EOF) return (0); if (p + 1 >= buf + sizeof(buf) - 1) { yyerror("string too long"); return (findeol()); } if (isalnum(c) || c == '_') { *p++ = (char)c; continue; } *p = '\0'; lungetc(c); break; } val = symget(buf); if (val == NULL) { yyerror("macro '%s' not defined", buf); return (findeol()); } parsebuf = val; parseindex = 0; goto top; } switch (c) { case '\'': case '"': quotec = c; while (1) { if ((c = lgetc(quotec)) == EOF) return (0); if (c == '\n') { file->lineno++; continue; } else if (c == '\\') { if ((next = lgetc(quotec)) == EOF) return (0); if (next == quotec || c == ' ' || c == '\t') c = next; else if (next == '\n') continue; else lungetc(next); } else if (c == quotec) { *p = '\0'; break; } if (p + 1 >= buf + sizeof(buf) - 1) { yyerror("string too long"); return (findeol()); } *p++ = (char)c; } yylval.v.string = strdup(buf); if (yylval.v.string == NULL) err(1, "yylex: strdup"); return (STRING); case '<': next = lgetc(0); if (next == '>') { yylval.v.i = PF_OP_XRG; return (PORTBINARY); } lungetc(next); break; case '>': next = lgetc(0); if (next == '<') { yylval.v.i = PF_OP_IRG; return (PORTBINARY); } lungetc(next); break; case '-': next = lgetc(0); if (next == '>') return (ARROW); lungetc(next); break; } #define 
allowed_to_end_number(x) \ (isspace(x) || x == ')' || x ==',' || x == '/' || x == '}' || x == '=') if (c == '-' || isdigit(c)) { do { *p++ = c; if ((unsigned)(p-buf) >= sizeof(buf)) { yyerror("string too long"); return (findeol()); } } while ((c = lgetc(0)) != EOF && isdigit(c)); lungetc(c); if (p == buf + 1 && buf[0] == '-') goto nodigits; if (c == EOF || allowed_to_end_number(c)) { const char *errstr = NULL; *p = '\0'; yylval.v.number = strtonum(buf, LLONG_MIN, LLONG_MAX, &errstr); if (errstr) { yyerror("\"%s\" invalid number: %s", buf, errstr); return (findeol()); } return (NUMBER); } else { nodigits: while (p > buf + 1) lungetc(*--p); c = *--p; if (c == '-') return (c); } } #define allowed_in_string(x) \ (isalnum(x) || (ispunct(x) && x != '(' && x != ')' && \ x != '{' && x != '}' && x != '<' && x != '>' && \ x != '!' && x != '=' && x != '/' && x != '#' && \ x != ',')) if (isalnum(c) || c == ':' || c == '_') { do { *p++ = c; if ((unsigned)(p-buf) >= sizeof(buf)) { yyerror("string too long"); return (findeol()); } } while ((c = lgetc(0)) != EOF && (allowed_in_string(c))); lungetc(c); *p = '\0'; if ((token = lookup(buf)) == STRING) if ((yylval.v.string = strdup(buf)) == NULL) err(1, "yylex: strdup"); return (token); } if (c == '\n') { yylval.lineno = file->lineno; file->lineno++; } if (c == EOF) return (0); return (c); } int check_file_secrecy(int fd, const char *fname) { struct stat st; if (fstat(fd, &st)) { warn("cannot stat %s", fname); return (-1); } if (st.st_uid != 0 && st.st_uid != getuid()) { warnx("%s: owner not root or current user", fname); return (-1); } if (st.st_mode & (S_IRWXG | S_IRWXO)) { warnx("%s: group/world readable/writeable", fname); return (-1); } return (0); } struct file * pushfile(const char *name, int secret) { struct file *nfile; if ((nfile = calloc(1, sizeof(struct file))) == NULL || (nfile->name = strdup(name)) == NULL) { warn("malloc"); return (NULL); } if (TAILQ_FIRST(&files) == NULL && strcmp(nfile->name, "-") == 0) { nfile->stream = stdin; free(nfile->name); if ((nfile->name = strdup("stdin")) == NULL) { warn("strdup"); free(nfile); return (NULL); } } else if ((nfile->stream = fopen(nfile->name, "r")) == NULL) { warn("%s", nfile->name); free(nfile->name); free(nfile); return (NULL); } else if (secret && check_file_secrecy(fileno(nfile->stream), nfile->name)) { fclose(nfile->stream); free(nfile->name); free(nfile); return (NULL); } nfile->lineno = 1; TAILQ_INSERT_TAIL(&files, nfile, entry); return (nfile); } int popfile(void) { struct file *prev; if ((prev = TAILQ_PREV(file, files, entry)) != NULL) { prev->errors += file->errors; TAILQ_REMOVE(&files, file, entry); fclose(file->stream); free(file->name); free(file); file = prev; return (0); } return (EOF); } int parse_config(char *filename, struct pfctl *xpf) { int errors = 0; struct sym *sym; pf = xpf; errors = 0; rulestate = PFCTL_STATE_NONE; returnicmpdefault = (ICMP_UNREACH << 8) | ICMP_UNREACH_PORT; returnicmp6default = (ICMP6_DST_UNREACH << 8) | ICMP6_DST_UNREACH_NOPORT; blockpolicy = PFRULE_DROP; require_order = 1; if ((file = pushfile(filename, 0)) == NULL) { warn("cannot open the main config file!"); return (-1); } yyparse(); errors = file->errors; popfile(); /* Free macros and check which have not been used. */ while ((sym = TAILQ_FIRST(&symhead))) { if ((pf->opts & PF_OPT_VERBOSE2) && !sym->used) fprintf(stderr, "warning: macro '%s' not " "used\n", sym->nam); free(sym->nam); free(sym->val); TAILQ_REMOVE(&symhead, sym, entry); free(sym); } return (errors ? 
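/*
 * Macro expansion is handled by the '$' case in yylex() above, e.g.
 * (illustrative):
 *
 *   ext_if = "em0"
 *   block in on $ext_if all
 *
 * The stored value is re-lexed through parsebuf; note that the '$'
 * case only fires while parsebuf is NULL, so a macro body is not
 * itself expanded recursively.
 */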
-1 : 0); } int symset(const char *nam, const char *val, int persist) { struct sym *sym; for (sym = TAILQ_FIRST(&symhead); sym && strcmp(nam, sym->nam); sym = TAILQ_NEXT(sym, entry)) ; /* nothing */ if (sym != NULL) { if (sym->persist == 1) return (0); else { free(sym->nam); free(sym->val); TAILQ_REMOVE(&symhead, sym, entry); free(sym); } } if ((sym = calloc(1, sizeof(*sym))) == NULL) return (-1); sym->nam = strdup(nam); if (sym->nam == NULL) { free(sym); return (-1); } sym->val = strdup(val); if (sym->val == NULL) { free(sym->nam); free(sym); return (-1); } sym->used = 0; sym->persist = persist; TAILQ_INSERT_TAIL(&symhead, sym, entry); return (0); } int pfctl_cmdline_symset(char *s) { char *sym, *val; int ret; if ((val = strrchr(s, '=')) == NULL) return (-1); if ((sym = malloc(strlen(s) - strlen(val) + 1)) == NULL) err(1, "pfctl_cmdline_symset: malloc"); strlcpy(sym, s, strlen(s) - strlen(val) + 1); ret = symset(sym, val + 1, 1); free(sym); return (ret); } char * symget(const char *nam) { struct sym *sym; TAILQ_FOREACH(sym, &symhead, entry) if (strcmp(nam, sym->nam) == 0) { sym->used = 1; return (sym->val); } return (NULL); } void mv_rules(struct pf_ruleset *src, struct pf_ruleset *dst) { int i; struct pf_rule *r; for (i = 0; i < PF_RULESET_MAX; ++i) { while ((r = TAILQ_FIRST(src->rules[i].active.ptr)) != NULL) { TAILQ_REMOVE(src->rules[i].active.ptr, r, entries); TAILQ_INSERT_TAIL(dst->rules[i].active.ptr, r, entries); dst->anchor->match++; } src->anchor->match = 0; while ((r = TAILQ_FIRST(src->rules[i].inactive.ptr)) != NULL) { TAILQ_REMOVE(src->rules[i].inactive.ptr, r, entries); TAILQ_INSERT_TAIL(dst->rules[i].inactive.ptr, r, entries); } } } void decide_address_family(struct node_host *n, sa_family_t *af) { if (*af != 0 || n == NULL) return; *af = n->af; while ((n = n->next) != NULL) { if (n->af != *af) { *af = 0; return; } } } void remove_invalid_hosts(struct node_host **nh, sa_family_t *af) { struct node_host *n = *nh, *prev = NULL; while (n != NULL) { if (*af && n->af && n->af != *af) { /* unlink and free n */ struct node_host *next = n->next; /* adjust tail pointer */ if (n == (*nh)->tail) (*nh)->tail = prev; /* adjust previous node's next pointer */ if (prev == NULL) *nh = next; else prev->next = next; /* free node */ if (n->ifname != NULL) free(n->ifname); free(n); n = next; } else { if (n->af && !*af) *af = n->af; prev = n; n = n->next; } } } int invalid_redirect(struct node_host *nh, sa_family_t af) { if (!af) { struct node_host *n; /* tables and dyniftl are ok without an address family */ for (n = nh; n != NULL; n = n->next) { if (n->addr.type != PF_ADDR_TABLE && n->addr.type != PF_ADDR_DYNIFTL) { yyerror("address family not given and " "translation address expands to multiple " "address families"); return (1); } } } if (nh == NULL) { yyerror("no translation address with matching address family " "found."); return (1); } return (0); } int atoul(char *s, u_long *ulvalp) { u_long ulval; char *ep; errno = 0; ulval = strtoul(s, &ep, 0); if (s[0] == '\0' || *ep != '\0') return (-1); if (errno == ERANGE && ulval == ULONG_MAX) return (-1); *ulvalp = ulval; return (0); } int getservice(char *n) { struct servent *s; u_long ulval; if (atoul(n, &ulval) == 0) { if (ulval > 65535) { yyerror("illegal port value %lu", ulval); return (-1); } return (htons(ulval)); } else { s = getservbyname(n, "tcp"); if (s == NULL) s = getservbyname(n, "udp"); if (s == NULL) { yyerror("unknown port %s", n); return (-1); } return (s->s_port); } } int rule_label(struct pf_rule *r, char *s) { if (s) { if 
(strlcpy(r->label, s, sizeof(r->label)) >= sizeof(r->label)) { yyerror("rule label too long (max %d chars)", sizeof(r->label)-1); return (-1); } } return (0); } u_int16_t parseicmpspec(char *w, sa_family_t af) { const struct icmpcodeent *p; u_long ulval; u_int8_t icmptype; if (af == AF_INET) icmptype = returnicmpdefault >> 8; else icmptype = returnicmp6default >> 8; if (atoul(w, &ulval) == -1) { if ((p = geticmpcodebyname(icmptype, w, af)) == NULL) { yyerror("unknown icmp code %s", w); return (0); } ulval = p->code; } if (ulval > 255) { yyerror("invalid icmp code %lu", ulval); return (0); } return (icmptype << 8 | ulval); } int parseport(char *port, struct range *r, int extensions) { char *p = strchr(port, ':'); if (p == NULL) { if ((r->a = getservice(port)) == -1) return (-1); r->b = 0; r->t = PF_OP_NONE; return (0); } if ((extensions & PPORT_STAR) && !strcmp(p+1, "*")) { *p = 0; if ((r->a = getservice(port)) == -1) return (-1); r->b = 0; r->t = PF_OP_IRG; return (0); } if ((extensions & PPORT_RANGE)) { *p++ = 0; if ((r->a = getservice(port)) == -1 || (r->b = getservice(p)) == -1) return (-1); if (r->a == r->b) { r->b = 0; r->t = PF_OP_NONE; } else r->t = PF_OP_RRG; return (0); } return (-1); } int pfctl_load_anchors(int dev, struct pfctl *pf, struct pfr_buffer *trans) { struct loadanchors *la; TAILQ_FOREACH(la, &loadanchorshead, entries) { if (pf->opts & PF_OPT_VERBOSE) fprintf(stderr, "\nLoading anchor %s from %s\n", la->anchorname, la->filename); if (pfctl_rules(dev, la->filename, pf->opts, pf->optimize, la->anchorname, trans) == -1) return (-1); } return (0); } int rt_tableid_max(void) { #ifdef __FreeBSD__ int fibs; size_t l = sizeof(fibs); if (sysctlbyname("net.fibs", &fibs, &l, NULL, 0) == -1) fibs = 16; /* XXX RT_MAXFIBS, at least limit it some. */ /* * As the OpenBSD code only compares > and not >= we need to adjust * here given we only accept values of 0..n and want to avoid #ifdefs * in the grammar. */ return (fibs - 1); #else return (RT_TABLEID_MAX); #endif } Index: user/alc/PQ_LAUNDRY/sys/arm64/arm64/pmap.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/arm64/arm64/pmap.c (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/arm64/arm64/pmap.c (revision 303667) @@ -1,3595 +1,3595 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * Copyright (c) 2014 Andrew Turner * All rights reserved. * Copyright (c) 2014-2016 The FreeBSD Foundation * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * This software was developed by Andrew Turner under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. 
This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t))) #define NUL0E L0_ENTRIES #define NUL1E (NUL0E * NL1PG) #define NUL2E (NUL1E * NL2PG) #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif /* * These are configured by the mair_el1 register. This is set up in locore.S */ #define DEVICE_MEMORY 0 #define UNCACHED_MEMORY 1 #define CACHED_MEMORY 2 #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ vm_offset_t kernel_vm_end = 0; struct msgbuf *msgbufp = NULL; static struct rwlock_padalign pvh_global_lock; vm_paddr_t dmap_phys_base; /* The start of the dmap region */ vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ /* This code assumes all L1 DMAP entries will be used */ CTASSERT((DMAP_MIN_ADDRESS & ~L0_OFFSET) == DMAP_MIN_ADDRESS); CTASSERT((DMAP_MAX_ADDRESS & ~L0_OFFSET) == DMAP_MAX_ADDRESS); #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) extern pt_entry_t pagetable_dmap[]; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static int 
pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_l3(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); /* * These load the old table data and store the new value. * They need to be atomic as the System MMU may write to the table at * the same time as the CPU. */ #define pmap_load_store(table, entry) atomic_swap_64(table, entry) #define pmap_set(table, mask) atomic_set_64(table, mask) #define pmap_load_clear(table) atomic_swap_64(table, 0) #define pmap_load(table) (*table) /********************/ /* Inline functions */ /********************/ static __inline void pagecopy(void *s, void *d) { memcpy(d, s, PAGE_SIZE); } #define pmap_l0_index(va) (((va) >> L0_SHIFT) & L0_ADDR_MASK) #define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK) #define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK) #define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK) static __inline pd_entry_t * pmap_l0(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_l0[pmap_l0_index(va)]); } static __inline pd_entry_t * pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) { pd_entry_t *l1; l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); return (&l1[pmap_l1_index(va)]); } static __inline pd_entry_t * pmap_l1(pmap_t pmap, vm_offset_t va) { pd_entry_t *l0; l0 = pmap_l0(pmap, va); if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE) return (NULL); return (pmap_l0_to_l1(l0, va)); } static __inline pd_entry_t * pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) { pd_entry_t *l2; l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); return (&l2[pmap_l2_index(va)]); } static __inline pd_entry_t * pmap_l2(pmap_t pmap, vm_offset_t va) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE) return (NULL); return (pmap_l1_to_l2(l1, va)); } static __inline pt_entry_t * pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) { pt_entry_t *l3; l3 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK); return (&l3[pmap_l3_index(va)]); } /* * Returns the lowest valid pde for a given virtual address. * The next level may or may not point to a valid page or block. */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l0, *l1, *l2, desc; l0 = pmap_l0(pmap, va); desc = pmap_load(l0) & ATTR_DESCR_MASK; if (desc != L0_TABLE) { *level = -1; return (NULL); } l1 = pmap_l0_to_l1(l0, va); desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc != L1_TABLE) { *level = 0; return (l0); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc != L2_TABLE) { *level = 1; return (l1); } *level = 2; return (l2); } /* * Returns the lowest valid pte block or table entry for a given virtual * address. If there are no valid entries return NULL and set the level to * the first invalid level. 
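 */

/*
 * Illustrative sketch: pmap_pde() above stops at the lowest valid
 * directory entry, and callers such as pmap_kenter() check the returned
 * level before descending.  A level of 2 means a valid L2 table entry
 * exists, so the L3 page table for va can be reached with
 * pmap_l2_to_l3().  Hypothetical helper, assuming only base-page (L3)
 * mappings are of interest:
 */
static __inline pt_entry_t *
pmap_l3_lookup_sketch(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pde;
	int lvl;

	pde = pmap_pde(pmap, va, &lvl);
	if (pde == NULL || lvl != 2)
		return (NULL);		/* no L3 table to index into */
	return (pmap_l2_to_l3(pde, va));
}

/*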
*/ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l1, *l2, desc; pt_entry_t *l3; l1 = pmap_l1(pmap, va); if (l1 == NULL) { *level = 0; return (NULL); } desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc == L1_BLOCK) { *level = 1; return (l1); } if (desc != L1_TABLE) { *level = 1; return (NULL); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc == L2_BLOCK) { *level = 2; return (l2); } if (desc != L2_TABLE) { *level = 2; return (NULL); } *level = 3; l3 = pmap_l2_to_l3(l2, va); if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE) return (NULL); return (l3); } bool pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1, pd_entry_t **l2, pt_entry_t **l3) { pd_entry_t *l0p, *l1p, *l2p; if (pmap->pm_l0 == NULL) return (false); l0p = pmap_l0(pmap, va); *l0 = l0p; if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) return (false); l1p = pmap_l0_to_l1(l0p, va); *l1 = l1p; if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) { *l2 = NULL; *l3 = NULL; return (true); } if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE) return (false); l2p = pmap_l1_to_l2(l1p, va); *l2 = l2p; if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) { *l3 = NULL; return (true); } *l3 = pmap_l2_to_l3(l2p, va); return (true); } static __inline int pmap_is_current(pmap_t pmap) { return ((pmap == pmap_kernel()) || (pmap == curthread->td_proc->p_vmspace->vm_map.pmap)); } static __inline int pmap_l3_valid(pt_entry_t l3) { return ((l3 & ATTR_DESCR_MASK) == L3_PAGE); } static __inline int pmap_l3_valid_cacheable(pt_entry_t l3) { return (((l3 & ATTR_DESCR_MASK) == L3_PAGE) && ((l3 & ATTR_IDX_MASK) == ATTR_IDX(CACHED_MEMORY))); } #define PTE_SYNC(pte) cpu_dcache_wb_range((vm_offset_t)pte, sizeof(*pte)) /* * Checks if the page is dirty. We currently lack proper tracking of this on * arm64 so for now assume that if a page mapped as rw was accessed, it is dirty.
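 *
 * Concretely, a mapping counts as dirty when it is both writable and
 * has been accessed: (pte & (ATTR_AF | ATTR_AP_RW_BIT)) ==
 * (ATTR_AF | ATTR_AP(ATTR_AP_RW)), which is the test below.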
*/ static inline int pmap_page_dirty(pt_entry_t pte) { return ((pte & (ATTR_AF | ATTR_AP_RW_BIT)) == (ATTR_AF | ATTR_AP(ATTR_AP_RW))); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } static pt_entry_t * pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, u_int *l2_slot) { pt_entry_t *l2; pd_entry_t *l1; l1 = (pd_entry_t *)l1pt; *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; /* Check locore has used a table L1 map */ KASSERT((l1[*l1_slot] & ATTR_DESCR_MASK) == L1_TABLE, ("Invalid bootstrap L1 table")); /* Find the address of the L2 table */ l2 = (pt_entry_t *)init_pt_va; *l2_slot = pmap_l2_index(va); return (l2); } static vm_paddr_t pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) { u_int l1_slot, l2_slot; pt_entry_t *l2; l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); return ((l2[l2_slot] & ~ATTR_MASK) + (va & L2_OFFSET)); } static void pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa) { vm_offset_t va; vm_paddr_t pa; u_int l1_slot; pa = dmap_phys_base = min_pa & ~L1_OFFSET; va = DMAP_MIN_ADDRESS; for (; va < DMAP_MAX_ADDRESS && pa < max_pa; pa += L1_SIZE, va += L1_SIZE, l1_slot++) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); pmap_load_store(&pagetable_dmap[l1_slot], (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_IDX(CACHED_MEMORY) | L1_BLOCK); } /* Set the upper limit of the DMAP region */ dmap_phys_max = pa; dmap_max_addr = va; cpu_dcache_wb_range((vm_offset_t)pagetable_dmap, PAGE_SIZE * DMAP_TABLES); cpu_tlb_flushID(); } static vm_offset_t pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start) { vm_offset_t l2pt; vm_paddr_t pa; pd_entry_t *l1; u_int l1_slot; KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address")); l1 = (pd_entry_t *)l1pt; l1_slot = pmap_l1_index(va); l2pt = l2_start; for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) { KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); pa = pmap_early_vtophys(l1pt, l2pt); pmap_load_store(&l1[l1_slot], (pa & ~Ln_TABLE_MASK) | L1_TABLE); l2pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l2_start, 0, l2pt - l2_start); cpu_dcache_wb_range(l2_start, l2pt - l2_start); /* Flush the l1 table to ram */ cpu_dcache_wb_range((vm_offset_t)l1, PAGE_SIZE); return l2pt; } static vm_offset_t pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) { vm_offset_t l2pt, l3pt; vm_paddr_t pa; pd_entry_t *l2; u_int l2_slot; KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); l2 = pmap_l2(kernel_pmap, va); l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE); l2pt = (vm_offset_t)l2; l2_slot = pmap_l2_index(va); l3pt = l3_start; for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); pa = pmap_early_vtophys(l1pt, l3pt); pmap_load_store(&l2[l2_slot], (pa & ~Ln_TABLE_MASK) | L2_TABLE); l3pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l3_start, 0, l3pt - l3_start); cpu_dcache_wb_range(l3_start, l3pt - l3_start); cpu_dcache_wb_range((vm_offset_t)l2, PAGE_SIZE); return l3pt; } /* * Bootstrap the system enough to run with virtual memory. 
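 */

/*
 * Illustrative sketch: the direct map created by pmap_bootstrap_dmap()
 * above exposes physical memory at a constant virtual offset, which is
 * what lets the table walkers use PHYS_TO_DMAP() rather than temporary
 * mappings.  Hypothetical helper showing the intended round-trip
 * invariant for any address inside the mapped range:
 */
static __inline bool
pmap_dmap_covers_sketch(vm_paddr_t pa)
{

	if (pa < dmap_phys_base || pa >= dmap_phys_max)
		return (false);		/* outside the direct map */
	return (DMAP_TO_PHYS(PHYS_TO_DMAP(pa)) == pa);
}

/*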
*/ void pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) { u_int l1_slot, l2_slot, avail_slot, map_slot, used_map_slot; uint64_t kern_delta; pt_entry_t *l2; vm_offset_t va, freemempos; vm_offset_t dpcpu, msgbufpv; vm_paddr_t pa, max_pa, min_pa; int i; kern_delta = KERNBASE - kernstart; physmem = 0; printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); printf("%lx\n", l1pt); printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK); /* Set this early so we can use the pagetable walking functions */ kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt; PMAP_LOCK_INIT(kernel_pmap); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); /* Assume the address we were loaded to is a valid physical address */ min_pa = max_pa = KERNBASE - kern_delta; /* * Find the minimum physical address. physmap is sorted, * but may contain empty ranges. */ for (i = 0; i < (physmap_idx * 2); i += 2) { if (physmap[i] == physmap[i + 1]) continue; if (physmap[i] <= min_pa) min_pa = physmap[i]; if (physmap[i + 1] > max_pa) max_pa = physmap[i + 1]; } /* Create a direct map region early so we can use it for pa -> va */ pmap_bootstrap_dmap(l1pt, min_pa, max_pa); va = KERNBASE; pa = KERNBASE - kern_delta; /* * Start to initialise phys_avail by copying from physmap * up to the physical address KERNBASE points at. */ map_slot = avail_slot = 0; for (; map_slot < (physmap_idx * 2) && avail_slot < (PHYS_AVAIL_SIZE - 2); map_slot += 2) { if (physmap[map_slot] == physmap[map_slot + 1]) continue; if (physmap[map_slot] <= pa && physmap[map_slot + 1] > pa) break; phys_avail[avail_slot] = physmap[map_slot]; phys_avail[avail_slot + 1] = physmap[map_slot + 1]; physmem += (phys_avail[avail_slot + 1] - phys_avail[avail_slot]) >> PAGE_SHIFT; avail_slot += 2; } /* Add the memory before the kernel */ if (physmap[avail_slot] < pa && avail_slot < (PHYS_AVAIL_SIZE - 2)) { phys_avail[avail_slot] = physmap[map_slot]; phys_avail[avail_slot + 1] = pa; physmem += (phys_avail[avail_slot + 1] - phys_avail[avail_slot]) >> PAGE_SHIFT; avail_slot += 2; } used_map_slot = map_slot; /* * Read the page table to find out what is already mapped. * This assumes we have mapped a block of memory from KERNBASE * using a single L1 entry. */ l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); /* Sanity check the index, KERNBASE should be the first VA */ KASSERT(l2_slot == 0, ("The L2 index is non-zero")); /* Find how many pages we have mapped */ for (; l2_slot < Ln_ENTRIES; l2_slot++) { if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0) break; /* Check locore used L2 blocks */ KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK, ("Invalid bootstrap L2 table")); KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa, ("Incorrect PA in L2 table")); va += L2_SIZE; pa += L2_SIZE; } va = roundup2(va, L1_SIZE); freemempos = KERNBASE + kernlen; freemempos = roundup2(freemempos, PAGE_SIZE); /* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */ freemempos = pmap_bootstrap_l2(l1pt, va, freemempos); /* And the l3 tables for the early devmap */ freemempos = pmap_bootstrap_l3(l1pt, VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos); cpu_tlb_flushID(); #define alloc_pages(var, np) \ (var) = freemempos; \ freemempos += (np * PAGE_SIZE); \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); /* Allocate dynamic per-cpu area. */ alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); dpcpu_init((void *)dpcpu, 0); /* Allocate memory for the msgbuf, e.g. 
for /sbin/dmesg */ alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); msgbufp = (void *)msgbufpv; virtual_avail = roundup2(freemempos, L1_SIZE); virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE; kernel_vm_end = virtual_avail; - + pa = pmap_early_vtophys(l1pt, freemempos); /* Finish initialising physmap */ map_slot = used_map_slot; for (; avail_slot < (PHYS_AVAIL_SIZE - 2) && map_slot < (physmap_idx * 2); map_slot += 2) { if (physmap[map_slot] == physmap[map_slot + 1]) continue; /* Have we used the current range? */ if (physmap[map_slot + 1] <= pa) continue; /* Do we need to split the entry? */ if (physmap[map_slot] < pa) { phys_avail[avail_slot] = pa; phys_avail[avail_slot + 1] = physmap[map_slot + 1]; } else { phys_avail[avail_slot] = physmap[map_slot]; phys_avail[avail_slot + 1] = physmap[map_slot + 1]; } physmem += (phys_avail[avail_slot + 1] - phys_avail[avail_slot]) >> PAGE_SHIFT; avail_slot += 2; } phys_avail[avail_slot] = 0; phys_avail[avail_slot + 1] = 0; /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". */ Maxmem = atop(phys_avail[avail_slot - 1]); cpu_tlb_flushID(); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { int i; /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); } /* * Invalidate a single TLB entry. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { sched_pin(); __asm __volatile( "dsb ishst \n" "tlbi vaae1is, %0 \n" "dsb ish \n" "isb \n" : : "r"(va >> PAGE_SHIFT)); sched_unpin(); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; sched_pin(); dsb(ishst); for (addr = sva; addr < eva; addr += PAGE_SIZE) { __asm __volatile( "tlbi vaae1is, %0" : : "r"(addr >> PAGE_SHIFT)); } __asm __volatile( "dsb ish \n" "isb \n"); sched_unpin(); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { sched_pin(); __asm __volatile( "dsb ishst \n" "tlbi vmalle1is \n" "dsb ish \n" "isb \n"); sched_unpin(); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ -vm_paddr_t +vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte, tpte; vm_paddr_t pa; int lvl; pa = 0; PMAP_LOCK(pmap); /* * Find the block or page map for this virtual address. pmap_pte * will return either a valid block/page entry, or NULL. 
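 *
 * The level returned in lvl says how large the mapping is, so the
 * switch below folds in the matching number of offset bits from va.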
*/ pte = pmap_pte(pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); pa = tpte & ~ATTR_MASK; switch(lvl) { case 1: KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_extract: Invalid L1 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L1_OFFSET); break; case 2: KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_extract: Invalid L2 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L2_OFFSET); break; case 3: KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("pmap_extract: Invalid L3 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L3_OFFSET); break; } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t *pte, tpte; vm_paddr_t pa; vm_page_t m; int lvl; pa = 0; m = NULL; PMAP_LOCK(pmap); retry: pte = pmap_pte(pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); KASSERT(lvl > 0 && lvl <= 3, ("pmap_extract_and_hold: Invalid level %d", lvl)); CTASSERT(L1_BLOCK == L2_BLOCK); KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) || (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK), ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl, tpte & ATTR_DESCR_MASK)); if (((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) || ((prot & VM_PROT_WRITE) == 0)) { if (vm_page_pa_tryrelock(pmap, tpte & ~ATTR_MASK, &pa)) goto retry; m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK); vm_page_hold(m); } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pt_entry_t *pte, tpte; vm_paddr_t pa; int lvl; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { pa = 0; pte = pmap_pte(kernel_pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); pa = tpte & ~ATTR_MASK; switch(lvl) { case 1: KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_kextract: Invalid L1 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L1_OFFSET); break; case 2: KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_kextract: Invalid L2 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L2_OFFSET); break; case 3: KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("pmap_kextract: Invalid L3 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L3_OFFSET); break; } } } return (pa); } /*************************************************** * Low level mapping routines..... ***************************************************/ static void pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) { pd_entry_t *pde; pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT((pa & L3_OFFSET) == 0, ("pmap_kenter: Invalid physical address")); KASSERT((sva & L3_OFFSET) == 0, ("pmap_kenter: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kenter: Mapping is not page-sized")); va = sva; while (size != 0) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_kenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl)); pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, (pa & ~L3_OFFSET) | ATTR_DEFAULT | ATTR_IDX(mode) | L3_PAGE); PTE_SYNC(pte); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } void pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) { pmap_kenter(sva, size, pa, DEVICE_MEMORY); } /* * Remove a page from the kernel pagetables. 
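 */

/*
 * Illustrative sketch: pmap_kenter_device() above and
 * pmap_kremove_device() below are used in pairs to create and tear down
 * an uncached device register window.  Hypothetical usage; va must come
 * from a KVA allocator and every argument must be page-aligned:
 */
static __inline void
pmap_device_window_sketch(vm_offset_t va, vm_paddr_t pa)
{

	pmap_kenter_device(va, PAGE_SIZE, pa);	/* map one device page */
	/* ... access the device registers through va ... */
	pmap_kremove_device(va, PAGE_SIZE);	/* unmap and invalidate */
}

/*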
*/ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; int lvl; pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("pmap_kremove: Invalid address")); KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl)); if (pmap_l3_valid_cacheable(pmap_load(pte))) cpu_dcache_wb_range(va, L3_SIZE); pmap_load_clear(pte); PTE_SYNC(pte); pmap_invalidate_page(kernel_pmap, va); } void pmap_kremove_device(vm_offset_t sva, vm_size_t size) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT((sva & L3_OFFSET) == 0, ("pmap_kremove_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kremove_device: Mapping is not page-sized")); va = sva; while (size != 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va)); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); pmap_load_clear(pte); PTE_SYNC(pte); va += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pd_entry_t *pde; pt_entry_t *pte, pa; vm_offset_t va; vm_page_t m; int i, lvl; va = sva; for (i = 0; i < count; i++) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_qenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_qenter: Invalid level %d", lvl)); m = ma[i]; pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_AP(ATTR_AP_RW) | ATTR_IDX(m->md.pv_memattr) | L3_PAGE; pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, pa); PTE_SYNC(pte); va += L3_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t sva, int count) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); va = sva; while (count-- > 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); if (pte != NULL) { if (pmap_l3_valid_cacheable(pmap_load(pte))) cpu_dcache_wb_range(va, L3_SIZE); pmap_load_clear(pte); PTE_SYNC(pte); } va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ static __inline void pmap_free_zero_pages(struct spglist *free) { vm_page_t m; while ((m = SLIST_FIRST(free)) != NULL) { SLIST_REMOVE_HEAD(free, plinks.s.ss); /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } } /* * Schedule the specified unused page table page to be freed. 
Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } - + /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_l3(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= (NUL2E + NUL1E)) { /* l1 page */ pd_entry_t *l0; l0 = pmap_l0(pmap, va); pmap_load_clear(l0); PTE_SYNC(l0); } else if (m->pindex >= NUL2E) { /* l2 page */ pd_entry_t *l1; l1 = pmap_l1(pmap, va); pmap_load_clear(l1); PTE_SYNC(l1); } else { /* l3 page */ pd_entry_t *l2; l2 = pmap_l2(pmap, va); pmap_load_clear(l2); PTE_SYNC(l2); } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUL2E) { /* We just released an l3, unhold the matching l2 */ pd_entry_t *l1, tl1; vm_page_t l2pg; l1 = pmap_l1(pmap, va); tl1 = pmap_load(l1); l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l2pg, free); } else if (m->pindex < (NUL2E + NUL1E)) { /* We just released an l2, unhold the matching l1 */ pd_entry_t *l0, tl0; vm_page_t l1pg; l0 = pmap_l0(pmap, va); tl0 = pmap_load(l0); l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l1pg, free); } pmap_invalidate_page(pmap, va); /* * This is a release store so that the ordinary store unmapping * the page table page is globally performed before TLB shoot- * down is begun. */ atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); - /* + /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing an l3 entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_l3(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK); return (pmap_unwire_l3(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_l0 = kernel_pmap->pm_l0; } int pmap_pinit(pmap_t pmap) { vm_paddr_t l0phys; vm_page_t l0pt; /* * allocate the l0 page */ while ((l0pt = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) VM_WAIT; l0phys = VM_PAGE_TO_PHYS(l0pt); pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(l0phys); if ((l0pt->flags & PG_ZERO) == 0) pagezero(pmap->pm_l0); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); return (1); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. 
* * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, l1pg, l2pg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); rw_runlock(&pvh_global_lock); VM_WAIT; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= (NUL2E + NUL1E)) { pd_entry_t *l0; vm_pindex_t l0index; l0index = ptepindex - (NUL2E + NUL1E); l0 = &pmap->pm_l0[l0index]; pmap_load_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE); PTE_SYNC(l0); } else if (ptepindex >= NUL2E) { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1; pd_entry_t tl0; l1index = ptepindex - NUL2E; l0index = l1index >> L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, lockp) == NULL) { --m->wire_count; /* XXX: release mem barrier? */ atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); return (NULL); } } else { l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); l1pg->wire_count++; } l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); l1 = &l1[ptepindex & Ln_ADDR_MASK]; pmap_load_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE); PTE_SYNC(l1); } else { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1, *l2; pd_entry_t tl0, tl1; l1index = ptepindex >> Ln_ENTRIES_SHIFT; l0index = l1index >> L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { --m->wire_count; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); return (NULL); } tl0 = pmap_load(l0); l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; } else { l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; tl1 = pmap_load(l1); if (tl1 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { --m->wire_count; /* XXX: release mem barrier? */ atomic_subtract_int( &vm_cnt.v_wire_count, 1); vm_page_free_zero(m); return (NULL); } } else { l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); l2pg->wire_count++; } } l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); l2 = &l2[ptepindex & Ln_ADDR_MASK]; pmap_load_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE); PTE_SYNC(l2); } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pde, tpde; vm_page_t m; int lvl; /* * Calculate pagetable page index */ ptepindex = pmap_l2_pindex(va); retry: /* * Get the page directory entry */ pde = pmap_pde(pmap, va, &lvl); /* * If the page table page is mapped, we just increment the hold count, * and activate it. If we get a level 2 pde it will point to a level 3 * table. 
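 *
 * A level below 2 means an intermediate table is missing;
 * _pmap_alloc_l3() allocates it (recursing for its parents) and, when a
 * lock pointer was given, may sleep, after which we retry the walk.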
*/ if (lvl == 2) { tpde = pmap_load(pde); if (tpde != 0) { m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK); m->wire_count++; return (m); } } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ m = _pmap_alloc_l3(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0)); m->wire_count--; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); } #if 0 static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } -SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, +SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } -SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, +SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); #endif /* 0 */ /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *l0, *l1, *l2; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, L2_SIZE); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { l0 = pmap_l0(kernel_pmap, kernel_vm_end); KASSERT(pmap_load(l0) != 0, ("pmap_growkernel: No level 0 kernel entry")); l1 = pmap_l0_to_l1(l0, kernel_vm_end); if (pmap_load(l1) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); pmap_load_store(l1, paddr | L1_TABLE); PTE_SYNC(l1); continue; /* try again */ } l2 = pmap_l1_to_l2(l1, kernel_vm_end); if ((pmap_load(l2) & ATTR_AF) != 0) { kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; - break; + break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); pmap_load_store(l2, paddr | L2_TABLE); PTE_SYNC(l2); pmap_invalidate_page(kernel_pmap, kernel_vm_end); kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; - break; + break; } } } /*************************************************** * page management routines. 
***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #if 0 #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif #endif /* 0 */ /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { panic("ARM64TODO: reclaim_pv_chunk"); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. 
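 *
 * get_pv_entry() always scans from the head of pm_pvchunk, so keeping
 * chunks that still have free slots there makes the common case cheap.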
*/ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire(m, PQ_NONE); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_LOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. 
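 */

/*
 * Illustrative sketch: the chunk bitmap bookkeeping used by
 * free_pv_entry() and get_pv_entry() above.  A chunk tracks its _NPCPV
 * (168) entries in the three 64-bit words of pc_map, a set bit marking
 * a free slot.  Hypothetical helper computing the word and bit for one
 * entry:
 */
static __inline void
pv_chunk_slot_sketch(pv_entry_t pv, int *field, int *bit)
{
	struct pv_chunk *pc;
	int idx;

	pc = pv_to_chunk(pv);
	idx = pv - &pc->pc_pventry[0];
	*field = idx / 64;
	*bit = idx % 64;
}

/*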
*/ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * pmap_remove_l3: do the things to unmap a page in a process */ static int -pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, +pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) { pt_entry_t old_l3; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap_is_current(pmap) && pmap_l3_valid_cacheable(pmap_load(l3))) cpu_dcache_wb_range(va, L3_SIZE); old_l3 = pmap_load_clear(l3); PTE_SYNC(l3); pmap_invalidate_page(pmap, va); if (old_l3 & ATTR_SW_WIRED) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (old_l3 & ATTR_SW_MANAGED) { m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); if (pmap_page_dirty(old_l3)) vm_page_dirty(m); if (old_l3 & ATTR_AF) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); } return (pmap_unuse_l3(pmap, va, l2e, free)); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va, va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t l3_paddr, *l3; struct spglist free; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL) continue; l3_paddr = pmap_load(l2); /* * Weed out invalid mappings. */ if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. 
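 *
 * va_next was already advanced to the next L2 boundary (or clamped to
 * eva on wrap), so this test only trims the final, partial page table.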
*/ if (va_next > eva) va_next = eva; va = va_next; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if (l3 == NULL) panic("l3 == NULL"); if (pmap_load(l3) == 0) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } if (va == va_next) va = sva; if (pmap_remove_l3(pmap, l3, sva, l3_paddr, &free, &lock)) { sva += L3_SIZE; break; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } if (lock != NULL) rw_wunlock(lock); if (anyvalid) pmap_invalidate_all(pmap); - rw_runlock(&pvh_global_lock); + rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; struct spglist free; int lvl; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); rw_wlock(&pvh_global_lock); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap_resident_count_dec(pmap, 1); pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_remove_all: no page directory entry found")); KASSERT(lvl == 2, ("pmap_remove_all: invalid pde level %d", lvl)); tpde = pmap_load(pde); pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); if (pmap_is_current(pmap) && pmap_l3_valid_cacheable(tpte)) cpu_dcache_wb_range(pv->pv_va, L3_SIZE); pmap_load_clear(pte); PTE_SYNC(pte); pmap_invalidate_page(pmap, pv->pv_va); if (tpte & ATTR_SW_WIRED) pmap->pm_stats.wired_count--; if ((tpte & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (pmap_page_dirty(tpte)) vm_page_dirty(m); pmap_unuse_l3(pmap, pv->pv_va, tpde, &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); pmap_free_zero_pages(&free); } /* * Set the physical protection on the * specified range of this map as requested. 
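 */

/*
 * Illustrative sketch: the overflow-safe boundary advance used by
 * pmap_remove() above and pmap_protect() below.  Rounding sva up to the
 * next L2 boundary can wrap to zero at the top of the address space, so
 * a wrapped result is clamped to the end of the range.  Hypothetical
 * helper:
 */
static __inline vm_offset_t
pmap_next_l2_boundary_sketch(vm_offset_t sva, vm_offset_t eva)
{
	vm_offset_t va_next;

	va_next = (sva + L2_SIZE) & ~L2_OFFSET;
	if (va_next < sva)		/* wrapped past the top of the AS */
		va_next = eva;
	return (va_next);
}

/*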
*/ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va, va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3p, l3; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & VM_PROT_WRITE) == VM_PROT_WRITE) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL || (pmap_load(l2) & ATTR_DESCR_MASK) != L2_TABLE) continue; if (va_next > eva) va_next = eva; va = va_next; for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, sva += L3_SIZE) { l3 = pmap_load(l3p); if (pmap_l3_valid(l3)) { pmap_set(l3p, ATTR_AP(ATTR_AP_RO)); PTE_SYNC(l3p); /* XXX: Use pmap_invalidate_range */ pmap_invalidate_page(pmap, va); } } } PMAP_UNLOCK(pmap); /* TODO: Only invalidate entries we are touching */ pmap_invalidate_all(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind __unused) { struct rwlock *lock; pd_entry_t *pde; pt_entry_t new_l3, orig_l3; pt_entry_t *l3; pv_entry_t pv; vm_paddr_t opa, pa, l1_pa, l2_pa, l3_pa; vm_page_t mpte, om, l1_m, l2_m, l3_m; boolean_t nosleep; int lvl; va = trunc_page(va); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); pa = VM_PAGE_TO_PHYS(m); new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | L3_PAGE); if ((prot & VM_PROT_WRITE) == 0) new_l3 |= ATTR_AP(ATTR_AP_RO); if ((flags & PMAP_ENTER_WIRED) != 0) new_l3 |= ATTR_SW_WIRED; if ((va >> 63) == 0) new_l3 |= ATTR_AP(ATTR_AP_USER); CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); mpte = NULL; lock = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); if (va < VM_MAXUSER_ADDRESS) { nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = pmap_alloc_l3(pmap, va, nosleep ? 
NULL : &lock); if (mpte == NULL && nosleep) { CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } pde = pmap_pde(pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_enter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_enter: Invalid level %d", lvl)); l3 = pmap_l2_to_l3(pde, va); } else { pde = pmap_pde(pmap, va, &lvl); /* * If we get a level 2 pde it must point to a level 3 entry * otherwise we will need to create the intermediate tables */ if (lvl < 2) { switch(lvl) { default: case -1: /* Get the l0 pde to update */ pde = pmap_l0(pmap, va); KASSERT(pde != NULL, ("...")); l1_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (l1_m == NULL) panic("pmap_enter: l1 pte_m == NULL"); if ((l1_m->flags & PG_ZERO) == 0) pmap_zero_page(l1_m); l1_pa = VM_PAGE_TO_PHYS(l1_m); pmap_load_store(pde, l1_pa | L0_TABLE); PTE_SYNC(pde); /* FALLTHROUGH */ case 0: /* Get the l1 pde to update */ pde = pmap_l0_to_l1(pde, va); KASSERT(pde != NULL, ("...")); l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (l2_m == NULL) panic("pmap_enter: l2 pte_m == NULL"); if ((l2_m->flags & PG_ZERO) == 0) pmap_zero_page(l2_m); l2_pa = VM_PAGE_TO_PHYS(l2_m); pmap_load_store(pde, l2_pa | L1_TABLE); PTE_SYNC(pde); /* FALLTHROUGH */ case 1: /* Get the l2 pde to update */ pde = pmap_l1_to_l2(pde, va); l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (l3_m == NULL) panic("pmap_enter: l3 pte_m == NULL"); if ((l3_m->flags & PG_ZERO) == 0) pmap_zero_page(l3_m); l3_pa = VM_PAGE_TO_PHYS(l3_m); pmap_load_store(pde, l3_pa | L2_TABLE); PTE_SYNC(pde); break; } } l3 = pmap_l2_to_l3(pde, va); pmap_invalidate_page(pmap, va); } om = NULL; orig_l3 = pmap_load(l3); opa = orig_l3 & ~ATTR_MASK; /* * Is the specified virtual address already mapped? */ if (pmap_l3_valid(orig_l3)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((flags & PMAP_ENTER_WIRED) != 0 && (orig_l3 & ATTR_SW_WIRED) == 0) pmap->pm_stats.wired_count++; else if ((flags & PMAP_ENTER_WIRED) == 0 && (orig_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((orig_l3 & ATTR_SW_MANAGED) != 0) { new_l3 |= ATTR_SW_MANAGED; if ((new_l3 & ATTR_AP(ATTR_AP_RW)) == ATTR_AP(ATTR_AP_RW)) { vm_page_aflag_set(m, PGA_WRITEABLE); } } goto validate; } /* Flush the cache, there might be uncommitted data in it */ if (pmap_is_current(pmap) && pmap_l3_valid_cacheable(orig_l3)) cpu_dcache_wb_range(va, L3_SIZE); } else { /* * Increment the counters. */ if ((new_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory.
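 *
 * (Descriptive note: a PV entry records a single (pmap, va) mapping of
 * the page; the per-page list built here is what lets routines such as
 * pmap_remove_all() and pmap_ts_referenced() find every mapping of a
 * page given only its vm_page.)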
*/ if ((m->oflags & VPO_UNMANAGED) == 0) { new_l3 |= ATTR_SW_MANAGED; pv = get_pv_entry(pmap, &lock); pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((new_l3 & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Update the L3 entry. */ if (orig_l3 != 0) { validate: orig_l3 = pmap_load_store(l3, new_l3); PTE_SYNC(l3); opa = orig_l3 & ~ATTR_MASK; if (opa != pa) { if ((orig_l3 & ATTR_SW_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); if (pmap_page_dirty(orig_l3)) vm_page_dirty(om); if ((orig_l3 & ATTR_AF) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pmap_pvh_free(&om->md, pmap, va); } } else if (pmap_page_dirty(orig_l3)) { if ((orig_l3 & ATTR_SW_MANAGED) != 0) vm_page_dirty(m); } } else { pmap_load_store(l3, new_l3); PTE_SYNC(l3); } pmap_invalidate_page(pmap, va); if ((pmap != pmap_kernel()) && (pmap == &curproc->p_vmspace->vm_pmap)) cpu_icache_sync_range(va, PAGE_SIZE); if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; pd_entry_t *pde; pt_entry_t *l3; vm_paddr_t pa; int lvl; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); /* * In the case that a page table page is not * resident, we are creating it here. 
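 *
 * (An illustrative note, not from the original file: pmap_l2_pindex()
 * is effectively va >> L2_SHIFT, so with 4K granules every address in
 * the same 2MB region yields the same index.  A repeat call within
 * that region therefore finds mpte->pindex == l2pindex below and only
 * bumps the wire count instead of walking the page table again.)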
*/ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t l2pindex; /* * Calculate pagetable page index */ l2pindex = pmap_l2_pindex(va); if (mpte && (mpte->pindex == l2pindex)) { mpte->wire_count++; } else { /* * Get the l2 entry */ pde = pmap_pde(pmap, va, &lvl); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (lvl == 2 && pmap_load(pde) != 0) { mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); mpte->wire_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); if (mpte == NULL) return (mpte); } } l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); l3 = &l3[pmap_l3_index(va)]; } else { mpte = NULL; pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_enter_quick_locked: Invalid level %d", lvl)); l3 = pmap_l2_to_l3(pde, va); } if (pmap_load(l3) != 0) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_l3(pmap, va, mpte, &free)) { pmap_invalidate_page(pmap, va); pmap_free_zero_pages(&free); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | ATTR_AP(ATTR_AP_RO) | L3_PAGE; /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) == 0) pa |= ATTR_SW_MANAGED; pmap_load_store(l3, pa); PTE_SYNC(l3); pmap_invalidate_page(pmap, va); return (mpte); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries.
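 *
 * (Descriptive note: ATTR_SW_WIRED is one of the software-use PTE bits
 * that the hardware table walker ignores, which is why the loop below
 * can clear it with a plain atomic_clear_long() and skip TLB
 * maintenance entirely.)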
*/ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3; boolean_t pv_lists_locked; pv_lists_locked = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if (pmap_load(l3) == 0) continue; if ((pmap_load(l3) & ATTR_SW_WIRED) == 0) panic("pmap_unwire: l3 %#jx is missing " "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3)); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. */ atomic_clear_long(l3, ATTR_SW_WIRED); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* - * pmap_zero_page_area zeros the specified hardware page by mapping + * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* - * pmap_zero_page_idle zeros the specified hardware page by mapping + * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. 
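 *
 * (Descriptive note: on this pmap the "mapping" step costs nothing in
 * practice, since PHYS_TO_DMAP() simply offsets the physical address
 * into the always-resident direct map; no temporary KVA has to be
 * allocated or invalidated.)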
*/ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t m_a, m_b; vm_paddr_t p_a, p_b; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; m_a = ma[a_offset >> PAGE_SHIFT]; p_a = m_a->phys_addr; b_pg_offset = b_offset & PAGE_MASK; m_b = mb[b_offset >> PAGE_SHIFT]; p_b = m_b->phys_addr; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); if (__predict_false(!PHYS_IN_DMAP(p_a))) { panic("!DMAP a %lx", p_a); } else { a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; } if (__predict_false(!PHYS_IN_DMAP(p_b))) { panic("!DMAP b %lx", p_b); } else { b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; } bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); } void pmap_quick_remove_page(vm_offset_t addr) { } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } rw_runlock(lock); rw_runlock(&pvh_global_lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, lvl, md_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } rw_runlock(lock); rw_runlock(&pvh_global_lock); return (count); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. - * + * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. 
In * particular, a page table entry's dirty bit won't change state once * this function starts. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t *pde; pt_entry_t *pte, tpte; struct spglist free; vm_page_t m; pv_entry_t pv; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx, lvl; vm_paddr_t pa; lock = NULL; SLIST_INIT(&free); rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("Attempting to remove an unmapped page")); KASSERT(lvl == 2, ("Invalid page directory level: %d", lvl)); pte = pmap_l2_to_l3(pde, pv->pv_va); KASSERT(pte != NULL, ("Attempting to remove an unmapped page")); tpte = pmap_load(pte); /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & ATTR_SW_WIRED) { allfree = 0; continue; } pa = tpte & ~ATTR_MASK; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad pte %#jx", (uintmax_t)tpte)); /* XXX: assumes tpte is level 3 */ if (pmap_is_current(pmap) && pmap_l3_valid_cacheable(tpte)) cpu_dcache_wb_range(pv->pv_va, L3_SIZE); pmap_load_clear(pte); PTE_SYNC(pte); pmap_invalidate_page(pmap, pv->pv_va); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) vm_page_dirty(m); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; pmap_unuse_l3(pmap, pv->pv_va, pmap_load(pde), &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } pmap_invalidate_all(pmap); if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * This is used to check if a page has been accessed or modified. As we * don't have a bit to see if it has been modified we have to assume it * has been if the page is read/write. 
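 *
 * (A concrete illustration, not from the original file: to test
 * "modified" the code below builds mask = ATTR_AP_RW_BIT and
 * value = ATTR_AP(ATTR_AP_RW), so a mapping counts as modified while
 * its AP bits still permit writes; to test "accessed" it builds
 * mask = ATTR_AF | ATTR_DESCR_MASK and value = ATTR_AF | L3_PAGE,
 * i.e. a valid 4KB page whose access flag is set.)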
*/ static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; pt_entry_t *pte, mask, value; pmap_t pmap; int lvl, md_gen; boolean_t rv; rv = FALSE; rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 3, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_AP_RW_BIT; value |= ATTR_AP(ATTR_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L3_PAGE; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } out: rw_runlock(lock); rw_runlock(&pvh_global_lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *pte; boolean_t rv; int lvl; rv = FALSE; PMAP_LOCK(pmap); pte = pmap_pte(pmap, addr, &lvl); if (pte != NULL && pmap_load(pte) != 0) { rv = TRUE; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { pmap_t pmap; struct rwlock *lock; pv_entry_t pv; pt_entry_t oldpte, *pte; int lvl, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. 
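 *
 * (Descriptive note: in the loop below the read-only downgrade is
 * installed with atomic_cmpset_long() and retried on failure, since
 * another CPU may change the PTE between the load and the store; a
 * mapping that was still writable and has the access flag set is
 * reported dirty before its TLB entry is invalidated.)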
*/ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); retry: oldpte = pmap_load(pte); if ((oldpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) { if (!atomic_cmpset_long(pte, oldpte, oldpte | ATTR_AP(ATTR_AP_RO))) goto retry; if ((oldpte & ATTR_AF) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); rw_runlock(&pvh_global_lock); } static __inline boolean_t safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) { return (FALSE); } #define PMAP_TS_REFERENCED_MAX 5 /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; vm_paddr_t pa; int cleared, md_gen, not_cleared, lvl; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); rw_rlock(&pvh_global_lock); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found")); KASSERT(lvl == 2, ("pmap_ts_referenced: invalid pde level %d", lvl)); tpde = pmap_load(pde); KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_ts_referenced: found an invalid l2 table")); pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); if ((tpte & ATTR_AF) != 0) { if (safe_to_clear_referenced(pmap, tpte)) { /* * TODO: We don't handle the access flag * at all. We need to be able to set it in * the exception handler. */ panic("ARM64TODO: safe_to_clear_referenced\n"); } else if ((tpte & ATTR_SW_WIRED) == 0) { /* * Wired pages cannot be paged out so * doing accessed bit emulation for * them is wasted effort. We do the * hard work for unwired pages only. */ pmap_remove_l3(pmap, pte, pv->pv_va, tpde, &free, &lock); pmap_invalidate_page(pmap, pv->pv_va); cleared++; if (pvf == pv) pvf = NULL; pv = NULL; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. 
*/ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); rw_runlock(&pvh_global_lock); pmap_free_zero_pages(&free); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; /* ARM64TODO: We lack support for tracking if a page is modified */ } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return ((void *)PHYS_TO_DMAP(pa)); } void pmap_unmapbios(vm_paddr_t pa, vm_size_t size) { } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pv_memattr = ma; /* * ARM64TODO: Implement the below (from the amd64 pmap) * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. 
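 *
 * (Until that is implemented, the check below panics for a normal page
 * covered by the DMAP rather than silently leave a direct-map alias
 * with a stale memory attribute.)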
*/ if ((m->flags & PG_FICTITIOUS) == 0 && PHYS_IN_DMAP(VM_PAGE_TO_PHYS(m))) panic("ARM64TODO: pmap_page_set_memattr"); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *l1p, l1; pd_entry_t *l2p, l2; pt_entry_t *l3p, l3; vm_paddr_t pa; bool managed; int val; PMAP_LOCK(pmap); retry: pa = 0; val = 0; managed = false; l1p = pmap_l1(pmap, addr); if (l1p == NULL) /* No l1 */ goto done; l1 = pmap_load(l1p); if ((l1 & ATTR_DESCR_MASK) == L1_INVAL) goto done; if ((l1 & ATTR_DESCR_MASK) == L1_BLOCK) { pa = (l1 & ~ATTR_MASK) | (addr & L1_OFFSET); managed = (l1 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED; val = MINCORE_SUPER | MINCORE_INCORE; if (pmap_page_dirty(l1)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((l1 & ATTR_AF) == ATTR_AF) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; goto done; } l2p = pmap_l1_to_l2(l1p, addr); if (l2p == NULL) /* No l2 */ goto done; l2 = pmap_load(l2p); if ((l2 & ATTR_DESCR_MASK) == L2_INVAL) goto done; if ((l2 & ATTR_DESCR_MASK) == L2_BLOCK) { pa = (l2 & ~ATTR_MASK) | (addr & L2_OFFSET); managed = (l2 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED; val = MINCORE_SUPER | MINCORE_INCORE; if (pmap_page_dirty(l2)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((l2 & ATTR_AF) == ATTR_AF) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; goto done; } l3p = pmap_l2_to_l3(l2p, addr); if (l3p == NULL) /* No l3 */ goto done; l3 = pmap_load(l3p); if ((l3 & ATTR_DESCR_MASK) == L3_INVAL) goto done; if ((l3 & ATTR_DESCR_MASK) == L3_PAGE) { pa = (l3 & ~ATTR_MASK) | (addr & L3_OFFSET); managed = (l3 & ATTR_SW_MANAGED) == ATTR_SW_MANAGED; val = MINCORE_INCORE; if (pmap_page_dirty(l3)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((l3 & ATTR_AF) == ATTR_AF) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } done: if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); td->td_pcb->pcb_l0addr = vtophys(pmap->pm_l0); __asm __volatile("msr ttbr0_el1, %0" : : "r"(td->td_pcb->pcb_l0addr)); pmap_invalidate_all(pmap); critical_exit(); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) { if (va >= VM_MIN_KERNEL_ADDRESS) { cpu_icache_sync_range(va, sz); } else { u_int len, offset; vm_paddr_t pa; /* Find the length of data in this page to flush */ offset = va & PAGE_MASK; len = imin(PAGE_SIZE - offset, sz); while (sz != 0) { /* Extract the physical address & find it in the DMAP */ pa = pmap_extract(pmap, va); if (pa != 0) cpu_icache_sync_range(PHYS_TO_DMAP(pa), len); /* Move to the next page */ sz -= len; va += len; /* Set the length for the next iteration */ len = imin(PAGE_SIZE, sz); } } } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient.
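 *
 * A typical caller pairs the two routines (an illustrative sketch, not
 * code from this file; "m" and "va" are hypothetical locals):
 *
 *	vm_offset_t va;
 *	boolean_t mapped;
 *
 *	mapped = pmap_map_io_transient(&m, &va, 1, FALSE);
 *	... access the page through (void *)va ...
 *	if (mapped)
 *		pmap_unmap_io_transient(&m, &va, 1, FALSE);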
* * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; int error, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(!PHYS_IN_DMAP(paddr))) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic( "pmap_map_io_transient: TODO: Map out of DMAP data"); } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); } } } Index: user/alc/PQ_LAUNDRY/sys/boot/fdt/dts/riscv/spike.dts =================================================================== --- user/alc/PQ_LAUNDRY/sys/boot/fdt/dts/riscv/spike.dts (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/boot/fdt/dts/riscv/spike.dts (revision 303667) @@ -1,109 +1,110 @@ /*- - * Copyright (c) 2015 Ruslan Bukin + * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /dts-v1/; / { - model = "UC Berkeley Spike Simulator RV64I"; - compatible = "riscv,rv64i"; + model = "UC Berkeley Spike Simulator RV64"; + compatible = "riscv,rv64"; #address-cells = <1>; #size-cells = <1>; #interrupt-cells = <1>; cpus { #address-cells = <1>; #size-cells = <0>; cpu@0 { device_type = "cpu"; - compatible = "riscv,rv64i"; - reg = <0x40002000>; + compatible = "riscv,rv64"; + reg = <0x40001000>; }; cpu@1 { device_type = "cpu"; - compatible = "riscv,rv64i"; - reg = <0x4000a000>; + compatible = "riscv,rv64"; + reg = <0x40002000>; }; }; aliases { console0 = &console0; }; memory { device_type = "memory"; - reg = <0x0 0x40000000>; /* 1GB at 0x0 */ + reg = <0x80000000 0x40000000>; /* 1GB at 0x80000000 */ }; soc { - #address-cells = <2>; - #size-cells = <2>; + #address-cells = <1>; + #size-cells = <1>; #interrupt-cells = <1>; compatible = "simple-bus"; ranges; pic0: pic@0 { compatible = "riscv,pic"; interrupt-controller; }; timer0: timer@0 { compatible = "riscv,timer"; - interrupts = < 1 >; + reg = < 0x40000000 0x100 >; + interrupts = < 5 >; interrupt-parent = < &pic0 >; clock-frequency = < 1000000 >; }; htif0: htif@0 { compatible = "riscv,htif"; - interrupts = < 0 >; + interrupts = < 1 >; interrupt-parent = < &pic0 >; console0: console@0 { compatible = "htif,console"; status = "okay"; }; }; }; chosen { bootargs = "-v"; stdin = "console0"; stdout = "console0"; }; }; Index: user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h (revision 303667) @@ -1,2514 +1,2514 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. 
*/ #ifndef _SYS_DTRACE_H #define _SYS_DTRACE_H #pragma ident "%Z%%M% %I% %E% SMI" #ifdef __cplusplus extern "C" { #endif /* * DTrace Dynamic Tracing Software: Kernel Interfaces * * Note: The contents of this file are private to the implementation of the * Solaris system and DTrace subsystem and are subject to change at any time * without notice. Applications and drivers using these interfaces will fail * to run on future releases. These interfaces should not be used for any * purpose except those expressly outlined in dtrace(7D) and libdtrace(3LIB). * Please refer to the "Solaris Dynamic Tracing Guide" for more information. */ #ifndef _ASM #include #include #include #ifdef illumos #include #else #include #include #include #include #include typedef int model_t; #endif #include #ifdef illumos #include #include #else #include #endif /* * DTrace Universal Constants and Typedefs */ #define DTRACE_CPUALL -1 /* all CPUs */ #define DTRACE_IDNONE 0 /* invalid probe identifier */ #define DTRACE_EPIDNONE 0 /* invalid enabled probe identifier */ #define DTRACE_AGGIDNONE 0 /* invalid aggregation identifier */ #define DTRACE_AGGVARIDNONE 0 /* invalid aggregation variable ID */ #define DTRACE_CACHEIDNONE 0 /* invalid predicate cache */ #define DTRACE_PROVNONE 0 /* invalid provider identifier */ #define DTRACE_METAPROVNONE 0 /* invalid meta-provider identifier */ #define DTRACE_ARGNONE -1 /* invalid argument index */ #define DTRACE_PROVNAMELEN 64 #define DTRACE_MODNAMELEN 64 #define DTRACE_FUNCNAMELEN 192 #define DTRACE_NAMELEN 64 #define DTRACE_FULLNAMELEN (DTRACE_PROVNAMELEN + DTRACE_MODNAMELEN + \ DTRACE_FUNCNAMELEN + DTRACE_NAMELEN + 4) #define DTRACE_ARGTYPELEN 128 typedef uint32_t dtrace_id_t; /* probe identifier */ typedef uint32_t dtrace_epid_t; /* enabled probe identifier */ typedef uint32_t dtrace_aggid_t; /* aggregation identifier */ typedef int64_t dtrace_aggvarid_t; /* aggregation variable identifier */ typedef uint16_t dtrace_actkind_t; /* action kind */ typedef int64_t dtrace_optval_t; /* option value */ typedef uint32_t dtrace_cacheid_t; /* predicate cache identifier */ typedef enum dtrace_probespec { DTRACE_PROBESPEC_NONE = -1, DTRACE_PROBESPEC_PROVIDER = 0, DTRACE_PROBESPEC_MOD, DTRACE_PROBESPEC_FUNC, DTRACE_PROBESPEC_NAME } dtrace_probespec_t; /* * DTrace Intermediate Format (DIF) * * The following definitions describe the DTrace Intermediate Format (DIF), a * RISC-like instruction set and program encoding used to represent * predicates and actions that can be bound to DTrace probes. The constants * below defining the number of available registers are suggested minimums; the * compiler should use DTRACEIOC_CONF to dynamically obtain the number of * registers provided by the current DTrace implementation.
*/ #define DIF_VERSION_1 1 /* DIF version 1: Solaris 10 Beta */ #define DIF_VERSION_2 2 /* DIF version 2: Solaris 10 FCS */ #define DIF_VERSION DIF_VERSION_2 /* latest DIF instruction set version */ #define DIF_DIR_NREGS 8 /* number of DIF integer registers */ #define DIF_DTR_NREGS 8 /* number of DIF tuple registers */ #define DIF_OP_OR 1 /* or r1, r2, rd */ #define DIF_OP_XOR 2 /* xor r1, r2, rd */ #define DIF_OP_AND 3 /* and r1, r2, rd */ #define DIF_OP_SLL 4 /* sll r1, r2, rd */ #define DIF_OP_SRL 5 /* srl r1, r2, rd */ #define DIF_OP_SUB 6 /* sub r1, r2, rd */ #define DIF_OP_ADD 7 /* add r1, r2, rd */ #define DIF_OP_MUL 8 /* mul r1, r2, rd */ #define DIF_OP_SDIV 9 /* sdiv r1, r2, rd */ #define DIF_OP_UDIV 10 /* udiv r1, r2, rd */ #define DIF_OP_SREM 11 /* srem r1, r2, rd */ #define DIF_OP_UREM 12 /* urem r1, r2, rd */ #define DIF_OP_NOT 13 /* not r1, rd */ #define DIF_OP_MOV 14 /* mov r1, rd */ #define DIF_OP_CMP 15 /* cmp r1, r2 */ #define DIF_OP_TST 16 /* tst r1 */ #define DIF_OP_BA 17 /* ba label */ #define DIF_OP_BE 18 /* be label */ #define DIF_OP_BNE 19 /* bne label */ #define DIF_OP_BG 20 /* bg label */ #define DIF_OP_BGU 21 /* bgu label */ #define DIF_OP_BGE 22 /* bge label */ #define DIF_OP_BGEU 23 /* bgeu label */ #define DIF_OP_BL 24 /* bl label */ #define DIF_OP_BLU 25 /* blu label */ #define DIF_OP_BLE 26 /* ble label */ #define DIF_OP_BLEU 27 /* bleu label */ #define DIF_OP_LDSB 28 /* ldsb [r1], rd */ #define DIF_OP_LDSH 29 /* ldsh [r1], rd */ #define DIF_OP_LDSW 30 /* ldsw [r1], rd */ #define DIF_OP_LDUB 31 /* ldub [r1], rd */ #define DIF_OP_LDUH 32 /* lduh [r1], rd */ #define DIF_OP_LDUW 33 /* lduw [r1], rd */ #define DIF_OP_LDX 34 /* ldx [r1], rd */ #define DIF_OP_RET 35 /* ret rd */ #define DIF_OP_NOP 36 /* nop */ #define DIF_OP_SETX 37 /* setx intindex, rd */ #define DIF_OP_SETS 38 /* sets strindex, rd */ #define DIF_OP_SCMP 39 /* scmp r1, r2 */ #define DIF_OP_LDGA 40 /* ldga var, ri, rd */ #define DIF_OP_LDGS 41 /* ldgs var, rd */ #define DIF_OP_STGS 42 /* stgs var, rs */ #define DIF_OP_LDTA 43 /* ldta var, ri, rd */ #define DIF_OP_LDTS 44 /* ldts var, rd */ #define DIF_OP_STTS 45 /* stts var, rs */ #define DIF_OP_SRA 46 /* sra r1, r2, rd */ #define DIF_OP_CALL 47 /* call subr, rd */ #define DIF_OP_PUSHTR 48 /* pushtr type, rs, rr */ #define DIF_OP_PUSHTV 49 /* pushtv type, rs, rv */ #define DIF_OP_POPTS 50 /* popts */ #define DIF_OP_FLUSHTS 51 /* flushts */ #define DIF_OP_LDGAA 52 /* ldgaa var, rd */ #define DIF_OP_LDTAA 53 /* ldtaa var, rd */ #define DIF_OP_STGAA 54 /* stgaa var, rs */ #define DIF_OP_STTAA 55 /* sttaa var, rs */ #define DIF_OP_LDLS 56 /* ldls var, rd */ #define DIF_OP_STLS 57 /* stls var, rs */ #define DIF_OP_ALLOCS 58 /* allocs r1, rd */ #define DIF_OP_COPYS 59 /* copys r1, r2, rd */ #define DIF_OP_STB 60 /* stb r1, [rd] */ #define DIF_OP_STH 61 /* sth r1, [rd] */ #define DIF_OP_STW 62 /* stw r1, [rd] */ #define DIF_OP_STX 63 /* stx r1, [rd] */ #define DIF_OP_ULDSB 64 /* uldsb [r1], rd */ #define DIF_OP_ULDSH 65 /* uldsh [r1], rd */ #define DIF_OP_ULDSW 66 /* uldsw [r1], rd */ #define DIF_OP_ULDUB 67 /* uldub [r1], rd */ #define DIF_OP_ULDUH 68 /* ulduh [r1], rd */ #define DIF_OP_ULDUW 69 /* ulduw [r1], rd */ #define DIF_OP_ULDX 70 /* uldx [r1], rd */ #define DIF_OP_RLDSB 71 /* rldsb [r1], rd */ #define DIF_OP_RLDSH 72 /* rldsh [r1], rd */ #define DIF_OP_RLDSW 73 /* rldsw [r1], rd */ #define DIF_OP_RLDUB 74 /* rldub [r1], rd */ #define DIF_OP_RLDUH 75 /* rlduh [r1], rd */ #define DIF_OP_RLDUW 76 /* rlduw [r1], rd */ #define DIF_OP_RLDX 77 /* rldx 
[r1], rd */ #define DIF_OP_XLATE 78 /* xlate xlrindex, rd */ #define DIF_OP_XLARG 79 /* xlarg xlrindex, rd */ #define DIF_INTOFF_MAX 0xffff /* highest integer table offset */ #define DIF_STROFF_MAX 0xffff /* highest string table offset */ #define DIF_REGISTER_MAX 0xff /* highest register number */ #define DIF_VARIABLE_MAX 0xffff /* highest variable identifier */ #define DIF_SUBROUTINE_MAX 0xffff /* highest subroutine code */ #define DIF_VAR_ARRAY_MIN 0x0000 /* lowest numbered array variable */ #define DIF_VAR_ARRAY_UBASE 0x0080 /* lowest user-defined array */ #define DIF_VAR_ARRAY_MAX 0x00ff /* highest numbered array variable */ #define DIF_VAR_OTHER_MIN 0x0100 /* lowest numbered scalar or assc */ #define DIF_VAR_OTHER_UBASE 0x0500 /* lowest user-defined scalar or assc */ #define DIF_VAR_OTHER_MAX 0xffff /* highest numbered scalar or assc */ #define DIF_VAR_ARGS 0x0000 /* arguments array */ #define DIF_VAR_REGS 0x0001 /* registers array */ #define DIF_VAR_UREGS 0x0002 /* user registers array */ #define DIF_VAR_CURTHREAD 0x0100 /* thread pointer */ #define DIF_VAR_TIMESTAMP 0x0101 /* timestamp */ #define DIF_VAR_VTIMESTAMP 0x0102 /* virtual timestamp */ #define DIF_VAR_IPL 0x0103 /* interrupt priority level */ #define DIF_VAR_EPID 0x0104 /* enabled probe ID */ #define DIF_VAR_ID 0x0105 /* probe ID */ #define DIF_VAR_ARG0 0x0106 /* first argument */ #define DIF_VAR_ARG1 0x0107 /* second argument */ #define DIF_VAR_ARG2 0x0108 /* third argument */ #define DIF_VAR_ARG3 0x0109 /* fourth argument */ #define DIF_VAR_ARG4 0x010a /* fifth argument */ #define DIF_VAR_ARG5 0x010b /* sixth argument */ #define DIF_VAR_ARG6 0x010c /* seventh argument */ #define DIF_VAR_ARG7 0x010d /* eighth argument */ #define DIF_VAR_ARG8 0x010e /* ninth argument */ #define DIF_VAR_ARG9 0x010f /* tenth argument */ #define DIF_VAR_STACKDEPTH 0x0110 /* stack depth */ #define DIF_VAR_CALLER 0x0111 /* caller */ #define DIF_VAR_PROBEPROV 0x0112 /* probe provider */ #define DIF_VAR_PROBEMOD 0x0113 /* probe module */ #define DIF_VAR_PROBEFUNC 0x0114 /* probe function */ #define DIF_VAR_PROBENAME 0x0115 /* probe name */ #define DIF_VAR_PID 0x0116 /* process ID */ #define DIF_VAR_TID 0x0117 /* (per-process) thread ID */ #define DIF_VAR_EXECNAME 0x0118 /* name of executable */ #define DIF_VAR_ZONENAME 0x0119 /* zone name associated with process */ #define DIF_VAR_WALLTIMESTAMP 0x011a /* wall-clock timestamp */ #define DIF_VAR_USTACKDEPTH 0x011b /* user-land stack depth */ #define DIF_VAR_UCALLER 0x011c /* user-level caller */ #define DIF_VAR_PPID 0x011d /* parent process ID */ #define DIF_VAR_UID 0x011e /* process user ID */ #define DIF_VAR_GID 0x011f /* process group ID */ #define DIF_VAR_ERRNO 0x0120 /* thread errno */ #define DIF_VAR_EXECARGS 0x0121 /* process arguments */ #ifndef illumos #define DIF_VAR_CPU 0x0200 #endif #define DIF_SUBR_RAND 0 #define DIF_SUBR_MUTEX_OWNED 1 #define DIF_SUBR_MUTEX_OWNER 2 #define DIF_SUBR_MUTEX_TYPE_ADAPTIVE 3 #define DIF_SUBR_MUTEX_TYPE_SPIN 4 #define DIF_SUBR_RW_READ_HELD 5 #define DIF_SUBR_RW_WRITE_HELD 6 #define DIF_SUBR_RW_ISWRITER 7 #define DIF_SUBR_COPYIN 8 #define DIF_SUBR_COPYINSTR 9 #define DIF_SUBR_SPECULATION 10 #define DIF_SUBR_PROGENYOF 11 #define DIF_SUBR_STRLEN 12 #define DIF_SUBR_COPYOUT 13 #define DIF_SUBR_COPYOUTSTR 14 #define DIF_SUBR_ALLOCA 15 #define DIF_SUBR_BCOPY 16 #define DIF_SUBR_COPYINTO 17 #define DIF_SUBR_MSGDSIZE 18 #define DIF_SUBR_MSGSIZE 19 #define DIF_SUBR_GETMAJOR 20 #define DIF_SUBR_GETMINOR 21 #define DIF_SUBR_DDI_PATHNAME 22 #define DIF_SUBR_STRJOIN 23 
#define DIF_SUBR_LLTOSTR 24 #define DIF_SUBR_BASENAME 25 #define DIF_SUBR_DIRNAME 26 #define DIF_SUBR_CLEANPATH 27 #define DIF_SUBR_STRCHR 28 #define DIF_SUBR_STRRCHR 29 #define DIF_SUBR_STRSTR 30 #define DIF_SUBR_STRTOK 31 #define DIF_SUBR_SUBSTR 32 #define DIF_SUBR_INDEX 33 #define DIF_SUBR_RINDEX 34 #define DIF_SUBR_HTONS 35 #define DIF_SUBR_HTONL 36 #define DIF_SUBR_HTONLL 37 #define DIF_SUBR_NTOHS 38 #define DIF_SUBR_NTOHL 39 #define DIF_SUBR_NTOHLL 40 #define DIF_SUBR_INET_NTOP 41 #define DIF_SUBR_INET_NTOA 42 #define DIF_SUBR_INET_NTOA6 43 #define DIF_SUBR_TOUPPER 44 #define DIF_SUBR_TOLOWER 45 #define DIF_SUBR_MEMREF 46 #define DIF_SUBR_TYPEREF 47 #define DIF_SUBR_SX_SHARED_HELD 48 #define DIF_SUBR_SX_EXCLUSIVE_HELD 49 #define DIF_SUBR_SX_ISEXCLUSIVE 50 #define DIF_SUBR_MEMSTR 51 #define DIF_SUBR_GETF 52 #define DIF_SUBR_JSON 53 #define DIF_SUBR_STRTOLL 54 #define DIF_SUBR_MAX 54 /* max subroutine value */ typedef uint32_t dif_instr_t; #define DIF_INSTR_OP(i) (((i) >> 24) & 0xff) #define DIF_INSTR_R1(i) (((i) >> 16) & 0xff) #define DIF_INSTR_R2(i) (((i) >> 8) & 0xff) #define DIF_INSTR_RD(i) ((i) & 0xff) #define DIF_INSTR_RS(i) ((i) & 0xff) #define DIF_INSTR_LABEL(i) ((i) & 0xffffff) #define DIF_INSTR_VAR(i) (((i) >> 8) & 0xffff) #define DIF_INSTR_INTEGER(i) (((i) >> 8) & 0xffff) #define DIF_INSTR_STRING(i) (((i) >> 8) & 0xffff) #define DIF_INSTR_SUBR(i) (((i) >> 8) & 0xffff) #define DIF_INSTR_TYPE(i) (((i) >> 16) & 0xff) #define DIF_INSTR_XLREF(i) (((i) >> 8) & 0xffff) #define DIF_INSTR_FMT(op, r1, r2, d) \ (((op) << 24) | ((r1) << 16) | ((r2) << 8) | (d)) #define DIF_INSTR_NOT(r1, d) (DIF_INSTR_FMT(DIF_OP_NOT, r1, 0, d)) #define DIF_INSTR_MOV(r1, d) (DIF_INSTR_FMT(DIF_OP_MOV, r1, 0, d)) #define DIF_INSTR_CMP(op, r1, r2) (DIF_INSTR_FMT(op, r1, r2, 0)) #define DIF_INSTR_TST(r1) (DIF_INSTR_FMT(DIF_OP_TST, r1, 0, 0)) #define DIF_INSTR_BRANCH(op, label) (((op) << 24) | (label)) #define DIF_INSTR_LOAD(op, r1, d) (DIF_INSTR_FMT(op, r1, 0, d)) #define DIF_INSTR_STORE(op, r1, d) (DIF_INSTR_FMT(op, r1, 0, d)) #define DIF_INSTR_SETX(i, d) ((DIF_OP_SETX << 24) | ((i) << 8) | (d)) #define DIF_INSTR_SETS(s, d) ((DIF_OP_SETS << 24) | ((s) << 8) | (d)) #define DIF_INSTR_RET(d) (DIF_INSTR_FMT(DIF_OP_RET, 0, 0, d)) #define DIF_INSTR_NOP (DIF_OP_NOP << 24) #define DIF_INSTR_LDA(op, v, r, d) (DIF_INSTR_FMT(op, v, r, d)) #define DIF_INSTR_LDV(op, v, d) (((op) << 24) | ((v) << 8) | (d)) #define DIF_INSTR_STV(op, v, rs) (((op) << 24) | ((v) << 8) | (rs)) #define DIF_INSTR_CALL(s, d) ((DIF_OP_CALL << 24) | ((s) << 8) | (d)) #define DIF_INSTR_PUSHTS(op, t, r2, rs) (DIF_INSTR_FMT(op, t, r2, rs)) #define DIF_INSTR_POPTS (DIF_OP_POPTS << 24) #define DIF_INSTR_FLUSHTS (DIF_OP_FLUSHTS << 24) #define DIF_INSTR_ALLOCS(r1, d) (DIF_INSTR_FMT(DIF_OP_ALLOCS, r1, 0, d)) #define DIF_INSTR_COPYS(r1, r2, d) (DIF_INSTR_FMT(DIF_OP_COPYS, r1, r2, d)) #define DIF_INSTR_XLATE(op, r, d) (((op) << 24) | ((r) << 8) | (d)) #define DIF_REG_R0 0 /* %r0 is always set to zero */ /* * A DTrace Intermediate Format Type (DIF Type) is used to represent the types * of variables, function and associative array arguments, and the return type * for each DIF object (shown below). It contains a description of the type, * its size in bytes, and a module identifier. 
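 */

/*
 * A worked example of the instruction encoding macros above
 * (illustrative, not part of the original header):
 * DIF_INSTR_FMT(DIF_OP_ADD, 1, 2, 3) packs "add %r1, %r2, %r3" as
 * (7 << 24) | (1 << 16) | (2 << 8) | 3 == 0x07010203; DIF_INSTR_OP()
 * then recovers the opcode by shifting the word right 24 bits, and
 * DIF_INSTR_RD() masks off the low byte.
 */

/*
 * The dtrace_diftype structure below holds one such type description: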
*/ typedef struct dtrace_diftype { uint8_t dtdt_kind; /* type kind (see below) */ uint8_t dtdt_ckind; /* type kind in CTF */ uint8_t dtdt_flags; /* type flags (see below) */ uint8_t dtdt_pad; /* reserved for future use */ uint32_t dtdt_size; /* type size in bytes (unless string) */ } dtrace_diftype_t; #define DIF_TYPE_CTF 0 /* type is a CTF type */ #define DIF_TYPE_STRING 1 /* type is a D string */ #define DIF_TF_BYREF 0x1 /* type is passed by reference */ #define DIF_TF_BYUREF 0x2 /* user type is passed by reference */ /* * A DTrace Intermediate Format variable record is used to describe each of the * variables referenced by a given DIF object. It contains an integer variable * identifier along with variable scope and properties, as shown below. The * size of this structure must be sizeof (int) aligned. */ typedef struct dtrace_difv { uint32_t dtdv_name; /* variable name index in dtdo_strtab */ uint32_t dtdv_id; /* variable reference identifier */ uint8_t dtdv_kind; /* variable kind (see below) */ uint8_t dtdv_scope; /* variable scope (see below) */ uint16_t dtdv_flags; /* variable flags (see below) */ dtrace_diftype_t dtdv_type; /* variable type (see above) */ } dtrace_difv_t; #define DIFV_KIND_ARRAY 0 /* variable is an array of quantities */ #define DIFV_KIND_SCALAR 1 /* variable is a scalar quantity */ #define DIFV_SCOPE_GLOBAL 0 /* variable has global scope */ #define DIFV_SCOPE_THREAD 1 /* variable has thread scope */ #define DIFV_SCOPE_LOCAL 2 /* variable has local scope */ #define DIFV_F_REF 0x1 /* variable is referenced by DIFO */ #define DIFV_F_MOD 0x2 /* variable is written by DIFO */ /* * DTrace Actions * * The upper byte determines the class of the action; the low bytes determine * the specific action within that class. The classes of actions are as * follows: * * [ no class ] <= May record process- or kernel-related data * DTRACEACT_PROC <= Only records process-related data * DTRACEACT_PROC_DESTRUCTIVE <= Potentially destructive to processes * DTRACEACT_KERNEL <= Only records kernel-related data * DTRACEACT_KERNEL_DESTRUCTIVE <= Potentially destructive to the kernel * DTRACEACT_SPECULATIVE <= Speculation-related action * DTRACEACT_AGGREGATION <= Aggregating action */ #define DTRACEACT_NONE 0 /* no action */ #define DTRACEACT_DIFEXPR 1 /* action is DIF expression */ #define DTRACEACT_EXIT 2 /* exit() action */ #define DTRACEACT_PRINTF 3 /* printf() action */ #define DTRACEACT_PRINTA 4 /* printa() action */ #define DTRACEACT_LIBACT 5 /* library-controlled action */ #define DTRACEACT_TRACEMEM 6 /* tracemem() action */ #define DTRACEACT_TRACEMEM_DYNSIZE 7 /* dynamic tracemem() size */ #define DTRACEACT_PRINTM 8 /* printm() action (BSD) */ #define DTRACEACT_PRINTT 9 /* printt() action (BSD) */ #define DTRACEACT_PROC 0x0100 #define DTRACEACT_USTACK (DTRACEACT_PROC + 1) #define DTRACEACT_JSTACK (DTRACEACT_PROC + 2) #define DTRACEACT_USYM (DTRACEACT_PROC + 3) #define DTRACEACT_UMOD (DTRACEACT_PROC + 4) #define DTRACEACT_UADDR (DTRACEACT_PROC + 5) #define DTRACEACT_PROC_DESTRUCTIVE 0x0200 #define DTRACEACT_STOP (DTRACEACT_PROC_DESTRUCTIVE + 1) #define DTRACEACT_RAISE (DTRACEACT_PROC_DESTRUCTIVE + 2) #define DTRACEACT_SYSTEM (DTRACEACT_PROC_DESTRUCTIVE + 3) #define DTRACEACT_FREOPEN (DTRACEACT_PROC_DESTRUCTIVE + 4) #define DTRACEACT_PROC_CONTROL 0x0300 #define DTRACEACT_KERNEL 0x0400 #define DTRACEACT_STACK (DTRACEACT_KERNEL + 1) #define DTRACEACT_SYM (DTRACEACT_KERNEL + 2) #define DTRACEACT_MOD (DTRACEACT_KERNEL + 3) #define DTRACEACT_KERNEL_DESTRUCTIVE 0x0500 #define DTRACEACT_BREAKPOINT (DTRACEACT_KERNEL_DESTRUCTIVE + 1) #define DTRACEACT_PANIC (DTRACEACT_KERNEL_DESTRUCTIVE + 2) #define DTRACEACT_CHILL (DTRACEACT_KERNEL_DESTRUCTIVE + 3) #define DTRACEACT_SPECULATIVE 0x0600 #define DTRACEACT_SPECULATE (DTRACEACT_SPECULATIVE + 1) #define DTRACEACT_COMMIT (DTRACEACT_SPECULATIVE + 2) #define DTRACEACT_DISCARD (DTRACEACT_SPECULATIVE + 3) #define DTRACEACT_CLASS(x) ((x) & 0xff00) #define DTRACEACT_ISDESTRUCTIVE(x) \ (DTRACEACT_CLASS(x) == DTRACEACT_PROC_DESTRUCTIVE || \ DTRACEACT_CLASS(x) == DTRACEACT_KERNEL_DESTRUCTIVE) #define DTRACEACT_ISSPECULATIVE(x) \ (DTRACEACT_CLASS(x) == DTRACEACT_SPECULATIVE) #define DTRACEACT_ISPRINTFLIKE(x) \ ((x) == DTRACEACT_PRINTF || (x) == DTRACEACT_PRINTA || \ (x) == DTRACEACT_SYSTEM || (x) == DTRACEACT_FREOPEN) /* * DTrace Aggregating Actions * * These are functions f(x) for which the following is true: * * f(f(x_0) U f(x_1) U ... U f(x_n)) = f(x_0 U x_1 U ... U x_n) * * where x_n is a set of arbitrary data. Aggregating actions are in their own * DTrace action class, DTRACEACT_AGGREGATION. The macros provided here allow * for easier processing of the aggregation argument and data payload for a few * aggregating actions (notably: quantize(), lquantize(), and ustack()). */ #define DTRACEACT_AGGREGATION 0x0700 #define DTRACEAGG_COUNT (DTRACEACT_AGGREGATION + 1) #define DTRACEAGG_MIN (DTRACEACT_AGGREGATION + 2) #define DTRACEAGG_MAX (DTRACEACT_AGGREGATION + 3) #define DTRACEAGG_AVG (DTRACEACT_AGGREGATION + 4) #define DTRACEAGG_SUM (DTRACEACT_AGGREGATION + 5) #define DTRACEAGG_STDDEV (DTRACEACT_AGGREGATION + 6) #define DTRACEAGG_QUANTIZE (DTRACEACT_AGGREGATION + 7) #define DTRACEAGG_LQUANTIZE (DTRACEACT_AGGREGATION + 8) #define DTRACEAGG_LLQUANTIZE (DTRACEACT_AGGREGATION + 9) #define DTRACEACT_ISAGG(x) \ (DTRACEACT_CLASS(x) == DTRACEACT_AGGREGATION) #define DTRACE_QUANTIZE_NBUCKETS \ (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) #define DTRACE_QUANTIZE_ZEROBUCKET ((sizeof (uint64_t) * NBBY) - 1) #define DTRACE_QUANTIZE_BUCKETVAL(buck) \ (int64_t)((buck) < DTRACE_QUANTIZE_ZEROBUCKET ? \ -(1LL << (DTRACE_QUANTIZE_ZEROBUCKET - 1 - (buck))) : \ (buck) == DTRACE_QUANTIZE_ZEROBUCKET ? 
0 : \ 1LL << ((buck) - DTRACE_QUANTIZE_ZEROBUCKET - 1)) #define DTRACE_LQUANTIZE_STEPSHIFT 48 #define DTRACE_LQUANTIZE_STEPMASK ((uint64_t)UINT16_MAX << 48) #define DTRACE_LQUANTIZE_LEVELSHIFT 32 #define DTRACE_LQUANTIZE_LEVELMASK ((uint64_t)UINT16_MAX << 32) #define DTRACE_LQUANTIZE_BASESHIFT 0 #define DTRACE_LQUANTIZE_BASEMASK UINT32_MAX #define DTRACE_LQUANTIZE_STEP(x) \ (uint16_t)(((x) & DTRACE_LQUANTIZE_STEPMASK) >> \ DTRACE_LQUANTIZE_STEPSHIFT) #define DTRACE_LQUANTIZE_LEVELS(x) \ (uint16_t)(((x) & DTRACE_LQUANTIZE_LEVELMASK) >> \ DTRACE_LQUANTIZE_LEVELSHIFT) #define DTRACE_LQUANTIZE_BASE(x) \ (int32_t)(((x) & DTRACE_LQUANTIZE_BASEMASK) >> \ DTRACE_LQUANTIZE_BASESHIFT) #define DTRACE_LLQUANTIZE_FACTORSHIFT 48 #define DTRACE_LLQUANTIZE_FACTORMASK ((uint64_t)UINT16_MAX << 48) #define DTRACE_LLQUANTIZE_LOWSHIFT 32 #define DTRACE_LLQUANTIZE_LOWMASK ((uint64_t)UINT16_MAX << 32) #define DTRACE_LLQUANTIZE_HIGHSHIFT 16 #define DTRACE_LLQUANTIZE_HIGHMASK ((uint64_t)UINT16_MAX << 16) #define DTRACE_LLQUANTIZE_NSTEPSHIFT 0 #define DTRACE_LLQUANTIZE_NSTEPMASK UINT16_MAX #define DTRACE_LLQUANTIZE_FACTOR(x) \ (uint16_t)(((x) & DTRACE_LLQUANTIZE_FACTORMASK) >> \ DTRACE_LLQUANTIZE_FACTORSHIFT) #define DTRACE_LLQUANTIZE_LOW(x) \ (uint16_t)(((x) & DTRACE_LLQUANTIZE_LOWMASK) >> \ DTRACE_LLQUANTIZE_LOWSHIFT) #define DTRACE_LLQUANTIZE_HIGH(x) \ (uint16_t)(((x) & DTRACE_LLQUANTIZE_HIGHMASK) >> \ DTRACE_LLQUANTIZE_HIGHSHIFT) #define DTRACE_LLQUANTIZE_NSTEP(x) \ (uint16_t)(((x) & DTRACE_LLQUANTIZE_NSTEPMASK) >> \ DTRACE_LLQUANTIZE_NSTEPSHIFT) #define DTRACE_USTACK_NFRAMES(x) (uint32_t)((x) & UINT32_MAX) #define DTRACE_USTACK_STRSIZE(x) (uint32_t)((x) >> 32) #define DTRACE_USTACK_ARG(x, y) \ ((((uint64_t)(y)) << 32) | ((x) & UINT32_MAX)) #ifndef _LP64 #if BYTE_ORDER == _BIG_ENDIAN #define DTRACE_PTR(type, name) uint32_t name##pad; type *name #else #define DTRACE_PTR(type, name) type *name; uint32_t name##pad #endif #else #define DTRACE_PTR(type, name) type *name #endif /* * DTrace Object Format (DOF) * * DTrace programs can be persistently encoded in the DOF format so that they * may be embedded in other programs (for example, in an ELF file) or in the * dtrace driver configuration file for use in anonymous tracing. The DOF * format is versioned and extensible so that it can be revised and so that * internal data structures can be modified or extended compatibly. All DOF * structures use fixed-size types, so the 32-bit and 64-bit representations * are identical and consumers can use either data model transparently. * * The file layout is structured as follows: * * +---------------+-------------------+----- ... ----+---- ... ------+ * | dof_hdr_t | dof_sec_t[ ... ] | loadable | non-loadable | * | (file header) | (section headers) | section data | section data | * +---------------+-------------------+----- ... ----+---- ... ------+ * |<------------ dof_hdr.dofh_loadsz --------------->| | * |<------------ dof_hdr.dofh_filesz ------------------------------->| * * The file header stores meta-data including a magic number, data model for * the instrumentation, data encoding, and properties of the DIF code within. * The header describes its own size and the size of the section headers. By * convention, an array of section headers follows the file header, and then * the data for all loadable sections and unloadable sections. This permits * consumer code to easily download the headers and all loadable data into the * DTrace driver in one contiguous chunk, omitting other extraneous sections. 
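To make this file layout concrete, here is a minimal user-level sketch (assuming <string.h> for memcmp) that validates the magic bytes and walks the section headers using the dof_hdr_t and dof_sec_t structures defined below; a real consumer performs many more validity checks than this.

static int
dof_count_loadable(const void *image)
{
	const dof_hdr_t *hdr = image;
	const char *base = image;
	uint32_t i, nloadable = 0;

	/* Reject anything that doesn't begin with "\177DOF". */
	if (memcmp(hdr->dofh_ident, DOF_MAG_STRING, DOF_MAG_STRLEN) != 0)
		return (-1);

	for (i = 0; i < hdr->dofh_secnum; i++) {
		const dof_sec_t *sec = (const dof_sec_t *)
		    (base + hdr->dofh_secoff + i * hdr->dofh_secsize);

		/* Loadable section data lies within dofh_loadsz. */
		if (sec->dofs_flags & DOF_SECF_LOAD)
			nloadable++;
	}
	return ((int)nloadable);
}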
* * The section headers describe the size, offset, alignment, and section type * for each section. Sections are described using a set of #defines that tell * the consumer what kind of data is expected. Sections can contain links to * other sections by storing a dof_secidx_t, an index into the section header * array, inside of the section data structures. The section header includes * an entry size so that sections with data arrays can grow their structures. * * The DOF data itself can contain many snippets of DIF (i.e. >1 DIFOs), which * are represented themselves as a collection of related DOF sections. This * permits us to change the set of sections associated with a DIFO over time, * and also permits us to encode DIFOs that contain different sets of sections. * When a DOF section wants to refer to a DIFO, it stores the dof_secidx_t of a * section of type DOF_SECT_DIFOHDR. This section's data is then an array of * dof_secidx_t's which in turn denote the sections associated with this DIFO. * * This loose coupling of the file structure (header and sections) to the * structure of the DTrace program itself (ECB descriptions, action * descriptions, and DIFOs) permits activities such as relocation processing * to occur in a single pass without having to understand D program structure. * * Finally, strings are always stored in ELF-style string tables along with a * string table section index and string table offset. Therefore strings in * DOF are always arbitrary-length and not bound to the current implementation. */ #define DOF_ID_SIZE 16 /* total size of dofh_ident[] in bytes */ typedef struct dof_hdr { uint8_t dofh_ident[DOF_ID_SIZE]; /* identification bytes (see below) */ uint32_t dofh_flags; /* file attribute flags (if any) */ uint32_t dofh_hdrsize; /* size of file header in bytes */ uint32_t dofh_secsize; /* size of section header in bytes */ uint32_t dofh_secnum; /* number of section headers */ uint64_t dofh_secoff; /* file offset of section headers */ uint64_t dofh_loadsz; /* file size of loadable portion */ uint64_t dofh_filesz; /* file size of entire DOF file */ uint64_t dofh_pad; /* reserved for future use */ } dof_hdr_t; #define DOF_ID_MAG0 0 /* first byte of magic number */ #define DOF_ID_MAG1 1 /* second byte of magic number */ #define DOF_ID_MAG2 2 /* third byte of magic number */ #define DOF_ID_MAG3 3 /* fourth byte of magic number */ #define DOF_ID_MODEL 4 /* DOF data model (see below) */ #define DOF_ID_ENCODING 5 /* DOF data encoding (see below) */ #define DOF_ID_VERSION 6 /* DOF file format major version (see below) */ #define DOF_ID_DIFVERS 7 /* DIF instruction set version */ #define DOF_ID_DIFIREG 8 /* DIF integer registers used by compiler */ #define DOF_ID_DIFTREG 9 /* DIF tuple registers used by compiler */ #define DOF_ID_PAD 10 /* start of padding bytes (all zeroes) */ #define DOF_MAG_MAG0 0x7F /* DOF_ID_MAG[0-3] */ #define DOF_MAG_MAG1 'D' #define DOF_MAG_MAG2 'O' #define DOF_MAG_MAG3 'F' #define DOF_MAG_STRING "\177DOF" #define DOF_MAG_STRLEN 4 #define DOF_MODEL_NONE 0 /* DOF_ID_MODEL */ #define DOF_MODEL_ILP32 1 #define DOF_MODEL_LP64 2 #ifdef _LP64 #define DOF_MODEL_NATIVE DOF_MODEL_LP64 #else #define DOF_MODEL_NATIVE DOF_MODEL_ILP32 #endif #define DOF_ENCODE_NONE 0 /* DOF_ID_ENCODING */ #define DOF_ENCODE_LSB 1 #define DOF_ENCODE_MSB 2 #if BYTE_ORDER == _BIG_ENDIAN #define DOF_ENCODE_NATIVE DOF_ENCODE_MSB #else #define DOF_ENCODE_NATIVE DOF_ENCODE_LSB #endif #define DOF_VERSION_1 1 /* DOF version 1: Solaris 10 FCS */ #define DOF_VERSION_2 2 /* DOF version 2: 
Solaris Express 6/06 */ #define DOF_VERSION DOF_VERSION_2 /* Latest DOF version */ #define DOF_FL_VALID 0 /* mask of all valid dofh_flags bits */ typedef uint32_t dof_secidx_t; /* section header table index type */ typedef uint32_t dof_stridx_t; /* string table index type */ #define DOF_SECIDX_NONE (-1U) /* null value for section indices */ #define DOF_STRIDX_NONE (-1U) /* null value for string indices */ typedef struct dof_sec { uint32_t dofs_type; /* section type (see below) */ uint32_t dofs_align; /* section data memory alignment */ uint32_t dofs_flags; /* section flags (if any) */ uint32_t dofs_entsize; /* size of section entry (if table) */ uint64_t dofs_offset; /* offset of section data within file */ uint64_t dofs_size; /* size of section data in bytes */ } dof_sec_t; #define DOF_SECT_NONE 0 /* null section */ #define DOF_SECT_COMMENTS 1 /* compiler comments */ #define DOF_SECT_SOURCE 2 /* D program source code */ #define DOF_SECT_ECBDESC 3 /* dof_ecbdesc_t */ #define DOF_SECT_PROBEDESC 4 /* dof_probedesc_t */ #define DOF_SECT_ACTDESC 5 /* dof_actdesc_t array */ #define DOF_SECT_DIFOHDR 6 /* dof_difohdr_t (variable length) */ #define DOF_SECT_DIF 7 /* uint32_t array of byte code */ #define DOF_SECT_STRTAB 8 /* string table */ #define DOF_SECT_VARTAB 9 /* dtrace_difv_t array */ #define DOF_SECT_RELTAB 10 /* dof_relodesc_t array */ #define DOF_SECT_TYPTAB 11 /* dtrace_diftype_t array */ #define DOF_SECT_URELHDR 12 /* dof_relohdr_t (user relocations) */ #define DOF_SECT_KRELHDR 13 /* dof_relohdr_t (kernel relocations) */ #define DOF_SECT_OPTDESC 14 /* dof_optdesc_t array */ #define DOF_SECT_PROVIDER 15 /* dof_provider_t */ #define DOF_SECT_PROBES 16 /* dof_probe_t array */ #define DOF_SECT_PRARGS 17 /* uint8_t array (probe arg mappings) */ #define DOF_SECT_PROFFS 18 /* uint32_t array (probe arg offsets) */ #define DOF_SECT_INTTAB 19 /* uint64_t array */ #define DOF_SECT_UTSNAME 20 /* struct utsname */ #define DOF_SECT_XLTAB 21 /* dof_xlref_t array */ #define DOF_SECT_XLMEMBERS 22 /* dof_xlmember_t array */ #define DOF_SECT_XLIMPORT 23 /* dof_xlator_t */ #define DOF_SECT_XLEXPORT 24 /* dof_xlator_t */ #define DOF_SECT_PREXPORT 25 /* dof_secidx_t array (exported objs) */ #define DOF_SECT_PRENOFFS 26 /* uint32_t array (enabled offsets) */ #define DOF_SECF_LOAD 1 /* section should be loaded */ #define DOF_SEC_ISLOADABLE(x) \ (((x) == DOF_SECT_ECBDESC) || ((x) == DOF_SECT_PROBEDESC) || \ ((x) == DOF_SECT_ACTDESC) || ((x) == DOF_SECT_DIFOHDR) || \ ((x) == DOF_SECT_DIF) || ((x) == DOF_SECT_STRTAB) || \ ((x) == DOF_SECT_VARTAB) || ((x) == DOF_SECT_RELTAB) || \ ((x) == DOF_SECT_TYPTAB) || ((x) == DOF_SECT_URELHDR) || \ ((x) == DOF_SECT_KRELHDR) || ((x) == DOF_SECT_OPTDESC) || \ ((x) == DOF_SECT_PROVIDER) || ((x) == DOF_SECT_PROBES) || \ ((x) == DOF_SECT_PRARGS) || ((x) == DOF_SECT_PROFFS) || \ ((x) == DOF_SECT_INTTAB) || ((x) == DOF_SECT_XLTAB) || \ ((x) == DOF_SECT_XLMEMBERS) || ((x) == DOF_SECT_XLIMPORT) || \ ((x) == DOF_SECT_XLEXPORT) || \ ((x) == DOF_SECT_PREXPORT) || ((x) == DOF_SECT_PRENOFFS)) typedef struct dof_ecbdesc { dof_secidx_t dofe_probes; /* link to DOF_SECT_PROBEDESC */ dof_secidx_t dofe_pred; /* link to DOF_SECT_DIFOHDR */ dof_secidx_t dofe_actions; /* link to DOF_SECT_ACTDESC */ uint32_t dofe_pad; /* reserved for future use */ uint64_t dofe_uarg; /* user-supplied library argument */ } dof_ecbdesc_t; typedef struct dof_probedesc { dof_secidx_t dofp_strtab; /* link to DOF_SECT_STRTAB section */ dof_stridx_t dofp_provider; /* provider string */
dof_stridx_t dofp_mod; /* module string */ dof_stridx_t dofp_func; /* function string */ dof_stridx_t dofp_name; /* name string */ uint32_t dofp_id; /* probe identifier (or zero) */ } dof_probedesc_t; typedef struct dof_actdesc { dof_secidx_t dofa_difo; /* link to DOF_SECT_DIFOHDR */ dof_secidx_t dofa_strtab; /* link to DOF_SECT_STRTAB section */ uint32_t dofa_kind; /* action kind (DTRACEACT_* constant) */ uint32_t dofa_ntuple; /* number of subsequent tuple actions */ uint64_t dofa_arg; /* kind-specific argument */ uint64_t dofa_uarg; /* user-supplied argument */ } dof_actdesc_t; typedef struct dof_difohdr { dtrace_diftype_t dofd_rtype; /* return type for this fragment */ dof_secidx_t dofd_links[1]; /* variable length array of indices */ } dof_difohdr_t; typedef struct dof_relohdr { dof_secidx_t dofr_strtab; /* link to DOF_SECT_STRTAB for names */ dof_secidx_t dofr_relsec; /* link to DOF_SECT_RELTAB for relos */ dof_secidx_t dofr_tgtsec; /* link to section we are relocating */ } dof_relohdr_t; typedef struct dof_relodesc { dof_stridx_t dofr_name; /* string name of relocation symbol */ uint32_t dofr_type; /* relo type (DOF_RELO_* constant) */ uint64_t dofr_offset; /* byte offset for relocation */ uint64_t dofr_data; /* additional type-specific data */ } dof_relodesc_t; #define DOF_RELO_NONE 0 /* empty relocation entry */ #define DOF_RELO_SETX 1 /* relocate setx value */ typedef struct dof_optdesc { uint32_t dofo_option; /* option identifier */ dof_secidx_t dofo_strtab; /* string table, if string option */ uint64_t dofo_value; /* option value or string index */ } dof_optdesc_t; typedef uint32_t dof_attr_t; /* encoded stability attributes */ #define DOF_ATTR(n, d, c) (((n) << 24) | ((d) << 16) | ((c) << 8)) #define DOF_ATTR_NAME(a) (((a) >> 24) & 0xff) #define DOF_ATTR_DATA(a) (((a) >> 16) & 0xff) #define DOF_ATTR_CLASS(a) (((a) >> 8) & 0xff) typedef struct dof_provider { dof_secidx_t dofpv_strtab; /* link to DOF_SECT_STRTAB section */ dof_secidx_t dofpv_probes; /* link to DOF_SECT_PROBES section */ dof_secidx_t dofpv_prargs; /* link to DOF_SECT_PRARGS section */ dof_secidx_t dofpv_proffs; /* link to DOF_SECT_PROFFS section */ dof_stridx_t dofpv_name; /* provider name string */ dof_attr_t dofpv_provattr; /* provider attributes */ dof_attr_t dofpv_modattr; /* module attributes */ dof_attr_t dofpv_funcattr; /* function attributes */ dof_attr_t dofpv_nameattr; /* name attributes */ dof_attr_t dofpv_argsattr; /* args attributes */ dof_secidx_t dofpv_prenoffs; /* link to DOF_SECT_PRENOFFS section */ } dof_provider_t; typedef struct dof_probe { uint64_t dofpr_addr; /* probe base address or offset */ dof_stridx_t dofpr_func; /* probe function string */ dof_stridx_t dofpr_name; /* probe name string */ dof_stridx_t dofpr_nargv; /* native argument type strings */ dof_stridx_t dofpr_xargv; /* translated argument type strings */ uint32_t dofpr_argidx; /* index of first argument mapping */ uint32_t dofpr_offidx; /* index of first offset entry */ uint8_t dofpr_nargc; /* native argument count */ uint8_t dofpr_xargc; /* translated argument count */ uint16_t dofpr_noffs; /* number of offset entries for probe */ uint32_t dofpr_enoffidx; /* index of first is-enabled offset */ uint16_t dofpr_nenoffs; /* number of is-enabled offsets */ uint16_t dofpr_pad1; /* reserved for future use */ uint32_t dofpr_pad2; /* reserved for future use */ } dof_probe_t; typedef struct dof_xlator { dof_secidx_t dofxl_members; /* link to DOF_SECT_XLMEMBERS section */ dof_secidx_t dofxl_strtab; /* link to DOF_SECT_STRTAB section */ 
dof_stridx_t dofxl_argv; /* input parameter type strings */ uint32_t dofxl_argc; /* input parameter list length */ dof_stridx_t dofxl_type; /* output type string name */ dof_attr_t dofxl_attr; /* output stability attributes */ } dof_xlator_t; typedef struct dof_xlmember { dof_secidx_t dofxm_difo; /* member link to DOF_SECT_DIFOHDR */ dof_stridx_t dofxm_name; /* member name */ dtrace_diftype_t dofxm_type; /* member type */ } dof_xlmember_t; typedef struct dof_xlref { dof_secidx_t dofxr_xlator; /* link to DOF_SECT_XLATORS section */ uint32_t dofxr_member; /* index of referenced dof_xlmember */ uint32_t dofxr_argn; /* index of argument for DIF_OP_XLARG */ } dof_xlref_t; /* * DTrace Intermediate Format Object (DIFO) * * A DIFO is used to store the compiled DIF for a D expression, its return * type, and its string and variable tables. The string table is a single * buffer of character data into which sets instructions and variable * references can reference strings using a byte offset. The variable table * is an array of dtrace_difv_t structures that describe the name and type of * each variable and the id used in the DIF code. This structure is described * above in the DIF section of this header file. The DIFO is used at both * user-level (in the library) and in the kernel, but the structure is never * passed between the two: the DOF structures form the only interface. As a * result, the definition can change depending on the presence of _KERNEL. */ typedef struct dtrace_difo { dif_instr_t *dtdo_buf; /* instruction buffer */ uint64_t *dtdo_inttab; /* integer table (optional) */ char *dtdo_strtab; /* string table (optional) */ dtrace_difv_t *dtdo_vartab; /* variable table (optional) */ uint_t dtdo_len; /* length of instruction buffer */ uint_t dtdo_intlen; /* length of integer table */ uint_t dtdo_strlen; /* length of string table */ uint_t dtdo_varlen; /* length of variable table */ dtrace_diftype_t dtdo_rtype; /* return type */ uint_t dtdo_refcnt; /* owner reference count */ uint_t dtdo_destructive; /* invokes destructive subroutines */ #ifndef _KERNEL dof_relodesc_t *dtdo_kreltab; /* kernel relocations */ dof_relodesc_t *dtdo_ureltab; /* user relocations */ struct dt_node **dtdo_xlmtab; /* translator references */ uint_t dtdo_krelen; /* length of krelo table */ uint_t dtdo_urelen; /* length of urelo table */ uint_t dtdo_xlmlen; /* length of translator table */ #endif } dtrace_difo_t; /* * DTrace Enabling Description Structures * * When DTrace is tracking the description of a DTrace enabling entity (probe, * predicate, action, ECB, record, etc.), it does so in a description * structure. These structures all end in "desc", and are used at both * user-level and in the kernel -- but (with the exception of * dtrace_probedesc_t) they are never passed between them. Typically, * user-level will use the description structures when assembling an enabling. * It will then distill those description structures into a DOF object (see * above), and send it into the kernel. The kernel will again use the * description structures to create a description of the enabling as it reads * the DOF. When the description is complete, the enabling will be actually * created -- turning it into the structures that represent the enabling * instead of merely describing it. Not surprisingly, the description * structures bear a strong resemblance to the DOF structures that act as their * conduit. 
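As a concrete example of a description structure, the sketch below fills in the simplest of them, the dtrace_probedesc_t defined just below, for a hypothetical syscall::read:entry match; strlcpy from <string.h> is assumed.

static void
probedesc_init(dtrace_probedesc_t *pdp)
{
	pdp->dtpd_id = 0;		/* no specific probe id yet */
	(void) strlcpy(pdp->dtpd_provider, "syscall", DTRACE_PROVNAMELEN);
	pdp->dtpd_mod[0] = '\0';	/* empty field: match any module */
	(void) strlcpy(pdp->dtpd_func, "read", DTRACE_FUNCNAMELEN);
	(void) strlcpy(pdp->dtpd_name, "entry", DTRACE_NAMELEN);
}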
*/ struct dtrace_predicate; typedef struct dtrace_probedesc { dtrace_id_t dtpd_id; /* probe identifier */ char dtpd_provider[DTRACE_PROVNAMELEN]; /* probe provider name */ char dtpd_mod[DTRACE_MODNAMELEN]; /* probe module name */ char dtpd_func[DTRACE_FUNCNAMELEN]; /* probe function name */ char dtpd_name[DTRACE_NAMELEN]; /* probe name */ } dtrace_probedesc_t; typedef struct dtrace_repldesc { dtrace_probedesc_t dtrpd_match; /* probe descr. to match */ dtrace_probedesc_t dtrpd_create; /* probe descr. to create */ } dtrace_repldesc_t; typedef struct dtrace_preddesc { dtrace_difo_t *dtpdd_difo; /* pointer to DIF object */ struct dtrace_predicate *dtpdd_predicate; /* pointer to predicate */ } dtrace_preddesc_t; typedef struct dtrace_actdesc { dtrace_difo_t *dtad_difo; /* pointer to DIF object */ struct dtrace_actdesc *dtad_next; /* next action */ dtrace_actkind_t dtad_kind; /* kind of action */ uint32_t dtad_ntuple; /* number in tuple */ uint64_t dtad_arg; /* action argument */ uint64_t dtad_uarg; /* user argument */ int dtad_refcnt; /* reference count */ } dtrace_actdesc_t; typedef struct dtrace_ecbdesc { dtrace_actdesc_t *dted_action; /* action description(s) */ dtrace_preddesc_t dted_pred; /* predicate description */ dtrace_probedesc_t dted_probe; /* probe description */ uint64_t dted_uarg; /* library argument */ int dted_refcnt; /* reference count */ } dtrace_ecbdesc_t; /* * DTrace Metadata Description Structures * * DTrace separates the trace data stream from the metadata stream. The only * metadata tokens placed in the data stream are the dtrace_rechdr_t (EPID + * timestamp) or (in the case of aggregations) aggregation identifiers. To * determine the structure of the data, DTrace consumers pass the token to the * kernel, and receive in return a corresponding description of the enabled * probe (via the dtrace_eprobedesc structure) or the aggregation (via the * dtrace_aggdesc structure). Both of these structures are expressed in terms * of record descriptions (via the dtrace_recdesc structure) that describe the * exact structure of the data. Some record descriptions may also contain a * format identifier; this additional bit of metadata can be retrieved from the * kernel, for which a format description is returned via the dtrace_fmtdesc * structure. Note that all four of these structures must be bitness-neutral * to allow for a 32-bit DTrace consumer on a 64-bit kernel. 
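As an illustration of how a consumer applies this metadata, here is a minimal sketch that walks the record descriptions of an enabled probe (the dtrace_eprobedesc_t and dtrace_recdesc_t structures are defined just below), computing the furthest payload byte any record touches:

static uint32_t
eprobe_payload_end(const dtrace_eprobedesc_t *epd)
{
	uint32_t end = 0;
	int i;

	for (i = 0; i < epd->dtepd_nrecs; i++) {
		const dtrace_recdesc_t *rec = &epd->dtepd_rec[i];

		/* Each record's payload is dtrd_size bytes at dtrd_offset. */
		if (rec->dtrd_offset + rec->dtrd_size > end)
			end = rec->dtrd_offset + rec->dtrd_size;
	}
	return (end);
}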
*/ typedef struct dtrace_recdesc { dtrace_actkind_t dtrd_action; /* kind of action */ uint32_t dtrd_size; /* size of record */ uint32_t dtrd_offset; /* offset in ECB's data */ uint16_t dtrd_alignment; /* required alignment */ uint16_t dtrd_format; /* format, if any */ uint64_t dtrd_arg; /* action argument */ uint64_t dtrd_uarg; /* user argument */ } dtrace_recdesc_t; typedef struct dtrace_eprobedesc { dtrace_epid_t dtepd_epid; /* enabled probe ID */ dtrace_id_t dtepd_probeid; /* probe ID */ uint64_t dtepd_uarg; /* library argument */ uint32_t dtepd_size; /* total size */ int dtepd_nrecs; /* number of records */ dtrace_recdesc_t dtepd_rec[1]; /* records themselves */ } dtrace_eprobedesc_t; typedef struct dtrace_aggdesc { DTRACE_PTR(char, dtagd_name); /* not filled in by kernel */ dtrace_aggvarid_t dtagd_varid; /* not filled in by kernel */ int dtagd_flags; /* not filled in by kernel */ dtrace_aggid_t dtagd_id; /* aggregation ID */ dtrace_epid_t dtagd_epid; /* enabled probe ID */ uint32_t dtagd_size; /* size in bytes */ int dtagd_nrecs; /* number of records */ uint32_t dtagd_pad; /* explicit padding */ dtrace_recdesc_t dtagd_rec[1]; /* record descriptions */ } dtrace_aggdesc_t; typedef struct dtrace_fmtdesc { DTRACE_PTR(char, dtfd_string); /* format string */ int dtfd_length; /* length of format string */ uint16_t dtfd_format; /* format identifier */ } dtrace_fmtdesc_t; #define DTRACE_SIZEOF_EPROBEDESC(desc) \ (sizeof (dtrace_eprobedesc_t) + ((desc)->dtepd_nrecs ? \ (((desc)->dtepd_nrecs - 1) * sizeof (dtrace_recdesc_t)) : 0)) #define DTRACE_SIZEOF_AGGDESC(desc) \ (sizeof (dtrace_aggdesc_t) + ((desc)->dtagd_nrecs ? \ (((desc)->dtagd_nrecs - 1) * sizeof (dtrace_recdesc_t)) : 0)) /* * DTrace Option Interface * * Run-time DTrace options are set and retrieved via DOF_SECT_OPTDESC sections * in a DOF image. The dof_optdesc structure contains an option identifier and * an option value. The valid option identifiers are found below; the mapping * between option identifiers and option identifying strings is maintained at * user-level. Note that the value of DTRACEOPT_UNSET is such that all of the * following are potentially valid option values: all positive integers, zero * and negative one. Some options (notably "bufpolicy" and "bufresize") take * predefined tokens as their values; these are defined with * DTRACEOPT_{option}_{token}. 
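For example, a consumer requesting a larger principal buffer would encode a dof_optdesc_t (defined earlier in this file) along these lines; this is a sketch, the 4MB value is arbitrary, and non-string options carry DOF_SECIDX_NONE in dofo_strtab:

static dof_optdesc_t bufsize_opt = {
	.dofo_option = DTRACEOPT_BUFSIZE,	/* "bufsize" */
	.dofo_strtab = DOF_SECIDX_NONE,		/* not a string option */
	.dofo_value = 4 * 1024 * 1024		/* 4MB principal buffer */
};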
*/ #define DTRACEOPT_BUFSIZE 0 /* buffer size */ #define DTRACEOPT_BUFPOLICY 1 /* buffer policy */ #define DTRACEOPT_DYNVARSIZE 2 /* dynamic variable size */ #define DTRACEOPT_AGGSIZE 3 /* aggregation size */ #define DTRACEOPT_SPECSIZE 4 /* speculation size */ #define DTRACEOPT_NSPEC 5 /* number of speculations */ #define DTRACEOPT_STRSIZE 6 /* string size */ #define DTRACEOPT_CLEANRATE 7 /* dynvar cleaning rate */ #define DTRACEOPT_CPU 8 /* CPU to trace */ #define DTRACEOPT_BUFRESIZE 9 /* buffer resizing policy */ #define DTRACEOPT_GRABANON 10 /* grab anonymous state, if any */ #define DTRACEOPT_FLOWINDENT 11 /* indent function entry/return */ #define DTRACEOPT_QUIET 12 /* only output explicitly traced data */ #define DTRACEOPT_STACKFRAMES 13 /* number of stack frames */ #define DTRACEOPT_USTACKFRAMES 14 /* number of user stack frames */ #define DTRACEOPT_AGGRATE 15 /* aggregation snapshot rate */ #define DTRACEOPT_SWITCHRATE 16 /* buffer switching rate */ #define DTRACEOPT_STATUSRATE 17 /* status rate */ #define DTRACEOPT_DESTRUCTIVE 18 /* destructive actions allowed */ #define DTRACEOPT_STACKINDENT 19 /* output indent for stack traces */ #define DTRACEOPT_RAWBYTES 20 /* always print bytes in raw form */ #define DTRACEOPT_JSTACKFRAMES 21 /* number of jstack() frames */ #define DTRACEOPT_JSTACKSTRSIZE 22 /* size of jstack() string table */ #define DTRACEOPT_AGGSORTKEY 23 /* sort aggregations by key */ #define DTRACEOPT_AGGSORTREV 24 /* reverse-sort aggregations */ #define DTRACEOPT_AGGSORTPOS 25 /* agg. position to sort on */ #define DTRACEOPT_AGGSORTKEYPOS 26 /* agg. key position to sort on */ #define DTRACEOPT_TEMPORAL 27 /* temporally ordered output */ #define DTRACEOPT_AGGHIST 28 /* histogram aggregation output */ #define DTRACEOPT_AGGPACK 29 /* packed aggregation output */ #define DTRACEOPT_AGGZOOM 30 /* zoomed aggregation scaling */ #define DTRACEOPT_ZONE 31 /* zone in which to enable probes */ #define DTRACEOPT_MAX 32 /* number of options */ #define DTRACEOPT_UNSET (dtrace_optval_t)-2 /* unset option */ #define DTRACEOPT_BUFPOLICY_RING 0 /* ring buffer */ #define DTRACEOPT_BUFPOLICY_FILL 1 /* fill buffer, then stop */ #define DTRACEOPT_BUFPOLICY_SWITCH 2 /* switch buffers */ #define DTRACEOPT_BUFRESIZE_AUTO 0 /* automatic resizing */ #define DTRACEOPT_BUFRESIZE_MANUAL 1 /* manual resizing */ /* * DTrace Buffer Interface * * In order to get a snapshot of the principal or aggregation buffer, * user-level passes a buffer description to the kernel with the dtrace_bufdesc * structure. This describes which CPU user-level is interested in, and * where user-level wishes the kernel to snapshot the buffer to (the * dtbd_data field). The kernel uses the same structure to pass back some * information regarding the buffer: the size of data actually copied out, the * number of drops, the number of errors, the offset of the oldest record, * and the time of the snapshot. * * If the buffer policy is a "switch" policy, taking a snapshot of the * principal buffer has the additional effect of switching the active and * inactive buffers. Taking a snapshot of the aggregation buffer _always_ has * the additional effect of switching the active and inactive buffers. 
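A minimal user-level sketch of this snapshot flow follows, assuming <string.h> and <sys/ioctl.h> and eliding error handling; it uses the dtrace_bufdesc_t defined just below, and note that the FreeBSD ioctl definitions later in this file take a pointer to the descriptor pointer.

static int
buf_snapshot(int dtfd, uint32_t cpu, char *data, uint64_t size)
{
	dtrace_bufdesc_t desc;
	dtrace_bufdesc_t *descp = &desc;

	memset(&desc, 0, sizeof (desc));
	desc.dtbd_size = size;		/* capacity of 'data' */
	desc.dtbd_cpu = cpu;		/* a CPU id, or DTRACE_CPUALL */
	desc.dtbd_data = data;		/* where the kernel copies out */

	/* The kernel passes back the copied size, drops, errors, etc. */
	return (ioctl(dtfd, DTRACEIOC_BUFSNAP, &descp));
}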
*/ typedef struct dtrace_bufdesc { uint64_t dtbd_size; /* size of buffer */ uint32_t dtbd_cpu; /* CPU or DTRACE_CPUALL */ uint32_t dtbd_errors; /* number of errors */ uint64_t dtbd_drops; /* number of drops */ DTRACE_PTR(char, dtbd_data); /* data */ uint64_t dtbd_oldest; /* offset of oldest record */ uint64_t dtbd_timestamp; /* hrtime of snapshot */ } dtrace_bufdesc_t; /* * Each record in the buffer (dtbd_data) begins with a header that includes * the epid and a timestamp. The timestamp is split into two 4-byte parts * so that we do not require 8-byte alignment. */ typedef struct dtrace_rechdr { dtrace_epid_t dtrh_epid; /* enabled probe id */ uint32_t dtrh_timestamp_hi; /* high bits of hrtime_t */ uint32_t dtrh_timestamp_lo; /* low bits of hrtime_t */ } dtrace_rechdr_t; #define DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) \ ((dtrh)->dtrh_timestamp_lo + \ ((uint64_t)(dtrh)->dtrh_timestamp_hi << 32)) #define DTRACE_RECORD_STORE_TIMESTAMP(dtrh, hrtime) { \ (dtrh)->dtrh_timestamp_lo = (uint32_t)(hrtime); \ (dtrh)->dtrh_timestamp_hi = (hrtime) >> 32; \ } /* * DTrace Status * * The status of DTrace is relayed via the dtrace_status structure. This * structure contains members to count drops other than the capacity drops * available via the buffer interface (see above). This consists of dynamic * drops (including capacity dynamic drops, rinsing drops and dirty drops), and * speculative drops (including capacity speculative drops, drops due to busy * speculative buffers and drops due to unavailable speculative buffers). * Additionally, the status structure contains a field to indicate the number * of "fill"-policy buffers that have been filled and a boolean field to indicate * that exit() has been called. If the dtst_exiting field is non-zero, no * further data will be generated until tracing is stopped (at which time any * enablings of the END action will be processed); if user-level sees that * this field is non-zero, tracing should be stopped as soon as possible. */ typedef struct dtrace_status { uint64_t dtst_dyndrops; /* dynamic drops */ uint64_t dtst_dyndrops_rinsing; /* dyn drops due to rinsing */ uint64_t dtst_dyndrops_dirty; /* dyn drops due to dirty */ uint64_t dtst_specdrops; /* speculative drops */ uint64_t dtst_specdrops_busy; /* spec drops due to busy */ uint64_t dtst_specdrops_unavail; /* spec drops due to unavail */ uint64_t dtst_errors; /* total errors */ uint64_t dtst_filled; /* number of filled bufs */ uint64_t dtst_stkstroverflows; /* stack string tab overflows */ uint64_t dtst_dblerrors; /* errors in ERROR probes */ char dtst_killed; /* non-zero if killed */ char dtst_exiting; /* non-zero if exit() called */ char dtst_pad[6]; /* pad out to 64-bit align */ } dtrace_status_t; /* * DTrace Configuration * * User-level may need to understand some elements of the kernel DTrace * configuration in order to generate correct DIF. This information is * conveyed via the dtrace_conf structure. */ typedef struct dtrace_conf { uint_t dtc_difversion; /* supported DIF version */ uint_t dtc_difintregs; /* # of DIF integer registers */ uint_t dtc_diftupregs; /* # of DIF tuple registers */ uint_t dtc_ctfmodel; /* CTF data model */ uint_t dtc_pad[8]; /* reserved for future use */ } dtrace_conf_t; /* * DTrace Faults * * The constants below DTRACEFLT_LIBRARY indicate probe processing faults; * constants at or above DTRACEFLT_LIBRARY indicate faults in probe * postprocessing at user-level.
Probe processing faults induce an ERROR * probe and are replicated in unistd.d to allow users' ERROR probes to decode * the error condition using these symbolic labels. */ #define DTRACEFLT_UNKNOWN 0 /* Unknown fault */ #define DTRACEFLT_BADADDR 1 /* Bad address */ #define DTRACEFLT_BADALIGN 2 /* Bad alignment */ #define DTRACEFLT_ILLOP 3 /* Illegal operation */ #define DTRACEFLT_DIVZERO 4 /* Divide-by-zero */ #define DTRACEFLT_NOSCRATCH 5 /* Out of scratch space */ #define DTRACEFLT_KPRIV 6 /* Illegal kernel access */ #define DTRACEFLT_UPRIV 7 /* Illegal user access */ #define DTRACEFLT_TUPOFLOW 8 /* Tuple stack overflow */ #define DTRACEFLT_BADSTACK 9 /* Bad stack */ #define DTRACEFLT_LIBRARY 1000 /* Library-level fault */ /* * DTrace Argument Types * * Because it would waste both space and time, argument types do not reside * with the probe. In order to determine argument types for args[X] * variables, the D compiler queries for argument types on a probe-by-probe * basis. (This optimizes for the common case that arguments are either not * used or used in an untyped fashion.) Typed arguments are specified with a * string of the type name in the dtargd_native member of the argument * description structure. Typed arguments may be further translated to types * of greater stability; the provider indicates such a translated argument by * filling in the dtargd_xlate member with the string of the translated type. * Finally, the provider may indicate which argument value a given argument * maps to by setting the dtargd_mapping member -- allowing a single argument * to map to multiple args[X] variables. */ typedef struct dtrace_argdesc { dtrace_id_t dtargd_id; /* probe identifier */ int dtargd_ndx; /* arg number (-1 iff none) */ int dtargd_mapping; /* value mapping */ char dtargd_native[DTRACE_ARGTYPELEN]; /* native type name */ char dtargd_xlate[DTRACE_ARGTYPELEN]; /* translated type name */ } dtrace_argdesc_t; /* * DTrace Stability Attributes * * Each DTrace provider advertises the name and data stability of each of its * probe description components, as well as its architectural dependencies. * The D compiler can query the provider attributes (dtrace_pattr_t below) in * order to compute the properties of an input program and report them.
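For instance, a hypothetical provider might advertise the following attributes, using the dtrace_pattr_t and the stability/class constants defined just below; the particular choices here are purely illustrative:

static dtrace_pattr_t example_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },	/* provider */
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },	/* module */
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },	/* function */
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },	/* name */
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },	/* args[] */
};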
*/ typedef uint8_t dtrace_stability_t; /* stability code (see attributes(5)) */ typedef uint8_t dtrace_class_t; /* architectural dependency class */ #define DTRACE_STABILITY_INTERNAL 0 /* private to DTrace itself */ #define DTRACE_STABILITY_PRIVATE 1 /* private to Sun (see docs) */ #define DTRACE_STABILITY_OBSOLETE 2 /* scheduled for removal */ #define DTRACE_STABILITY_EXTERNAL 3 /* not controlled by Sun */ #define DTRACE_STABILITY_UNSTABLE 4 /* new or rapidly changing */ #define DTRACE_STABILITY_EVOLVING 5 /* less rapidly changing */ #define DTRACE_STABILITY_STABLE 6 /* mature interface from Sun */ #define DTRACE_STABILITY_STANDARD 7 /* industry standard */ #define DTRACE_STABILITY_MAX 7 /* maximum valid stability */ #define DTRACE_CLASS_UNKNOWN 0 /* unknown architectural dependency */ #define DTRACE_CLASS_CPU 1 /* CPU-module-specific */ #define DTRACE_CLASS_PLATFORM 2 /* platform-specific (uname -i) */ #define DTRACE_CLASS_GROUP 3 /* hardware-group-specific (uname -m) */ #define DTRACE_CLASS_ISA 4 /* ISA-specific (uname -p) */ #define DTRACE_CLASS_COMMON 5 /* common to all systems */ #define DTRACE_CLASS_MAX 5 /* maximum valid class */ #define DTRACE_PRIV_NONE 0x0000 #define DTRACE_PRIV_KERNEL 0x0001 #define DTRACE_PRIV_USER 0x0002 #define DTRACE_PRIV_PROC 0x0004 #define DTRACE_PRIV_OWNER 0x0008 #define DTRACE_PRIV_ZONEOWNER 0x0010 #define DTRACE_PRIV_ALL \ (DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER | \ DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER | DTRACE_PRIV_ZONEOWNER) typedef struct dtrace_ppriv { uint32_t dtpp_flags; /* privilege flags */ uid_t dtpp_uid; /* user ID */ zoneid_t dtpp_zoneid; /* zone ID */ } dtrace_ppriv_t; typedef struct dtrace_attribute { dtrace_stability_t dtat_name; /* entity name stability */ dtrace_stability_t dtat_data; /* entity data stability */ dtrace_class_t dtat_class; /* entity data dependency */ } dtrace_attribute_t; typedef struct dtrace_pattr { dtrace_attribute_t dtpa_provider; /* provider attributes */ dtrace_attribute_t dtpa_mod; /* module attributes */ dtrace_attribute_t dtpa_func; /* function attributes */ dtrace_attribute_t dtpa_name; /* name attributes */ dtrace_attribute_t dtpa_args; /* args[] attributes */ } dtrace_pattr_t; typedef struct dtrace_providerdesc { char dtvd_name[DTRACE_PROVNAMELEN]; /* provider name */ dtrace_pattr_t dtvd_attr; /* stability attributes */ dtrace_ppriv_t dtvd_priv; /* privileges required */ } dtrace_providerdesc_t; /* * DTrace Pseudodevice Interface * * DTrace is controlled through ioctl(2)'s to the in-kernel dtrace:dtrace * pseudodevice driver. These ioctls comprise the user-kernel interface to * DTrace. */ #ifdef illumos #define DTRACEIOC (('d' << 24) | ('t' << 16) | ('r' << 8)) #define DTRACEIOC_PROVIDER (DTRACEIOC | 1) /* provider query */ #define DTRACEIOC_PROBES (DTRACEIOC | 2) /* probe query */ #define DTRACEIOC_BUFSNAP (DTRACEIOC | 4) /* snapshot buffer */ #define DTRACEIOC_PROBEMATCH (DTRACEIOC | 5) /* match probes */ #define DTRACEIOC_ENABLE (DTRACEIOC | 6) /* enable probes */ #define DTRACEIOC_AGGSNAP (DTRACEIOC | 7) /* snapshot agg. */ #define DTRACEIOC_EPROBE (DTRACEIOC | 8) /* get eprobe desc. */ #define DTRACEIOC_PROBEARG (DTRACEIOC | 9) /* get probe arg */ #define DTRACEIOC_CONF (DTRACEIOC | 10) /* get config. */ #define DTRACEIOC_STATUS (DTRACEIOC | 11) /* get status */ #define DTRACEIOC_GO (DTRACEIOC | 12) /* start tracing */ #define DTRACEIOC_STOP (DTRACEIOC | 13) /* stop tracing */ #define DTRACEIOC_AGGDESC (DTRACEIOC | 15) /* get agg. desc. 
*/ #define DTRACEIOC_FORMAT (DTRACEIOC | 16) /* get format str */ #define DTRACEIOC_DOFGET (DTRACEIOC | 17) /* get DOF */ #define DTRACEIOC_REPLICATE (DTRACEIOC | 18) /* replicate enab */ #else #define DTRACEIOC_PROVIDER _IOWR('x',1,dtrace_providerdesc_t) /* provider query */ #define DTRACEIOC_PROBES _IOWR('x',2,dtrace_probedesc_t) /* probe query */ #define DTRACEIOC_BUFSNAP _IOW('x',4,dtrace_bufdesc_t *) /* snapshot buffer */ #define DTRACEIOC_PROBEMATCH _IOWR('x',5,dtrace_probedesc_t) /* match probes */ typedef struct { void *dof; /* DOF userland address written to driver. */ int n_matched; /* # matches returned by driver. */ } dtrace_enable_io_t; #define DTRACEIOC_ENABLE _IOWR('x',6,dtrace_enable_io_t) /* enable probes */ #define DTRACEIOC_AGGSNAP _IOW('x',7,dtrace_bufdesc_t *) /* snapshot agg. */ #define DTRACEIOC_EPROBE _IOW('x',8,dtrace_eprobedesc_t) /* get eprobe desc. */ #define DTRACEIOC_PROBEARG _IOWR('x',9,dtrace_argdesc_t) /* get probe arg */ #define DTRACEIOC_CONF _IOR('x',10,dtrace_conf_t) /* get config. */ #define DTRACEIOC_STATUS _IOR('x',11,dtrace_status_t) /* get status */ #define DTRACEIOC_GO _IOR('x',12,processorid_t) /* start tracing */ #define DTRACEIOC_STOP _IOWR('x',13,processorid_t) /* stop tracing */ #define DTRACEIOC_AGGDESC _IOW('x',15,dtrace_aggdesc_t *) /* get agg. desc. */ #define DTRACEIOC_FORMAT _IOWR('x',16,dtrace_fmtdesc_t) /* get format str */ #define DTRACEIOC_DOFGET _IOW('x',17,dof_hdr_t *) /* get DOF */ #define DTRACEIOC_REPLICATE _IOW('x',18,dtrace_repldesc_t) /* replicate enab */ #endif /* * DTrace Helpers * * In general, DTrace establishes probes in processes and takes actions on * processes without knowing their specific user-level structures. Instead of * existing in the framework, process-specific knowledge is contained by the * enabling D program -- which can apply process-specific knowledge by making * appropriate use of DTrace primitives like copyin() and copyinstr() to * operate on user-level data. However, there may exist some specific probes * of particular semantic relevance that the application developer may wish to * explicitly export. For example, an application may wish to export a probe * at the point that it begins and ends certain well-defined transactions. In * addition to providing probes, programs may wish to offer assistance for * certain actions. For example, in highly dynamic environments (e.g., Java), * it may be difficult to obtain a stack trace in terms of meaningful symbol * names (the translation from instruction addresses to corresponding symbol * names may only be possible in situ); these environments may wish to define * a series of actions to be applied in situ to obtain a meaningful stack * trace. * * These two mechanisms -- user-level statically defined tracing and assisting * DTrace actions -- are provided via DTrace _helpers_. Helpers are specified * via DOF, but unlike enabling DOF, helper DOF may contain definitions of * providers, probes and their arguments. If a helper wishes to provide * action assistance, probe descriptions and corresponding DIF actions may be * specified in the helper DOF. For such helper actions, however, the probe * description describes the specific helper: all DTrace helpers have the * provider name "dtrace" and the module name "helper", and the name of the * helper is contained in the function name (for example, the ustack() helper * is named "ustack"). 
Any helper-specific name may be contained in the name * (for example, if a helper were to have a constructor, it might be named * "dtrace:helper:<helper name>:init"). Helper actions are only called when the * action that they are helping is taken. Helper actions may only return DIF * expressions, and may only call the following subroutines: * * alloca() <= Allocates memory out of the consumer's scratch space * bcopy() <= Copies memory to scratch space * copyin() <= Copies memory from user-level into consumer's scratch * copyinto() <= Copies memory into a specific location in scratch * copyinstr() <= Copies a string into a specific location in scratch * * Helper actions may only access the following built-in variables: * * curthread <= Current kthread_t pointer * tid <= Current thread identifier * pid <= Current process identifier * ppid <= Parent process identifier * uid <= Current user ID * gid <= Current group ID * execname <= Current executable name * zonename <= Current zone name * * Helper actions may not manipulate or allocate dynamic variables, but they * may have clause-local and statically-allocated global variables. The * helper action variable state is specific to the helper action -- variables * used by the helper action may not be accessed outside of the helper * action, and the helper action may not access variables that live outside * of it. Helper actions may not load from kernel memory at-large; they are * restricted to loading current user state (via copyin() and variants) and * scratch space. As with probe enablings, helper actions are executed in * program order. The result of the helper action is the result of the last * executing helper expression. * * Helpers -- composed of either providers/probes or probes/actions (or both) * -- are added by opening the "helper" minor node, and issuing an ioctl(2) * (DTRACEHIOC_ADDDOF) that specifies the dof_helper_t structure. This * encapsulates the name and base address of the user-level library or * executable publishing the helpers and probes as well as the DOF that * contains the definitions of those helpers and probes. * * The DTRACEHIOC_ADD and DTRACEHIOC_REMOVE are left in place for legacy * helpers and should no longer be used. No other ioctls are valid on the * helper minor node. */ #ifdef illumos #define DTRACEHIOC (('d' << 24) | ('t' << 16) | ('h' << 8)) #define DTRACEHIOC_ADD (DTRACEHIOC | 1) /* add helper */ #define DTRACEHIOC_REMOVE (DTRACEHIOC | 2) /* remove helper */ #define DTRACEHIOC_ADDDOF (DTRACEHIOC | 3) /* add helper DOF */ #else #define DTRACEHIOC_ADD _IOWR('z', 1, dof_hdr_t)/* add helper */ #define DTRACEHIOC_REMOVE _IOW('z', 2, int) /* remove helper */ #define DTRACEHIOC_ADDDOF _IOWR('z', 3, dof_helper_t)/* add helper DOF */ #endif typedef struct dof_helper { char dofhp_mod[DTRACE_MODNAMELEN]; /* executable or library name */ uint64_t dofhp_addr; /* base address of object */ uint64_t dofhp_dof; /* address of helper DOF */ #ifdef __FreeBSD__ pid_t dofhp_pid; /* target process ID */ int dofhp_gen; #endif } dof_helper_t; #define DTRACEMNR_DTRACE "dtrace" /* node for DTrace ops */ #define DTRACEMNR_HELPER "helper" /* node for helpers */ #define DTRACEMNRN_DTRACE 0 /* minor for DTrace ops */ #define DTRACEMNRN_HELPER 1 /* minor for helpers */ #define DTRACEMNRN_CLONE 2 /* first clone minor */ #ifdef _KERNEL /* * DTrace Provider API * * The following functions are implemented by the DTrace framework and are * used to implement separate in-kernel DTrace providers. Common functions * are provided in uts/common/os/dtrace.c.
ISA-dependent subroutines are * defined in uts/<isa>/dtrace/dtrace_asm.s or uts/<isa>/dtrace/dtrace_isa.c. * * The provider API has two halves: the API that the providers consume from * DTrace, and the API that providers make available to DTrace. * * 1 Framework-to-Provider API * * 1.1 Overview * * The Framework-to-Provider API is represented by the dtrace_pops structure * that the provider passes to the framework when registering itself. This * structure consists of the following members: * * dtps_provide() <-- Provide all probes, all modules * dtps_provide_module() <-- Provide all probes in specified module * dtps_enable() <-- Enable specified probe * dtps_disable() <-- Disable specified probe * dtps_suspend() <-- Suspend specified probe * dtps_resume() <-- Resume specified probe * dtps_getargdesc() <-- Get the argument description for args[X] * dtps_getargval() <-- Get the value for an argX or args[X] variable * dtps_usermode() <-- Find out if the probe was fired in user mode * dtps_destroy() <-- Destroy all state associated with this probe * * 1.2 void dtps_provide(void *arg, const dtrace_probedesc_t *spec) * * 1.2.1 Overview * * Called to indicate that the provider should provide all probes. If the * specified description is non-NULL, dtps_provide() is being called because * no probe matched a specified probe -- if the provider has the ability to * create custom probes, it may wish to create a probe that matches the * specified description. * * 1.2.2 Arguments and notes * * The first argument is the cookie as passed to dtrace_register(). The * second argument is a pointer to a probe description that the provider may * wish to consider when creating custom probes. The provider is expected to * call back into the DTrace framework via dtrace_probe_create() to create * any necessary probes. dtps_provide() may be called even if the provider * has made available all probes; the provider should check the return value * of dtrace_probe_create() to handle this case. Note that the provider need * not implement both dtps_provide() and dtps_provide_module(); see * "Arguments and Notes" for dtrace_register(), below. * * 1.2.3 Return value * * None. * * 1.2.4 Caller's context * * dtps_provide() is typically called from open() or ioctl() context, but may * be called from other contexts as well. The DTrace framework is locked in * such a way that providers may not register or unregister. This means that * the provider may not call any DTrace API that affects its registration with * the framework, including dtrace_register(), dtrace_unregister(), * dtrace_invalidate(), and dtrace_condense(). However, the context is such * that the provider may (and indeed, is expected to) call probe-related * DTrace routines, including dtrace_probe_create(), dtrace_probe_lookup(), * and dtrace_probe_arg(). * * 1.3 void dtps_provide_module(void *arg, modctl_t *mp) * * 1.3.1 Overview * * Called to indicate that the provider should provide all probes in the * specified module. * * 1.3.2 Arguments and notes * * The first argument is the cookie as passed to dtrace_register(). The * second argument is a pointer to a modctl structure that indicates the * module for which probes should be created. * * 1.3.3 Return value * * None. * * 1.3.4 Caller's context * * dtps_provide_module() may be called from open() or ioctl() context, but * may also be called from a module loading context. mod_lock is held, and * the DTrace framework is locked in such a way that providers may not * register or unregister.
This means that the provider may not call any * DTrace API that affects its registration with the framework, including * dtrace_register(), dtrace_unregister(), dtrace_invalidate(), and * dtrace_condense(). However, the context is such that the provider may (and * indeed, is expected to) call probe-related DTrace routines, including * dtrace_probe_create(), dtrace_probe_lookup(), and dtrace_probe_arg(). Note * that the provider need not implement both dtps_provide() and * dtps_provide_module(); see "Arguments and Notes" for dtrace_register(), * below. * * 1.4 void dtps_enable(void *arg, dtrace_id_t id, void *parg) * * 1.4.1 Overview * * Called to enable the specified probe. * * 1.4.2 Arguments and notes * * The first argument is the cookie as passed to dtrace_register(). The * second argument is the identifier of the probe to be enabled. The third * argument is the probe argument as passed to dtrace_probe_create(). * dtps_enable() will be called when a probe transitions from not being * enabled at all to having one or more ECBs. The number of ECBs associated * with the probe may change without subsequent calls into the provider. * When the number of ECBs drops to zero, the provider will be explicitly * told to disable the probe via dtps_disable(). dtrace_probe() should never * be called for a probe identifier that hasn't been explicitly enabled via * dtps_enable(). * * 1.4.3 Return value * * None. * * 1.4.4 Caller's context * * The DTrace framework is locked in such a way that it may not be called * back into at all. cpu_lock is held. mod_lock is not held and may not * be acquired. * * 1.5 void dtps_disable(void *arg, dtrace_id_t id, void *parg) * * 1.5.1 Overview * * Called to disable the specified probe. * * 1.5.2 Arguments and notes * * The first argument is the cookie as passed to dtrace_register(). The * second argument is the identifier of the probe to be disabled. The third * argument is the probe argument as passed to dtrace_probe_create(). * dtps_disable() will be called when a probe transitions from being enabled * to having zero ECBs. dtrace_probe() should never be called for a probe * identifier that has been disabled via dtps_disable(). * * 1.5.3 Return value * * None. * * 1.5.4 Caller's context * * The DTrace framework is locked in such a way that it may not be called * back into at all. cpu_lock is held. mod_lock is not held and may not * be acquired. * * 1.6 void dtps_suspend(void *arg, dtrace_id_t id, void *parg) * * 1.6.1 Overview * * Called to suspend the specified enabled probe. This entry point is for * providers that may need to suspend some or all of their probes when CPUs * are being powered on or when the boot monitor is being entered for a * prolonged period of time. * * 1.6.2 Arguments and notes * * The first argument is the cookie as passed to dtrace_register(). The * second argument is the identifier of the probe to be suspended. The * third argument is the probe argument as passed to dtrace_probe_create(). * dtps_suspend will only be called on an enabled probe. Providers that * provide a dtps_suspend entry point will want to take roughly the same * action as they take for dtps_disable. * * 1.6.3 Return value * * None. * * 1.6.4 Caller's context * * Interrupts are disabled. The DTrace framework is in a state such that the * specified probe cannot be disabled or destroyed for the duration of * dtps_suspend(). As interrupts are disabled, the provider is afforded * little latitude; the provider is expected to do no more than a store to * memory.
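To illustrate sections 1.4 and 1.5, a trivial provider's enable/disable pair might merely toggle a flag that its probe sites consult before calling dtrace_probe(); the example_probe structure here is hypothetical, a minimal sketch rather than a real provider:

struct example_probe {
	int exp_enabled;		/* consulted at the probe site */
};

/*ARGSUSED*/
static void
example_enable(void *arg, dtrace_id_t id, void *parg)
{
	struct example_probe *p = parg;

	p->exp_enabled = 1;		/* probe now has one or more ECBs */
}

/*ARGSUSED*/
static void
example_disable(void *arg, dtrace_id_t id, void *parg)
{
	struct example_probe *p = parg;

	p->exp_enabled = 0;		/* ECB count dropped to zero */
}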
* * 1.7 void dtps_resume(void *arg, dtrace_id_t id, void *parg) * * 1.7.1 Overview * * Called to resume the specified enabled probe. This entry point is for * providers that may need to resume some or all of their probes after the * completion of an event that induced a call to dtps_suspend(). * * 1.7.2 Arguments and notes * * The first argument is the cookie as passed to dtrace_register(). The * second argument is the identifier of the probe to be resumed. The * third argument is the probe argument as passed to dtrace_probe_create(). * dtps_resume will only be called on an enabled probe. Providers that * provide a dtps_resume entry point will want to take roughly the same * action as they take for dtps_enable. * * 1.7.3 Return value * * None. * * 1.7.4 Caller's context * * Interrupts are disabled. The DTrace framework is in a state such that the * specified probe cannot be disabled or destroyed for the duration of * dtps_resume(). As interrupts are disabled, the provider is afforded * little latitude; the provider is expected to do no more than a store to * memory. * * 1.8 void dtps_getargdesc(void *arg, dtrace_id_t id, void *parg, * dtrace_argdesc_t *desc) * * 1.8.1 Overview * * Called to retrieve the argument description for an args[X] variable. * * 1.8.2 Arguments and notes * * The first argument is the cookie as passed to dtrace_register(). The * second argument is the identifier of the current probe. The third * argument is the probe argument as passed to dtrace_probe_create(). The * fourth argument is a pointer to the argument description. This * description is both an input and output parameter: it contains the * index of the desired argument in the dtargd_ndx field, and expects * the other fields to be filled in upon return. If there is no argument * corresponding to the specified index, the dtargd_ndx field should be set * to DTRACE_ARGNONE. * * 1.8.3 Return value * * None. The dtargd_ndx, dtargd_native, dtargd_xlate and dtargd_mapping * members of the dtrace_argdesc_t structure are all output values. * * 1.8.4 Caller's context * * dtps_getargdesc() is called from ioctl() context. mod_lock is held, and * the DTrace framework is locked in such a way that providers may not * register or unregister. This means that the provider may not call any * DTrace API that affects its registration with the framework, including * dtrace_register(), dtrace_unregister(), dtrace_invalidate(), and * dtrace_condense(). * * 1.9 uint64_t dtps_getargval(void *arg, dtrace_id_t id, void *parg, * int argno, int aframes) * * 1.9.1 Overview * * Called to retrieve a value for an argX or args[X] variable. * * 1.9.2 Arguments and notes * * The first argument is the cookie as passed to dtrace_register(). The * second argument is the identifier of the current probe. The third * argument is the probe argument as passed to dtrace_probe_create(). The * fourth argument is the number of the argument (the X in the example in * 1.9.1). The fifth argument is the number of stack frames that were used * to get from the actual place in the code that fired the probe to * dtrace_probe() itself, the so-called artificial frames. This argument may * be used to descend an appropriate number of frames to find the correct * values. If this entry point is left NULL, the dtrace_getarg() built-in * function is used. * * 1.9.3 Return value * * The value of the argument. * * 1.9.4 Caller's context * * This is called from within dtrace_probe() meaning that interrupts * are disabled. No locks should be taken within this entry point.
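A sketch of the dtps_getargdesc() entry point described in section 1.8, for a hypothetical provider whose probes carry a single argument of native type uint64_t (the type choice and helper name are illustrative, not part of any real provider):

/*ARGSUSED*/
static void
example_getargdesc(void *arg, dtrace_id_t id, void *parg,
    dtrace_argdesc_t *desc)
{
	if (desc->dtargd_ndx == 0) {
		(void) strlcpy(desc->dtargd_native, "uint64_t",
		    DTRACE_ARGTYPELEN);
		/* No translation; dtargd_xlate is left empty. */
		desc->dtargd_mapping = desc->dtargd_ndx;
	} else {
		desc->dtargd_ndx = DTRACE_ARGNONE;	/* no such argument */
	}
}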
* * 1.10 int dtps_usermode(void *arg, dtrace_id_t id, void *parg) * * 1.10.1 Overview * * Called to determine if the probe was fired in a user context. * * 1.10.2 Arguments and notes * * The first argument is the cookie as passed to dtrace_register(). The * second argument is the identifier of the current probe. The third * argument is the probe argument as passed to dtrace_probe_create(). This * entry point must not be left NULL for providers whose probes allow for * mixed mode tracing, that is to say those probes that can fire during * kernel- _or_ user-mode execution. * * 1.10.3 Return value * * A bitwise OR that encapsulates both the mode (either DTRACE_MODE_KERNEL * or DTRACE_MODE_USER) and the policy when the privilege of the enabling * is insufficient for that mode (a combination of DTRACE_MODE_NOPRIV_DROP, * DTRACE_MODE_NOPRIV_RESTRICT, and DTRACE_MODE_LIMITEDPRIV_RESTRICT). If * the DTRACE_MODE_NOPRIV_DROP bit is set, insufficient privilege will result * in the probe firing being silently ignored for the enabling; if the * DTRACE_MODE_NOPRIV_RESTRICT bit is set, insufficient privilege will not * prevent probe processing for the enabling, but restrictions will be in * place that induce a UPRIV fault upon attempt to examine probe arguments * or current process state. If the DTRACE_MODE_LIMITEDPRIV_RESTRICT bit * is set, similar restrictions will be placed upon operation if the * privilege is sufficient to process the enabling, but does not otherwise * entitle the enabling to all zones. The DTRACE_MODE_NOPRIV_DROP and * DTRACE_MODE_NOPRIV_RESTRICT are mutually exclusive (and one of these * two policies must be specified), but either may be combined (or not) * with DTRACE_MODE_LIMITEDPRIV_RESTRICT. * * 1.10.4 Caller's context * * This is called from within dtrace_probe() meaning that interrupts * are disabled. No locks should be taken within this entry point. * * 1.11 void dtps_destroy(void *arg, dtrace_id_t id, void *parg) * * 1.11.1 Overview * * Called to destroy the specified probe. * * 1.11.2 Arguments and notes * * The first argument is the cookie as passed to dtrace_register(). The * second argument is the identifier of the probe to be destroyed. The third * argument is the probe argument as passed to dtrace_probe_create(). The * provider should free all state associated with the probe. The framework * guarantees that dtps_destroy() is only called for probes that have either * been disabled via dtps_disable() or were never enabled via dtps_enable(). * Once dtps_destroy() has been called for a probe, no further call will be * made specifying the probe. * * 1.11.3 Return value * * None. * * 1.11.4 Caller's context * * The DTrace framework is locked in such a way that it may not be called * back into at all. mod_lock is held. cpu_lock is not held, and may not be * acquired. * * * 2 Provider-to-Framework API * * 2.1 Overview * * The Provider-to-Framework API provides the mechanism for the provider to * register itself with the DTrace framework, to create probes, to lookup * probes and (most importantly) to fire probes. The Provider-to-Framework API
The Provider-to-Framework * consists of: * * dtrace_register() <-- Register a provider with the DTrace framework * dtrace_unregister() <-- Remove a provider's DTrace registration * dtrace_invalidate() <-- Invalidate the specified provider * dtrace_condense() <-- Remove a provider's unenabled probes * dtrace_attached() <-- Indicates whether or not DTrace has attached * dtrace_probe_create() <-- Create a DTrace probe * dtrace_probe_lookup() <-- Lookup a DTrace probe based on its name * dtrace_probe_arg() <-- Return the probe argument for a specific probe * dtrace_probe() <-- Fire the specified probe * * 2.2 int dtrace_register(const char *name, const dtrace_pattr_t *pap, * uint32_t priv, cred_t *cr, const dtrace_pops_t *pops, void *arg, * dtrace_provider_id_t *idp) * * 2.2.1 Overview * * dtrace_register() registers the calling provider with the DTrace * framework. It should generally be called by DTrace providers in their * attach(9E) entry point. * * 2.2.2 Arguments and Notes * * The first argument is the name of the provider. The second argument is a * pointer to the stability attributes for the provider. The third argument * is the privilege flags for the provider, and must be some combination of: * * DTRACE_PRIV_NONE <= All users may enable probes from this provider * * DTRACE_PRIV_PROC <= Any user with privilege of PRIV_DTRACE_PROC may * enable probes from this provider * * DTRACE_PRIV_USER <= Any user with privilege of PRIV_DTRACE_USER may * enable probes from this provider * * DTRACE_PRIV_KERNEL <= Any user with privilege of PRIV_DTRACE_KERNEL * may enable probes from this provider * * DTRACE_PRIV_OWNER <= This flag places an additional constraint on * the privilege requirements above. These probes * require either (a) a user ID matching the user * ID of the cred passed in the fourth argument * or (b) the PRIV_PROC_OWNER privilege. * * DTRACE_PRIV_ZONEOWNER<= This flag places an additional constraint on * the privilege requirements above. These probes * require either (a) a zone ID matching the zone * ID of the cred passed in the fourth argument * or (b) the PRIV_PROC_ZONE privilege. * * Note that these flags designate the _visibility_ of the probes, not * the conditions under which they may or may not fire. * * The fourth argument is the credential that is associated with the * provider. This argument should be NULL if the privilege flags don't * include DTRACE_PRIV_OWNER or DTRACE_PRIV_ZONEOWNER. If non-NULL, the * framework stashes the uid and zoneid represented by this credential * for use at probe-time, in implicit predicates. These limit visibility * of the probes to users and/or zones which have sufficient privilege to * access them. * * The fifth argument is a DTrace provider operations vector, which provides * the implementation for the Framework-to-Provider API. (See Section 1, * above.) This must be non-NULL, and each member must be non-NULL. The * exceptions to this are (1) the dtps_provide() and dtps_provide_module() * members (if the provider so desires, _one_ of these members may be left * NULL -- denoting that the provider only implements the other) and (2) * the dtps_suspend() and dtps_resume() members, which must either both be * NULL or both be non-NULL. * * The sixth argument is a cookie to be specified as the first argument for * each function in the Framework-to-Provider API. This argument may have * any value. * * The final argument is a pointer to dtrace_provider_id_t. 
If * dtrace_register() successfully completes, the provider identifier will be * stored in the memory pointed to by this argument. This argument must be * non-NULL. * * 2.2.3 Return value * * On success, dtrace_register() returns 0 and stores the new provider's * identifier into the memory pointed to by the idp argument. On failure, * dtrace_register() returns an errno: * * EINVAL The arguments passed to dtrace_register() were somehow invalid. * This may be because a parameter that must be non-NULL was NULL, * because the name was invalid (either empty or an illegal * provider name) or because the attributes were invalid. * * No other failure code is returned. * * 2.2.4 Caller's context * * dtrace_register() may induce calls to dtps_provide(); the provider must * hold no locks across dtrace_register() that may also be acquired by * dtps_provide(). cpu_lock and mod_lock must not be held. * * 2.3 int dtrace_unregister(dtrace_provider_t id) * * 2.3.1 Overview * * Unregisters the specified provider from the DTrace framework. It should * generally be called by DTrace providers in their detach(9E) entry point. * * 2.3.2 Arguments and Notes * * The only argument is the provider identifier, as returned from a * successful call to dtrace_register(). As a result of calling * dtrace_unregister(), the DTrace framework will call back into the provider * via the dtps_destroy() entry point. Once dtrace_unregister() successfully * completes, however, the DTrace framework will no longer make calls through * the Framework-to-Provider API. * * 2.3.3 Return value * * On success, dtrace_unregister() returns 0. On failure, dtrace_unregister() * returns an errno: * * EBUSY There are currently processes that have the DTrace pseudodevice * open, or there exists an anonymous enabling that hasn't yet * been claimed. * * No other failure code is returned. * * 2.3.4 Caller's context * * Because a call to dtrace_unregister() may induce calls through the * Framework-to-Provider API, the caller may not hold any lock across * dtrace_unregister() that is also acquired in any of the Framework-to- * Provider API functions. Additionally, mod_lock may not be held. * * 2.4 void dtrace_invalidate(dtrace_provider_id_t id) * * 2.4.1 Overview * * Invalidates the specified provider. All subsequent probe lookups for the * specified provider will fail, but its probes will not be removed. * * 2.4.2 Arguments and notes * * The only argument is the provider identifier, as returned from a * successful call to dtrace_register(). In general, a provider's probes * always remain valid; dtrace_invalidate() is a mechanism for invalidating * an entire provider, regardless of whether or not its probes are enabled. * Note that dtrace_invalidate() will _not_ prevent already enabled * probes from firing -- it will merely prevent any new enablings of the * provider's probes. * * 2.5 int dtrace_condense(dtrace_provider_id_t id) * * 2.5.1 Overview * * Removes all the unenabled probes for the given provider. This function is * not unlike dtrace_unregister(), except that it doesn't remove the * provider, just as many of its associated probes as it can. * * 2.5.2 Arguments and Notes * * As with dtrace_unregister(), the sole argument is the provider identifier * as returned from a successful call to dtrace_register(). As a result of * calling dtrace_condense(), the DTrace framework will call back into the * given provider's dtps_destroy() entry point for each of the provider's * unenabled probes.
* * 2.5.3 Return value * * Currently, dtrace_condense() always returns 0. However, consumers of this * function should check the return value as appropriate; its behavior may * change in the future. * * 2.5.4 Caller's context * * As with dtrace_unregister(), the caller may not hold any lock across * dtrace_condense() that is also acquired in the provider's entry points. * Also, mod_lock may not be held. * * 2.6 int dtrace_attached() * * 2.6.1 Overview * * Indicates whether or not DTrace has attached. * * 2.6.2 Arguments and Notes * * For most providers, DTrace makes initial contact beyond registration. * That is, once a provider has registered with DTrace, it waits to hear * from DTrace to create probes. However, some providers may wish to * proactively create probes without first being told by DTrace to do so. * If providers wish to do this, they must first call dtrace_attached() to * determine if DTrace itself has attached. If dtrace_attached() returns 0, * the provider must not make any other Provider-to-Framework API call. * * 2.6.3 Return value * * dtrace_attached() returns 1 if DTrace has attached, 0 otherwise. * * 2.7 int dtrace_probe_create(dtrace_provider_t id, const char *mod, * const char *func, const char *name, int aframes, void *arg) * * 2.7.1 Overview * * Creates a probe with the specified module name, function name, and probe * name. * * 2.7.2 Arguments and Notes * * The first argument is the provider identifier, as returned from a * successful call to dtrace_register(). The second, third, and fourth * arguments are the module name, function name, and probe name, * respectively. Of these, module name and function name may both be NULL * (in which case the probe is considered to be unanchored), or they may both * be non-NULL. The name must be non-NULL, and must point to a non-empty * string. * * The fifth argument is the number of artificial stack frames that will be * found on the stack when dtrace_probe() is called for the new probe. These * artificial frames will automatically be pruned should the stack() or * stackdepth() functions be called as part of one of the probe's ECBs. If * the probe doesn't add an artificial frame, this parameter should be * zero. * * The final argument is a probe argument that will be passed back to the * provider when a probe-specific operation is called. (e.g., via * dtps_enable(), dtps_disable(), etc.) * * Note that it is up to the provider to be sure that the probe that it * creates does not already exist -- if the provider is unsure of the probe's * existence, it should assure its absence with dtrace_probe_lookup() before * calling dtrace_probe_create(). * * 2.7.3 Return value * * dtrace_probe_create() always succeeds, and always returns the identifier * of the newly-created probe. * * 2.7.4 Caller's context * * While dtrace_probe_create() is generally expected to be called from * dtps_provide() and/or dtps_provide_module(), it may be called from other * non-DTrace contexts. Neither cpu_lock nor mod_lock may be held. * * 2.8 dtrace_id_t dtrace_probe_lookup(dtrace_provider_t id, const char *mod, * const char *func, const char *name) * * 2.8.1 Overview * * Looks up a probe based on provider and one or more of module name, * function name and probe name. * * 2.8.2 Arguments and Notes * * The first argument is the provider identifier, as returned from a * successful call to dtrace_register(). The second, third, and fourth * arguments are the module name, function name, and probe name, * respectively.
Any of these may be NULL; dtrace_probe_lookup() will return * the identifier of the first probe that is provided by the specified * provider and matches all of the non-NULL matching criteria. * dtrace_probe_lookup() is generally used by a provider to check for the * existence of a probe before creating it with dtrace_probe_create(). * * 2.8.3 Return value * * If the probe exists, returns its identifier. If the probe does not exist, * DTRACE_IDNONE is returned. * * 2.8.4 Caller's context * * While dtrace_probe_lookup() is generally expected to be called from * dtps_provide() and/or dtps_provide_module(), it may also be called from * other non-DTrace contexts. Neither cpu_lock nor mod_lock may be held. * * 2.9 void *dtrace_probe_arg(dtrace_provider_t id, dtrace_id_t probe) * * 2.9.1 Overview * * Returns the probe argument associated with the specified probe. * * 2.9.2 Arguments and Notes * * The first argument is the provider identifier, as returned from a * successful call to dtrace_register(). The second argument is a probe * identifier, as returned from dtrace_probe_lookup() or * dtrace_probe_create(). This is useful if a probe has multiple * provider-specific components to it: the provider can create the probe * once with provider-specific state, and then add to the state by looking * up the probe based on probe identifier. * * 2.9.3 Return value * * Returns the argument associated with the specified probe. If the * specified probe does not exist, or if the specified probe is not provided * by the specified provider, NULL is returned. * * 2.9.4 Caller's context * * While dtrace_probe_arg() is generally expected to be called from * dtps_provide() and/or dtps_provide_module(), it may also be called from * other non-DTrace contexts. Neither cpu_lock nor mod_lock may be held. * * 2.10 void dtrace_probe(dtrace_id_t probe, uintptr_t arg0, uintptr_t arg1, * uintptr_t arg2, uintptr_t arg3, uintptr_t arg4) * * 2.10.1 Overview * * The epicenter of DTrace: fires the specified probe with the specified * arguments. * * 2.10.2 Arguments and Notes * * The first argument is a probe identifier as returned by * dtrace_probe_create() or dtrace_probe_lookup(). The second through sixth * arguments are the values to which the D variables "arg0" through "arg4" * will be mapped. * * dtrace_probe() should be called whenever the specified probe has fired -- * however the provider defines it. * * 2.10.3 Return value * * None. * * 2.10.4 Caller's context * * dtrace_probe() may be called in virtually any context: kernel, user, * interrupt, high-level interrupt, with arbitrary adaptive locks held, with * dispatcher locks held, with interrupts disabled, etc. The only latitude * that must be afforded to DTrace is the ability to make calls within * itself (and to its in-kernel subroutines) and the ability to access * arbitrary (but mapped) memory. On some platforms, this constrains * context. For example, on UltraSPARC, dtrace_probe() cannot be called * from any context in which TL is greater than zero. dtrace_probe() may * also not be called from any routine which may be called by dtrace_probe() * -- which includes functions in the DTrace framework and some in-kernel * DTrace subroutines. All such functions begin with "dtrace_"; providers * that instrument the kernel arbitrarily should be sure to not instrument * these routines.
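 *
 * 2.11 Example
 *
 *   The following sketch is illustrative only and is not part of the
 *   interface contract; the "example" provider name and the exm_* symbols
 *   are hypothetical, as are exm_pops and exm_attr, which are assumed to
 *   be defined elsewhere. It shows the typical call sequence for a
 *   minimal provider:
 *
 *	static dtrace_provider_id_t exm_id;
 *	static dtrace_id_t exm_tick;
 *
 *	static void
 *	exm_provide(void *arg, dtrace_probedesc_t *desc)
 *	{
 *		if (dtrace_probe_lookup(exm_id, NULL, NULL,
 *		    "tick") == DTRACE_IDNONE)
 *			exm_tick = dtrace_probe_create(exm_id, NULL,
 *			    NULL, "tick", 0, NULL);
 *	}
 *
 *   At attach time, the provider registers itself:
 *
 *	error = dtrace_register("example", &exm_attr, DTRACE_PRIV_KERNEL,
 *	    NULL, &exm_pops, NULL, &exm_id);
 *
 *   When the traced event occurs, the provider fires the probe:
 *
 *	dtrace_probe(exm_tick, 0, 0, 0, 0, 0);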
*/ typedef struct dtrace_pops { void (*dtps_provide)(void *arg, dtrace_probedesc_t *spec); void (*dtps_provide_module)(void *arg, modctl_t *mp); void (*dtps_enable)(void *arg, dtrace_id_t id, void *parg); void (*dtps_disable)(void *arg, dtrace_id_t id, void *parg); void (*dtps_suspend)(void *arg, dtrace_id_t id, void *parg); void (*dtps_resume)(void *arg, dtrace_id_t id, void *parg); void (*dtps_getargdesc)(void *arg, dtrace_id_t id, void *parg, dtrace_argdesc_t *desc); uint64_t (*dtps_getargval)(void *arg, dtrace_id_t id, void *parg, int argno, int aframes); int (*dtps_usermode)(void *arg, dtrace_id_t id, void *parg); void (*dtps_destroy)(void *arg, dtrace_id_t id, void *parg); } dtrace_pops_t; #define DTRACE_MODE_KERNEL 0x01 #define DTRACE_MODE_USER 0x02 #define DTRACE_MODE_NOPRIV_DROP 0x10 #define DTRACE_MODE_NOPRIV_RESTRICT 0x20 #define DTRACE_MODE_LIMITEDPRIV_RESTRICT 0x40 typedef uintptr_t dtrace_provider_id_t; extern int dtrace_register(const char *, const dtrace_pattr_t *, uint32_t, cred_t *, const dtrace_pops_t *, void *, dtrace_provider_id_t *); extern int dtrace_unregister(dtrace_provider_id_t); extern int dtrace_condense(dtrace_provider_id_t); extern void dtrace_invalidate(dtrace_provider_id_t); extern dtrace_id_t dtrace_probe_lookup(dtrace_provider_id_t, char *, char *, char *); extern dtrace_id_t dtrace_probe_create(dtrace_provider_id_t, const char *, const char *, const char *, int, void *); extern void *dtrace_probe_arg(dtrace_provider_id_t, dtrace_id_t); extern void dtrace_probe(dtrace_id_t, uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4); /* * DTrace Meta Provider API * * The following functions are implemented by the DTrace framework and are * used to implement meta providers. Meta providers plug into the DTrace * framework and are used to instantiate new providers on the fly. At * present, there is only one type of meta provider and only one meta * provider may be registered with the DTrace framework at a time. The * sole meta provider type provides user-land static tracing facilities * by taking meta probe descriptions and adding a corresponding provider * into the DTrace framework. * * 1 Framework-to-Provider * * 1.1 Overview * * The Framework-to-Provider API is represented by the dtrace_mops structure * that the meta provider passes to the framework when registering itself as * a meta provider. This structure consists of the following members: * * dtms_create_probe() <-- Add a new probe to a created provider * dtms_provide_pid() <-- Create a new provider for a given process * dtms_remove_pid() <-- Remove a previously created provider * * 1.2 void dtms_create_probe(void *arg, void *parg, * dtrace_helper_probedesc_t *probedesc); * * 1.2.1 Overview * * Called by the DTrace framework to create a new probe in a provider * created by this meta provider. * * 1.2.2 Arguments and notes * * The first argument is the cookie as passed to dtrace_meta_register(). * The second argument is the provider cookie for the associated provider; * this is obtained from the return value of dtms_provide_pid(). The third * argument is the helper probe description. * * 1.2.3 Return value * * None * * 1.2.4 Caller's context * * dtms_create_probe() is called from either ioctl() or module load context * in the context of a newly-created provider (that is, a provider that * is a result of a call to dtms_provide_pid()). 
The DTrace framework is * locked in such a way that meta providers may not register or unregister, * that no other thread can call into a meta provider operation, and that * atomicity is assured with respect to meta provider operations across * dtms_provide_pid() and subsequent calls to dtms_create_probe(). * The context is thus effectively single-threaded with respect to the meta * provider, and the meta provider cannot call dtrace_meta_register() * or dtrace_meta_unregister(). However, the context is such that the * provider may (and is expected to) call provider-related DTrace provider * APIs including dtrace_probe_create(). * * 1.3 void *dtms_provide_pid(void *arg, dtrace_meta_provider_t *mprov, * pid_t pid) * * 1.3.1 Overview * * Called by the DTrace framework to instantiate a new provider given the * description of the provider and probes in the mprov argument. The * meta provider should call dtrace_register() to insert the new provider * into the DTrace framework. * * 1.3.2 Arguments and notes * * The first argument is the cookie as passed to dtrace_meta_register(). * The second argument is a pointer to a structure describing the new * helper provider. The third argument is the process identifier for the * process associated with this new provider. Note that the name of the * provider as passed to dtrace_register() should be the concatenation of * the dthpv_provname member of the mprov argument and the process * identifier as a string. * * 1.3.3 Return value * * The cookie for the provider that the meta provider creates. This is * the same value that it passed to dtrace_register(). * * 1.3.4 Caller's context * * dtms_provide_pid() is called from either ioctl() or module load context. * The DTrace framework is locked in such a way that meta providers may not * register or unregister. This means that the meta provider cannot call * dtrace_meta_register() or dtrace_meta_unregister(). However, the context * is such that the provider may -- and is expected to -- call * provider-related DTrace provider APIs including dtrace_register(). * * 1.4 void dtms_remove_pid(void *arg, dtrace_meta_provider_t *mprov, * pid_t pid) * * 1.4.1 Overview * * Called by the DTrace framework to remove a provider that had previously * been instantiated via the dtms_provide_pid() entry point. The meta * provider need not remove the provider immediately, but this entry * point indicates that the provider should be removed as soon as possible * using the dtrace_unregister() API. * * 1.4.2 Arguments and notes * * The first argument is the cookie as passed to dtrace_meta_register(). * The second argument is a pointer to a structure describing the helper * provider. The third argument is the process identifier for the process * associated with this provider. * * 1.4.3 Return value * * None * * 1.4.4 Caller's context * * dtms_remove_pid() is called from either ioctl() or exit() context. * The DTrace framework is locked in such a way that meta providers may not * register or unregister. This means that the meta provider cannot call * dtrace_meta_register() or dtrace_meta_unregister(). However, the context * is such that the provider may -- and is expected to -- call * provider-related DTrace provider APIs including dtrace_unregister().
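 *
 * As an illustrative sketch only (the dtue_* symbols are hypothetical and
 * not part of this interface), a meta provider supplies its operations
 * vector and registers it once:
 *
 *	static dtrace_mops_t dtue_mops = {
 *		dtue_create_probe,
 *		dtue_provide_pid,
 *		dtue_remove_pid
 *	};
 *
 *	static dtrace_meta_provider_id_t dtue_meta_id;
 *
 *	error = dtrace_meta_register("dtue", &dtue_mops, NULL,
 *	    &dtue_meta_id);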
*/ typedef struct dtrace_helper_probedesc { char *dthpb_mod; /* probe module */ char *dthpb_func; /* probe function */ char *dthpb_name; /* probe name */ uint64_t dthpb_base; /* base address */ uint32_t *dthpb_offs; /* offsets array */ uint32_t *dthpb_enoffs; /* is-enabled offsets array */ uint32_t dthpb_noffs; /* offsets count */ uint32_t dthpb_nenoffs; /* is-enabled offsets count */ uint8_t *dthpb_args; /* argument mapping array */ uint8_t dthpb_xargc; /* translated argument count */ uint8_t dthpb_nargc; /* native argument count */ char *dthpb_xtypes; /* translated types strings */ char *dthpb_ntypes; /* native types strings */ } dtrace_helper_probedesc_t; typedef struct dtrace_helper_provdesc { char *dthpv_provname; /* provider name */ dtrace_pattr_t dthpv_pattr; /* stability attributes */ } dtrace_helper_provdesc_t; typedef struct dtrace_mops { void (*dtms_create_probe)(void *, void *, dtrace_helper_probedesc_t *); void *(*dtms_provide_pid)(void *, dtrace_helper_provdesc_t *, pid_t); void (*dtms_remove_pid)(void *, dtrace_helper_provdesc_t *, pid_t); } dtrace_mops_t; typedef uintptr_t dtrace_meta_provider_id_t; extern int dtrace_meta_register(const char *, const dtrace_mops_t *, void *, dtrace_meta_provider_id_t *); extern int dtrace_meta_unregister(dtrace_meta_provider_id_t); /* * DTrace Kernel Hooks * * The following functions are implemented by the base kernel and form a set of * hooks used by the DTrace framework. DTrace hooks are implemented in either * uts/common/os/dtrace_subr.c, an ISA-specific assembly file, or in a * uts/<platform>/os/dtrace_subr.c corresponding to each hardware platform. */ typedef enum dtrace_vtime_state { DTRACE_VTIME_INACTIVE = 0, /* No DTrace, no TNF */ DTRACE_VTIME_ACTIVE, /* DTrace virtual time, no TNF */ DTRACE_VTIME_INACTIVE_TNF, /* No DTrace, TNF active */ DTRACE_VTIME_ACTIVE_TNF /* DTrace virtual time _and_ TNF */ } dtrace_vtime_state_t; #ifdef illumos extern dtrace_vtime_state_t dtrace_vtime_active; #endif extern void dtrace_vtime_switch(kthread_t *next); extern void dtrace_vtime_enable_tnf(void); extern void dtrace_vtime_disable_tnf(void); extern void dtrace_vtime_enable(void); extern void dtrace_vtime_disable(void); struct regs; struct reg; #ifdef illumos extern int (*dtrace_pid_probe_ptr)(struct reg *); extern int (*dtrace_return_probe_ptr)(struct reg *); extern void (*dtrace_fasttrap_fork_ptr)(proc_t *, proc_t *); extern void (*dtrace_fasttrap_exec_ptr)(proc_t *); extern void (*dtrace_fasttrap_exit_ptr)(proc_t *); extern void dtrace_fasttrap_fork(proc_t *, proc_t *); #endif typedef uintptr_t dtrace_icookie_t; typedef void (*dtrace_xcall_t)(void *); extern dtrace_icookie_t dtrace_interrupt_disable(void); extern void dtrace_interrupt_enable(dtrace_icookie_t); extern void dtrace_membar_producer(void); extern void dtrace_membar_consumer(void); extern void (*dtrace_cpu_init)(processorid_t); #ifdef illumos extern void (*dtrace_modload)(modctl_t *); extern void (*dtrace_modunload)(modctl_t *); #endif extern void (*dtrace_helpers_cleanup)(void); extern void (*dtrace_helpers_fork)(proc_t *parent, proc_t *child); extern void (*dtrace_cpustart_init)(void); extern void (*dtrace_cpustart_fini)(void); extern void (*dtrace_closef)(void); extern void (*dtrace_debugger_init)(void); extern void (*dtrace_debugger_fini)(void); extern dtrace_cacheid_t dtrace_predcache_id; #ifdef illumos extern hrtime_t dtrace_gethrtime(void); #else void dtrace_debug_printf(const char *, ...)
__printflike(1, 2); #endif extern void dtrace_sync(void); extern void dtrace_toxic_ranges(void (*)(uintptr_t, uintptr_t)); extern void dtrace_xcall(processorid_t, dtrace_xcall_t, void *); extern void dtrace_vpanic(const char *, __va_list); extern void dtrace_panic(const char *, ...); extern int dtrace_safe_defer_signal(void); extern void dtrace_safe_synchronous_signal(void); extern int dtrace_mach_aframes(void); #if defined(__i386) || defined(__amd64) extern int dtrace_instr_size(uchar_t *instr); extern int dtrace_instr_size_isa(uchar_t *, model_t, int *); extern void dtrace_invop_callsite(void); #endif extern void dtrace_invop_add(int (*)(uintptr_t, struct trapframe *, uintptr_t)); extern void dtrace_invop_remove(int (*)(uintptr_t, struct trapframe *, uintptr_t)); #ifdef __sparc extern int dtrace_blksuword32(uintptr_t, uint32_t *, int); extern void dtrace_getfsr(uint64_t *); #endif #ifndef illumos extern void dtrace_helpers_duplicate(proc_t *, proc_t *); extern void dtrace_helpers_destroy(proc_t *); #endif #define DTRACE_CPUFLAG_ISSET(flag) \ (cpu_core[curcpu].cpuc_dtrace_flags & (flag)) #define DTRACE_CPUFLAG_SET(flag) \ (cpu_core[curcpu].cpuc_dtrace_flags |= (flag)) #define DTRACE_CPUFLAG_CLEAR(flag) \ (cpu_core[curcpu].cpuc_dtrace_flags &= ~(flag)) #endif /* _KERNEL */ #endif /* _ASM */ #if defined(__i386) || defined(__amd64) #define DTRACE_INVOP_PUSHL_EBP 1 #define DTRACE_INVOP_PUSHQ_RBP DTRACE_INVOP_PUSHL_EBP #define DTRACE_INVOP_POPL_EBP 2 #define DTRACE_INVOP_POPQ_RBP DTRACE_INVOP_POPL_EBP #define DTRACE_INVOP_LEAVE 3 #define DTRACE_INVOP_NOP 4 #define DTRACE_INVOP_RET 5 #elif defined(__powerpc__) #define DTRACE_INVOP_RET 1 #define DTRACE_INVOP_BCTR 2 #define DTRACE_INVOP_BLR 3 #define DTRACE_INVOP_JUMP 4 #define DTRACE_INVOP_MFLR_R0 5 #define DTRACE_INVOP_NOP 6 #elif defined(__arm__) #define DTRACE_INVOP_SHIFT 4 #define DTRACE_INVOP_MASK ((1 << DTRACE_INVOP_SHIFT) - 1) #define DTRACE_INVOP_DATA(x) ((x) >> DTRACE_INVOP_SHIFT) #define DTRACE_INVOP_PUSHM 1 #define DTRACE_INVOP_POPM 2 #define DTRACE_INVOP_B 3 #elif defined(__aarch64__) #define INSN_SIZE 4 #define B_MASK 0xff000000 #define B_DATA_MASK 0x00ffffff #define B_INSTR 0x14000000 #define RET_INSTR 0xd65f03c0 #define LDP_STP_MASK 0xffc00000 #define STP_32 0x29800000 #define STP_64 0xa9800000 #define LDP_32 0x28c00000 #define LDP_64 0xa8c00000 #define LDP_STP_PREIND (1 << 24) #define LDP_STP_DIR (1 << 22) /* Load instruction */ #define ARG1_SHIFT 0 #define ARG1_MASK 0x1f #define ARG2_SHIFT 10 #define ARG2_MASK 0x1f #define OFFSET_SHIFT 15 #define OFFSET_SIZE 7 #define OFFSET_MASK ((1 << OFFSET_SIZE) - 1) #define DTRACE_INVOP_PUSHM 1 #define DTRACE_INVOP_RET 2 #define DTRACE_INVOP_B 3 #elif defined(__mips__) #define INSN_SIZE 4 /* Load/Store double RA to/from SP */ #define LDSD_RA_SP_MASK 0xffff0000 #define LDSD_DATA_MASK 0x0000ffff #define SD_RA_SP 0xffbf0000 #define LD_RA_SP 0xdfbf0000 #define DTRACE_INVOP_SD 1 #define DTRACE_INVOP_LD 2 #elif defined(__riscv__) -#define SD_RA_SP_MASK 0x1fff07f -#define SD_RA_SP 0x0113023 +#define SD_RA_SP_MASK 0x01fff07f +#define SD_RA_SP 0x00113023 #define DTRACE_INVOP_SD 1 #define DTRACE_INVOP_RET 2 #define DTRACE_INVOP_NOP 3 #endif #ifdef __cplusplus } #endif #endif /* _SYS_DTRACE_H */ Index: user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris =================================================================== --- user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris (revision 303667) Property changes on: 
user/alc/PQ_LAUNDRY/sys/cddl/contrib/opensolaris ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/cddl/contrib/opensolaris:r303642-303666 Index: user/alc/PQ_LAUNDRY/sys/cddl/dev/dtrace/riscv/dtrace_asm.S =================================================================== --- user/alc/PQ_LAUNDRY/sys/cddl/dev/dtrace/riscv/dtrace_asm.S (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/cddl/dev/dtrace/riscv/dtrace_asm.S (revision 303667) @@ -1,177 +1,177 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END * * Portions Copyright 2016 Ruslan Bukin * * $FreeBSD$ */ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #define _ASM #define _LOCORE #include #include #include #include #include "assym.s" /* void dtrace_membar_producer(void) */ ENTRY(dtrace_membar_producer) RET END(dtrace_membar_producer) /* void dtrace_membar_consumer(void) */ ENTRY(dtrace_membar_consumer) RET END(dtrace_membar_consumer) /* dtrace_icookie_t dtrace_interrupt_disable(void) */ ENTRY(dtrace_interrupt_disable) - csrrci a0, sstatus, SSTATUS_IE - andi a0, a0, SSTATUS_IE + csrrci a0, sstatus, (SSTATUS_SIE) + andi a0, a0, (SSTATUS_SIE) RET END(dtrace_interrupt_disable) /* void dtrace_interrupt_enable(dtrace_icookie_t cookie) */ ENTRY(dtrace_interrupt_enable) csrs sstatus, a0 RET END(dtrace_interrupt_enable) /* uint8_t dtrace_fuword8_nocheck(void *addr) */ ENTRY(dtrace_fuword8_nocheck) lb a0, 0(a0) RET END(dtrace_fuword8_nocheck) /* uint16_t dtrace_fuword16_nocheck(void *addr) */ ENTRY(dtrace_fuword16_nocheck) lh a0, 0(a0) RET END(dtrace_fuword16_nocheck) /* uint32_t dtrace_fuword32_nocheck(void *addr) */ ENTRY(dtrace_fuword32_nocheck) lw a0, 0(a0) RET END(dtrace_fuword32_nocheck) /* uint64_t dtrace_fuword64_nocheck(void *addr) */ ENTRY(dtrace_fuword64_nocheck) ld a0, 0(a0) RET END(dtrace_fuword64_nocheck) /* void dtrace_copy(uintptr_t uaddr, uintptr_t kaddr, size_t size) */ ENTRY(dtrace_copy) beqz a2, 2f /* If len == 0 then skip loop */ 1: lb a4, 0(a0) /* Load from uaddr */ addi a0, a0, 1 sb a4, 0(a1) /* Store in kaddr */ addi a1, a1, 1 addi a2, a2, -1 /* len-- */ bnez a2, 1b 2: RET END(dtrace_copy) /* void dtrace_copystr(uintptr_t uaddr, uintptr_t kaddr, size_t size, volatile uint16_t *flags) XXX: Check for flags? 
*/ ENTRY(dtrace_copystr) beqz a2, 2f /* If len == 0 then skip loop */ 1: lb a4, 0(a0) /* Load from uaddr */ addi a0, a0, 1 sb a4, 0(a1) /* Store in kaddr */ addi a1, a1, 1 beqz a4, 2f /* If == 0 then break */ addi a2, a2, -1 /* len-- */ bnez a2, 1b 2: RET END(dtrace_copystr) /* uintptr_t dtrace_caller(int aframes) */ ENTRY(dtrace_caller) li a0, -1 RET END(dtrace_caller) /* uint32_t dtrace_cas32(uint32_t *target, uint32_t cmp, uint32_t new) */ ENTRY(dtrace_cas32) 1: lr.w a3, 0(a0) /* Load target */ bne a3, a1, 2f /* *target != cmp ? return */ sc.w a4, a2, 0(a0) /* Store new to target */ bnez a4, 1b /* Try again if the store did not succeed */ 2: mv a0, a3 /* Return the value loaded from target */ RET END(dtrace_cas32) /* void * dtrace_casptr(volatile void *target, volatile void *cmp, volatile void *new) */ ENTRY(dtrace_casptr) 1: lr.d a3, 0(a0) /* Load target */ bne a3, a1, 2f /* *target != cmp ? return */ sc.d a4, a2, 0(a0) /* Store new to target */ bnez a4, 1b /* Try again if the store did not succeed */ 2: mv a0, a3 /* Return the value loaded from target */ RET END(dtrace_casptr) Index: user/alc/PQ_LAUNDRY/sys/cddl/dev/dtrace/riscv/dtrace_subr.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/cddl/dev/dtrace/riscv/dtrace_subr.c (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/cddl/dev/dtrace/riscv/dtrace_subr.c (revision 303667) @@ -1,282 +1,282 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END * * Portions Copyright 2016 Ruslan Bukin * * $FreeBSD$ * */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms.
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern dtrace_id_t dtrace_probeid_error; extern int (*dtrace_invop_jump_addr)(struct trapframe *); extern void dtrace_getnanotime(struct timespec *tsp); int dtrace_invop(uintptr_t, struct trapframe *, uintptr_t); void dtrace_invop_init(void); void dtrace_invop_uninit(void); typedef struct dtrace_invop_hdlr { int (*dtih_func)(uintptr_t, struct trapframe *, uintptr_t); struct dtrace_invop_hdlr *dtih_next; } dtrace_invop_hdlr_t; dtrace_invop_hdlr_t *dtrace_invop_hdlr; int dtrace_invop(uintptr_t addr, struct trapframe *frame, uintptr_t eax) { dtrace_invop_hdlr_t *hdlr; int rval; for (hdlr = dtrace_invop_hdlr; hdlr != NULL; hdlr = hdlr->dtih_next) if ((rval = hdlr->dtih_func(addr, frame, eax)) != 0) return (rval); return (0); } void dtrace_invop_add(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr; hdlr = kmem_alloc(sizeof (dtrace_invop_hdlr_t), KM_SLEEP); hdlr->dtih_func = func; hdlr->dtih_next = dtrace_invop_hdlr; dtrace_invop_hdlr = hdlr; } void dtrace_invop_remove(int (*func)(uintptr_t, struct trapframe *, uintptr_t)) { dtrace_invop_hdlr_t *hdlr, *prev; hdlr = dtrace_invop_hdlr; prev = NULL; for (;;) { if (hdlr == NULL) panic("attempt to remove non-existent invop handler"); if (hdlr->dtih_func == func) break; prev = hdlr; hdlr = hdlr->dtih_next; } if (prev == NULL) { ASSERT(dtrace_invop_hdlr == hdlr); dtrace_invop_hdlr = hdlr->dtih_next; } else { ASSERT(dtrace_invop_hdlr != hdlr); prev->dtih_next = hdlr->dtih_next; } kmem_free(hdlr, 0); } /*ARGSUSED*/ void dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) { (*func)(0, (uintptr_t)VM_MIN_KERNEL_ADDRESS); } void dtrace_xcall(processorid_t cpu, dtrace_xcall_t func, void *arg) { cpuset_t cpus; if (cpu == DTRACE_CPUALL) cpus = all_cpus; else CPU_SETOF(cpu, &cpus); smp_rendezvous_cpus(cpus, smp_no_rendevous_barrier, func, smp_no_rendevous_barrier, arg); } static void dtrace_sync_func(void) { } void dtrace_sync(void) { dtrace_xcall(DTRACE_CPUALL, (dtrace_xcall_t)dtrace_sync_func, NULL); } /* * DTrace needs a high resolution time function which can * be called from a probe context and is guaranteed not to have been * instrumented with probes itself. * * Returns nanoseconds since boot. */ uint64_t dtrace_gethrtime() { struct timespec curtime; nanouptime(&curtime); return (curtime.tv_sec * 1000000000UL + curtime.tv_nsec); } uint64_t dtrace_gethrestime(void) { struct timespec current_time; dtrace_getnanotime(&current_time); return (current_time.tv_sec * 1000000000UL + current_time.tv_nsec); } /* Function to handle DTrace traps during probes. See riscv/riscv/trap.c */ int dtrace_trap(struct trapframe *frame, u_int type) { /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. * * Check if DTrace has enabled 'no-fault' mode: * */ if ((cpu_core[curcpu].cpuc_dtrace_flags & CPU_DTRACE_NOFAULT) != 0) { /* * There are only a couple of trap types that are expected. * All the rest will be handled in the usual way.
*/ switch (type) { - case EXCP_LOAD_ACCESS_FAULT: - case EXCP_STORE_ACCESS_FAULT: - case EXCP_INSTR_ACCESS_FAULT: + case EXCP_FAULT_LOAD: + case EXCP_FAULT_STORE: + case EXCP_FAULT_FETCH: /* Flag a bad address. */ cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_BADADDR; cpu_core[curcpu].cpuc_dtrace_illval = 0; /* * Offset the instruction pointer to the instruction * following the one causing the fault. */ frame->tf_sepc += 4; return (1); default: /* Handle all other traps in the usual way. */ break; } } /* Handle the trap in the usual way. */ return (0); } void dtrace_probe_error(dtrace_state_t *state, dtrace_epid_t epid, int which, int fault, int fltoffs, uintptr_t illval) { dtrace_probe(dtrace_probeid_error, (uint64_t)(uintptr_t)state, (uintptr_t)epid, (uintptr_t)which, (uintptr_t)fault, (uintptr_t)fltoffs); } static int dtrace_invop_start(struct trapframe *frame) { int data, invop, reg, update_sp; register_t arg1, arg2; register_t *sp; uint32_t imm; InstFmt i; int offs; int tmp; invop = dtrace_invop(frame->tf_sepc, frame, frame->tf_sepc); if (invop == RISCV_INSN_RET) { frame->tf_sepc = frame->tf_ra; return (0); } if ((invop & SD_RA_SP_MASK) == SD_RA_SP) { i.word = invop; imm = i.SType.imm0_4 | (i.SType.imm5_11 << 5); sp = (register_t *)((uint8_t *)frame->tf_sp + imm); *sp = frame->tf_ra; frame->tf_sepc += INSN_SIZE; return (0); } return (-1); } void dtrace_invop_init(void) { dtrace_invop_jump_addr = dtrace_invop_start; } void dtrace_invop_uninit(void) { dtrace_invop_jump_addr = 0; } Index: user/alc/PQ_LAUNDRY/sys/conf/ldscript.riscv =================================================================== --- user/alc/PQ_LAUNDRY/sys/conf/ldscript.riscv (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/conf/ldscript.riscv (revision 303667) @@ -1,135 +1,135 @@ /* $FreeBSD$ */ OUTPUT_ARCH(riscv) ENTRY(_start) SEARCH_DIR(/usr/lib); SECTIONS { /* Read-only sections, merged into text segment: */ - . = kernbase + 0x100; + . = kernbase + 0x80000000 /* KERNENTRY */; .text : AT(ADDR(.text) - kernbase) { *(.text) *(.stub) /* .gnu.warning sections are handled specially by elf32.em. */ *(.gnu.warning) *(.gnu.linkonce.t*) } =0x9090 _etext = .; PROVIDE (etext = .); .fini : { *(.fini) } =0x9090 .rodata : { *(.rodata) *(.gnu.linkonce.r*) } .rodata1 : { *(.rodata1) } .interp : { *(.interp) } .hash : { *(.hash) } .dynsym : { *(.dynsym) } .dynstr : { *(.dynstr) } .gnu.version : { *(.gnu.version) } .gnu.version_d : { *(.gnu.version_d) } .gnu.version_r : { *(.gnu.version_r) } .rel.text : { *(.rel.text) *(.rel.gnu.linkonce.t*) } .rela.text : { *(.rela.text) *(.rela.gnu.linkonce.t*) } .rel.data : { *(.rel.data) *(.rel.gnu.linkonce.d*) } .rela.data : { *(.rela.data) *(.rela.gnu.linkonce.d*) } .rel.rodata : { *(.rel.rodata) *(.rel.gnu.linkonce.r*) } .rela.rodata : { *(.rela.rodata) *(.rela.gnu.linkonce.r*) } .rel.got : { *(.rel.got) } .rela.got : { *(.rela.got) } .rel.ctors : { *(.rel.ctors) } .rela.ctors : { *(.rela.ctors) } .rel.dtors : { *(.rel.dtors) } .rela.dtors : { *(.rela.dtors) } .rel.init : { *(.rel.init) } .rela.init : { *(.rela.init) } .rel.fini : { *(.rel.fini) } .rela.fini : { *(.rela.fini) } .rel.bss : { *(.rel.bss) } .rela.bss : { *(.rela.bss) } .rel.plt : { *(.rel.plt) } .rela.plt : { *(.rela.plt) } .init : { *(.init) } =0x9090 .plt : { *(.plt) } /* Adjust the address for the data segment. We want to adjust up to the same address within the page on the next page up. */ . = ALIGN(0x1000) + (. & (0x1000 - 1)) ; .data : { *(.data) *(.gnu.linkonce.d*) } .data1 : { *(.data1) } . 
= ALIGN(32 / 8); _start_ctors = .; PROVIDE (start_ctors = .); .ctors : { *(.ctors) } _stop_ctors = .; PROVIDE (stop_ctors = .); .dtors : { *(.dtors) } .got : { *(.got.plt) *(.got) } .dynamic : { *(.dynamic) } /* We want the small data sections together, so single-instruction offsets can access them all, and initialized data all before uninitialized, so we can shorten the on-disk segment size. */ . = ALIGN(8); .sdata : { *(.sdata) } _edata = .; PROVIDE (edata = .); __bss_start = .; .sbss : { *(.sbss) *(.scommon) } .bss : { *(.dynbss) *(.bss) *(COMMON) } . = ALIGN(8); _end = . ; PROVIDE (end = .); /* Stabs debugging sections. */ .stab 0 : { *(.stab) } .stabstr 0 : { *(.stabstr) } .stab.excl 0 : { *(.stab.excl) } .stab.exclstr 0 : { *(.stab.exclstr) } .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) } /* DWARF debug sections. Symbols in the DWARF debugging sections are relative to the beginning of the section so we begin them at 0. */ /* DWARF 1 */ .debug 0 : { *(.debug) } .line 0 : { *(.line) } /* GNU DWARF 1 extensions */ .debug_srcinfo 0 : { *(.debug_srcinfo) } .debug_sfnames 0 : { *(.debug_sfnames) } /* DWARF 1.1 and DWARF 2 */ .debug_aranges 0 : { *(.debug_aranges) } .debug_pubnames 0 : { *(.debug_pubnames) } /* DWARF 2 */ .debug_info 0 : { *(.debug_info) } .debug_abbrev 0 : { *(.debug_abbrev) } .debug_line 0 : { *(.debug_line) } .debug_frame 0 : { *(.debug_frame) } .debug_str 0 : { *(.debug_str) } .debug_loc 0 : { *(.debug_loc) } .debug_macinfo 0 : { *(.debug_macinfo) } /* SGI/MIPS DWARF 2 extensions */ .debug_weaknames 0 : { *(.debug_weaknames) } .debug_funcnames 0 : { *(.debug_funcnames) } .debug_typenames 0 : { *(.debug_typenames) } .debug_varnames 0 : { *(.debug_varnames) } /* These must appear regardless of . */ } Index: user/alc/PQ_LAUNDRY/sys/kern/kern_mutex.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/kern/kern_mutex.c (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/kern/kern_mutex.c (revision 303667) @@ -1,1080 +1,1082 @@ /*- * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Machine independent bits of mutex implementation. */ #include __FBSDID("$FreeBSD$"); #include "opt_adaptive_mutexes.h" #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES) #define ADAPTIVE_MUTEXES #endif #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , lock, failed); #endif /* * Return the mutex address when the lock cookie address is provided. * This functionality assumes that struct mtx* has a member named mtx_lock. */ #define mtxlock2mtx(c) (__containerof(c, struct mtx, mtx_lock)) /* * Internal utility macros. */ #define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED) #define mtx_destroyed(m) ((m)->mtx_lock == MTX_DESTROYED) #define mtx_owner(m) ((struct thread *)((m)->mtx_lock & ~MTX_FLAGMASK)) static void assert_mtx(const struct lock_object *lock, int what); #ifdef DDB static void db_show_mtx(const struct lock_object *lock); #endif static void lock_mtx(struct lock_object *lock, uintptr_t how); static void lock_spin(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_mtx(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_mtx(struct lock_object *lock); static uintptr_t unlock_spin(struct lock_object *lock); /* * Lock classes for sleep and spin mutexes.
*/ struct lock_class lock_class_mtx_sleep = { .lc_name = "sleep mutex", .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE, .lc_assert = assert_mtx, #ifdef DDB .lc_ddb_show = db_show_mtx, #endif .lc_lock = lock_mtx, .lc_unlock = unlock_mtx, #ifdef KDTRACE_HOOKS .lc_owner = owner_mtx, #endif }; struct lock_class lock_class_mtx_spin = { .lc_name = "spin mutex", .lc_flags = LC_SPINLOCK | LC_RECURSABLE, .lc_assert = assert_mtx, #ifdef DDB .lc_ddb_show = db_show_mtx, #endif .lc_lock = lock_spin, .lc_unlock = unlock_spin, #ifdef KDTRACE_HOOKS .lc_owner = owner_mtx, #endif }; #ifdef ADAPTIVE_MUTEXES static SYSCTL_NODE(_debug, OID_AUTO, mtx, CTLFLAG_RD, NULL, "mtx debugging"); static struct lock_delay_config mtx_delay = { .initial = 1000, .step = 500, .min = 100, .max = 5000, }; SYSCTL_INT(_debug_mtx, OID_AUTO, delay_initial, CTLFLAG_RW, &mtx_delay.initial, 0, ""); SYSCTL_INT(_debug_mtx, OID_AUTO, delay_step, CTLFLAG_RW, &mtx_delay.step, 0, ""); SYSCTL_INT(_debug_mtx, OID_AUTO, delay_min, CTLFLAG_RW, &mtx_delay.min, 0, ""); SYSCTL_INT(_debug_mtx, OID_AUTO, delay_max, CTLFLAG_RW, &mtx_delay.max, 0, ""); static void mtx_delay_sysinit(void *dummy) { mtx_delay.initial = mp_ncpus * 25; mtx_delay.step = (mp_ncpus * 25) / 2; mtx_delay.min = mp_ncpus * 5; mtx_delay.max = mp_ncpus * 25 * 10; } LOCK_DELAY_SYSINIT(mtx_delay_sysinit); #endif /* * System-wide mutexes */ struct mtx blocked_lock; struct mtx Giant; void assert_mtx(const struct lock_object *lock, int what) { mtx_assert((const struct mtx *)lock, what); } void lock_mtx(struct lock_object *lock, uintptr_t how) { mtx_lock((struct mtx *)lock); } void lock_spin(struct lock_object *lock, uintptr_t how) { panic("spin locks can only use msleep_spin"); } uintptr_t unlock_mtx(struct lock_object *lock) { struct mtx *m; m = (struct mtx *)lock; mtx_assert(m, MA_OWNED | MA_NOTRECURSED); mtx_unlock(m); return (0); } uintptr_t unlock_spin(struct lock_object *lock) { panic("spin locks can only use msleep_spin"); } #ifdef KDTRACE_HOOKS int owner_mtx(const struct lock_object *lock, struct thread **owner) { const struct mtx *m = (const struct mtx *)lock; *owner = mtx_owner(m); return (mtx_unowned(m) == 0); } #endif /* * Function versions of the inlined __mtx_* macros. These are used by * modules and can also be called from assembly language if needed. 
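 *
 * As an illustrative sketch only (the "exm" names are hypothetical and not
 * part of this file), a typical consumer initializes, acquires, releases
 * and destroys a sleep mutex as follows:
 *
 *	static struct mtx exm_mtx;
 *
 *	mtx_init(&exm_mtx, "exm", NULL, MTX_DEF);
 *	mtx_lock(&exm_mtx);
 *	(critical section)
 *	mtx_unlock(&exm_mtx);
 *	mtx_destroy(&exm_mtx);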
*/ void __mtx_lock_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; if (SCHEDULER_STOPPED()) return; m = mtxlock2mtx(c); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("mtx_lock() by idle thread %p on sleep mutex %s @ %s:%d", curthread, m->lock_object.lo_name, file, line)); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_lock() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep, ("mtx_lock() of spin mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); WITNESS_CHECKORDER(&m->lock_object, (opts & ~MTX_RECURSE) | LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); __mtx_lock(m, curthread, opts, file, line); LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file, line); WITNESS_LOCK(&m->lock_object, (opts & ~MTX_RECURSE) | LOP_EXCLUSIVE, file, line); TD_LOCKS_INC(curthread); } void __mtx_unlock_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; if (SCHEDULER_STOPPED()) return; m = mtxlock2mtx(c); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_unlock() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep, ("mtx_unlock() of spin mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file, line); mtx_assert(m, MA_OWNED); __mtx_unlock(m, curthread, opts, file, line); TD_LOCKS_DEC(curthread); } void __mtx_lock_spin_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; if (SCHEDULER_STOPPED()) return; m = mtxlock2mtx(c); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_lock_spin() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin, ("mtx_lock_spin() of sleep mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); if (mtx_owned(m)) KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 || (opts & MTX_RECURSE) != 0, ("mtx_lock_spin: recursed on non-recursive mutex %s @ %s:%d\n", m->lock_object.lo_name, file, line)); opts &= ~MTX_RECURSE; WITNESS_CHECKORDER(&m->lock_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); __mtx_lock_spin(m, curthread, opts, file, line); LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file, line); WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); } int __mtx_trylock_spin_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; if (SCHEDULER_STOPPED()) return (1); m = mtxlock2mtx(c); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_trylock_spin() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin, ("mtx_trylock_spin() of sleep mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); KASSERT((opts & MTX_RECURSE) == 0, ("mtx_trylock_spin: unsupp. 
opt MTX_RECURSE on mutex %s @ %s:%d\n", m->lock_object.lo_name, file, line)); if (__mtx_trylock_spin(m, curthread, opts, file, line)) { LOCK_LOG_TRY("LOCK", &m->lock_object, opts, 1, file, line); WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); return (1); } LOCK_LOG_TRY("LOCK", &m->lock_object, opts, 0, file, line); return (0); } void __mtx_unlock_spin_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; if (SCHEDULER_STOPPED()) return; m = mtxlock2mtx(c); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_unlock_spin() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin, ("mtx_unlock_spin() of sleep mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file, line); mtx_assert(m, MA_OWNED); __mtx_unlock_spin(m); } /* * The important part of mtx_trylock{,_flags}() * Tries to acquire lock `m.' If this function is called on a mutex that * is already owned, it will recursively acquire the lock. */ int _mtx_trylock_flags_(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif int rval; if (SCHEDULER_STOPPED()) return (1); m = mtxlock2mtx(c); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("mtx_trylock() by idle thread %p on sleep mutex %s @ %s:%d", curthread, m->lock_object.lo_name, file, line)); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_trylock() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep, ("mtx_trylock() of spin mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); if (mtx_owned(m) && ((m->lock_object.lo_flags & LO_RECURSABLE) != 0 || (opts & MTX_RECURSE) != 0)) { m->mtx_recurse++; atomic_set_ptr(&m->mtx_lock, MTX_RECURSED); rval = 1; } else rval = _mtx_obtain_lock(m, (uintptr_t)curthread); opts &= ~MTX_RECURSE; LOCK_LOG_TRY("LOCK", &m->lock_object, opts, rval, file, line); if (rval) { WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); TD_LOCKS_INC(curthread); if (m->mtx_recurse == 0) LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(adaptive__acquire, m, contested, waittime, file, line); } return (rval); } /* * __mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock. * * We call this if the lock is either contested (i.e. we need to go to * sleep waiting for it), or if we need to recurse on it. 
*/ void __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int opts, const char *file, int line) { struct mtx *m; struct turnstile *ts; uintptr_t v; #ifdef ADAPTIVE_MUTEXES volatile struct thread *owner; #endif #ifdef KTR int cont_logged = 0; #endif #ifdef LOCK_PROFILING int contested = 0; uint64_t waittime = 0; #endif #if defined(ADAPTIVE_MUTEXES) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif if (SCHEDULER_STOPPED()) return; -#if defined(ADAPTIVE_MUTEXES) || defined(KDTRACE_HOOKS) +#if defined(ADAPTIVE_MUTEXES) lock_delay_arg_init(&lda, &mtx_delay); +#elif defined(KDTRACE_HOOKS) + lock_delay_arg_init(&lda, NULL); #endif m = mtxlock2mtx(c); if (mtx_owned(m)) { KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 || (opts & MTX_RECURSE) != 0, ("_mtx_lock_sleep: recursed on non-recursive mutex %s @ %s:%d\n", m->lock_object.lo_name, file, line)); opts &= ~MTX_RECURSE; m->mtx_recurse++; atomic_set_ptr(&m->mtx_lock, MTX_RECURSED); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m); return; } opts &= ~MTX_RECURSE; #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR4(KTR_LOCK, "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d", m->lock_object.lo_name, (void *)m->mtx_lock, file, line); #ifdef KDTRACE_HOOKS all_time -= lockstat_nsecs(&m->lock_object); #endif for (;;) { if (m->mtx_lock == MTX_UNOWNED && _mtx_obtain_lock(m, tid)) break; #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef ADAPTIVE_MUTEXES /* * If the owner is running on another CPU, spin until the * owner stops running or the state of the lock changes. */ v = m->mtx_lock; if (v != MTX_UNOWNED) { owner = (struct thread *)(v & ~MTX_FLAGMASK); if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&m->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, m, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname((struct thread *)tid), "spinning", "lockname:\"%s\"", m->lock_object.lo_name); while (mtx_owner(m) == owner && TD_IS_RUNNING(owner)) lock_delay(&lda); KTR_STATE0(KTR_SCHED, "thread", sched_tdname((struct thread *)tid), "running"); continue; } } #endif ts = turnstile_trywait(&m->lock_object); v = m->mtx_lock; /* * Check if the lock has been released while spinning for * the turnstile chain lock. */ if (v == MTX_UNOWNED) { turnstile_cancel(ts); continue; } #ifdef ADAPTIVE_MUTEXES /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the turnstile * chain lock. If so, drop the turnstile lock and try * again. */ owner = (struct thread *)(v & ~MTX_FLAGMASK); if (TD_IS_RUNNING(owner)) { turnstile_cancel(ts); continue; } #endif /* * If the mutex isn't already contested and a failure occurs * setting the contested bit, the mutex was either released * or the state of the MTX_RECURSED bit changed. */ if ((v & MTX_CONTESTED) == 0 && !atomic_cmpset_ptr(&m->mtx_lock, v, v | MTX_CONTESTED)) { turnstile_cancel(ts); continue; } /* * We definitely must sleep for this lock. */ mtx_assert(m, MA_NOTOWNED); #ifdef KTR if (!cont_logged) { CTR6(KTR_CONTENTION, "contention: %p at %s:%d wants %s, taken by %s:%d", (void *)tid, file, line, m->lock_object.lo_name, WITNESS_FILE(&m->lock_object), WITNESS_LINE(&m->lock_object)); cont_logged = 1; } #endif /* * Block on the turnstile. 
*/ #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&m->lock_object); #endif turnstile_wait(ts, mtx_owner(m), TS_EXCLUSIVE_QUEUE); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&m->lock_object); sleep_cnt++; #endif } #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&m->lock_object); #endif #ifdef KTR if (cont_logged) { CTR4(KTR_CONTENTION, "contention end: %s acquired by %p at %s:%d", m->lock_object.lo_name, (void *)tid, file, line); } #endif LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(adaptive__acquire, m, contested, waittime, file, line); #ifdef KDTRACE_HOOKS if (sleep_time) LOCKSTAT_RECORD1(adaptive__block, m, sleep_time); /* * Only record the loops spinning and not sleeping. */ if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD1(adaptive__spin, m, all_time - sleep_time); #endif } static void _mtx_lock_spin_failed(struct mtx *m) { struct thread *td; td = mtx_owner(m); /* If the mutex is unlocked, try again. */ if (td == NULL) return; printf( "spin lock %p (%s) held by %p (tid %d) too long\n", m, m->lock_object.lo_name, td, td->td_tid); #ifdef WITNESS witness_display_spinlock(&m->lock_object, td, printf); #endif panic("spin lock held too long"); } #ifdef SMP /* * _mtx_lock_spin_cookie: the tougher part of acquiring an MTX_SPIN lock. * * This is only called if we need to actually spin for the lock. Recursion * is handled inline. */ void _mtx_lock_spin_cookie(volatile uintptr_t *c, uintptr_t tid, int opts, const char *file, int line) { struct mtx *m; int i = 0; #ifdef LOCK_PROFILING int contested = 0; uint64_t waittime = 0; #endif #ifdef KDTRACE_HOOKS int64_t spin_time = 0; #endif if (SCHEDULER_STOPPED()) return; m = mtxlock2mtx(c); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m); KTR_STATE1(KTR_SCHED, "thread", sched_tdname((struct thread *)tid), "spinning", "lockname:\"%s\"", m->lock_object.lo_name); #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime); #ifdef KDTRACE_HOOKS spin_time -= lockstat_nsecs(&m->lock_object); #endif for (;;) { if (m->mtx_lock == MTX_UNOWNED && _mtx_obtain_lock(m, tid)) break; /* Give interrupts a chance while we spin. */ spinlock_exit(); while (m->mtx_lock != MTX_UNOWNED) { if (i++ < 10000000) { cpu_spinwait(); continue; } if (i < 60000000 || kdb_active || panicstr != NULL) DELAY(1); else _mtx_lock_spin_failed(m); cpu_spinwait(); } spinlock_enter(); } #ifdef KDTRACE_HOOKS spin_time += lockstat_nsecs(&m->lock_object); #endif if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m); KTR_STATE0(KTR_SCHED, "thread", sched_tdname((struct thread *)tid), "running"); #ifdef KDTRACE_HOOKS LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(spin__acquire, m, contested, waittime, file, line); if (spin_time != 0) LOCKSTAT_RECORD1(spin__spin, m, spin_time); #endif } #endif /* SMP */ void thread_lock_flags_(struct thread *td, int opts, const char *file, int line) { struct mtx *m; uintptr_t tid; int i; #ifdef LOCK_PROFILING int contested = 0; uint64_t waittime = 0; #endif #ifdef KDTRACE_HOOKS int64_t spin_time = 0; #endif i = 0; tid = (uintptr_t)curthread; if (SCHEDULER_STOPPED()) { /* * Ensure that spinlock sections are balanced even when the * scheduler is stopped, since we may otherwise inadvertently * re-enable interrupts while dumping core. 
*/ spinlock_enter(); return; } #ifdef KDTRACE_HOOKS spin_time -= lockstat_nsecs(&td->td_lock->lock_object); #endif for (;;) { retry: spinlock_enter(); m = td->td_lock; KASSERT(m->mtx_lock != MTX_DESTROYED, ("thread_lock() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin, ("thread_lock() of sleep mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); if (mtx_owned(m)) KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0, ("thread_lock: recursed on non-recursive mutex %s @ %s:%d\n", m->lock_object.lo_name, file, line)); WITNESS_CHECKORDER(&m->lock_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); for (;;) { if (m->mtx_lock == MTX_UNOWNED && _mtx_obtain_lock(m, tid)) break; if (m->mtx_lock == tid) { m->mtx_recurse++; break; } #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime); /* Give interrupts a chance while we spin. */ spinlock_exit(); while (m->mtx_lock != MTX_UNOWNED) { if (i++ < 10000000) cpu_spinwait(); else if (i < 60000000 || kdb_active || panicstr != NULL) DELAY(1); else _mtx_lock_spin_failed(m); cpu_spinwait(); if (m != td->td_lock) goto retry; } spinlock_enter(); } if (m == td->td_lock) break; __mtx_unlock_spin(m); /* does spinlock_exit() */ } #ifdef KDTRACE_HOOKS spin_time += lockstat_nsecs(&m->lock_object); #endif if (m->mtx_recurse == 0) LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(spin__acquire, m, contested, waittime, file, line); LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file, line); WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); #ifdef KDTRACE_HOOKS if (spin_time != 0) LOCKSTAT_RECORD1(thread__spin, m, spin_time); #endif } struct mtx * thread_lock_block(struct thread *td) { struct mtx *lock; THREAD_LOCK_ASSERT(td, MA_OWNED); lock = td->td_lock; td->td_lock = &blocked_lock; mtx_unlock_spin(lock); return (lock); } void thread_lock_unblock(struct thread *td, struct mtx *new) { mtx_assert(new, MA_OWNED); MPASS(td->td_lock == &blocked_lock); atomic_store_rel_ptr((volatile void *)&td->td_lock, (uintptr_t)new); } void thread_lock_set(struct thread *td, struct mtx *new) { struct mtx *lock; mtx_assert(new, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); lock = td->td_lock; td->td_lock = new; mtx_unlock_spin(lock); } /* * __mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock. * * We are only called here if the lock is recursed or contested (i.e. we * need to wake up a blocked thread). */ void __mtx_unlock_sleep(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; struct turnstile *ts; if (SCHEDULER_STOPPED()) return; m = mtxlock2mtx(c); if (mtx_recursed(m)) { if (--(m->mtx_recurse) == 0) atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m); return; } /* * We have to lock the chain before the turnstile so this turnstile * can be removed from the hash list if it is empty. */ turnstile_chain_lock(&m->lock_object); ts = turnstile_lookup(&m->lock_object); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m); MPASS(ts != NULL); turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE); _mtx_release_lock_quick(m); /* * This turnstile is no longer associated with the mutex. We can * unlock the chain lock so a new turnstile may take its place.
*/ turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); turnstile_chain_unlock(&m->lock_object); } /* * All the unlocking of MTX_SPIN locks is done inline. * See the __mtx_unlock_spin() macro for the details. */ /* * The backing function for the INVARIANTS-enabled mtx_assert() */ #ifdef INVARIANT_SUPPORT void __mtx_assert(const volatile uintptr_t *c, int what, const char *file, int line) { const struct mtx *m; if (panicstr != NULL || dumping) return; m = mtxlock2mtx(c); switch (what) { case MA_OWNED: case MA_OWNED | MA_RECURSED: case MA_OWNED | MA_NOTRECURSED: if (!mtx_owned(m)) panic("mutex %s not owned at %s:%d", m->lock_object.lo_name, file, line); if (mtx_recursed(m)) { if ((what & MA_NOTRECURSED) != 0) panic("mutex %s recursed at %s:%d", m->lock_object.lo_name, file, line); } else if ((what & MA_RECURSED) != 0) { panic("mutex %s unrecursed at %s:%d", m->lock_object.lo_name, file, line); } break; case MA_NOTOWNED: if (mtx_owned(m)) panic("mutex %s owned at %s:%d", m->lock_object.lo_name, file, line); break; default: panic("unknown mtx_assert at %s:%d", file, line); } } #endif /* * General init routine used by the MTX_SYSINIT() macro. */ void mtx_sysinit(void *arg) { struct mtx_args *margs = arg; mtx_init((struct mtx *)margs->ma_mtx, margs->ma_desc, NULL, margs->ma_opts); } /* * Mutex initialization routine; initialize lock `m' of type contained in * `opts' with options contained in `opts' and name `name.' The optional * lock type `type' is used as a general lock category name for use with * witness. */ void _mtx_init(volatile uintptr_t *c, const char *name, const char *type, int opts) { struct mtx *m; struct lock_class *class; int flags; m = mtxlock2mtx(c); MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE | MTX_NOWITNESS | MTX_DUPOK | MTX_NOPROFILE | MTX_NEW)) == 0); ASSERT_ATOMIC_LOAD_PTR(m->mtx_lock, ("%s: mtx_lock not aligned for %s: %p", __func__, name, &m->mtx_lock)); /* Determine lock class and lock flags. */ if (opts & MTX_SPIN) class = &lock_class_mtx_spin; else class = &lock_class_mtx_sleep; flags = 0; if (opts & MTX_QUIET) flags |= LO_QUIET; if (opts & MTX_RECURSE) flags |= LO_RECURSABLE; if ((opts & MTX_NOWITNESS) == 0) flags |= LO_WITNESS; if (opts & MTX_DUPOK) flags |= LO_DUPOK; if (opts & MTX_NOPROFILE) flags |= LO_NOPROFILE; if (opts & MTX_NEW) flags |= LO_NEW; /* Initialize mutex. */ lock_init(&m->lock_object, class, name, type, flags); m->mtx_lock = MTX_UNOWNED; m->mtx_recurse = 0; } /* * Remove lock `m' from all_mtx queue. We don't allow MTX_QUIET to be * passed in as a flag here because if the corresponding mtx_init() was * called with MTX_QUIET set, then it will already be set in the mutex's * flags. */ void _mtx_destroy(volatile uintptr_t *c) { struct mtx *m; m = mtxlock2mtx(c); if (!mtx_owned(m)) MPASS(mtx_unowned(m)); else { MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0); /* Perform the non-mtx related part of mtx_unlock_spin(). */ if (LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin) spinlock_exit(); else TD_LOCKS_DEC(curthread); lock_profile_release_lock(&m->lock_object); /* Tell witness this isn't locked to make it happy. */ WITNESS_UNLOCK(&m->lock_object, LOP_EXCLUSIVE, __FILE__, __LINE__); } m->mtx_lock = MTX_DESTROYED; lock_destroy(&m->lock_object); } /* * Initialize the mutex code and system mutexes. This is called from the MD * startup code prior to mi_startup(). The per-CPU data space needs to be * set up before this is called. */ void mutex_init(void) { /* Set up turnstiles so that sleep mutexes work. */ init_turnstiles(); /* * Initialize mutexes.
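mtx_sysinit() above is the target of the MTX_SYSINIT() convenience macro, which registers the initialization to run during boot; a minimal sketch (lock and name hypothetical):

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>

static struct mtx example_mtx;
MTX_SYSINIT(example_mtx, &example_mtx, "example lock", MTX_DEF);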
*/ mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE); mtx_init(&blocked_lock, "blocked lock", NULL, MTX_SPIN); blocked_lock.mtx_lock = 0xdeadc0de; /* Always blocked. */ mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); mtx_init(&proc0.p_slock, "process slock", NULL, MTX_SPIN); mtx_init(&proc0.p_statmtx, "pstatl", NULL, MTX_SPIN); mtx_init(&proc0.p_itimmtx, "pitiml", NULL, MTX_SPIN); mtx_init(&proc0.p_profmtx, "pprofl", NULL, MTX_SPIN); mtx_init(&devmtx, "cdev", NULL, MTX_DEF); mtx_lock(&Giant); } #ifdef DDB void db_show_mtx(const struct lock_object *lock) { struct thread *td; const struct mtx *m; m = (const struct mtx *)lock; db_printf(" flags: {"); if (LOCK_CLASS(lock) == &lock_class_mtx_spin) db_printf("SPIN"); else db_printf("DEF"); if (m->lock_object.lo_flags & LO_RECURSABLE) db_printf(", RECURSE"); if (m->lock_object.lo_flags & LO_DUPOK) db_printf(", DUPOK"); db_printf("}\n"); db_printf(" state: {"); if (mtx_unowned(m)) db_printf("UNOWNED"); else if (mtx_destroyed(m)) db_printf("DESTROYED"); else { db_printf("OWNED"); if (m->mtx_lock & MTX_CONTESTED) db_printf(", CONTESTED"); if (m->mtx_lock & MTX_RECURSED) db_printf(", RECURSED"); } db_printf("}\n"); if (!mtx_unowned(m) && !mtx_destroyed(m)) { td = mtx_owner(m); db_printf(" owner: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); if (mtx_recursed(m)) db_printf(" recursed: %d\n", m->mtx_recurse); } } #endif Index: user/alc/PQ_LAUNDRY/sys/kern/kern_rwlock.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/kern/kern_rwlock.c (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/kern/kern_rwlock.c (revision 303667) @@ -1,1307 +1,1311 @@ /*- * Copyright (c) 2006 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Machine independent bits of reader/writer lock implementation. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_no_adaptive_rwlocks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) && !defined(NO_ADAPTIVE_RWLOCKS) #define ADAPTIVE_RWLOCKS #endif #ifdef HWPMC_HOOKS #include PMC_SOFT_DECLARE( , , lock, failed); #endif /* * Return the rwlock address when the lock cookie address is provided. * This functionality assumes that struct rwlock* has a member named rw_lock. */ #define rwlock2rw(c) (__containerof(c, struct rwlock, rw_lock)) #ifdef DDB #include static void db_show_rwlock(const struct lock_object *lock); #endif static void assert_rw(const struct lock_object *lock, int what); static void lock_rw(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_rw(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_rw(struct lock_object *lock); struct lock_class lock_class_rw = { .lc_name = "rw", .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_UPGRADABLE, .lc_assert = assert_rw, #ifdef DDB .lc_ddb_show = db_show_rwlock, #endif .lc_lock = lock_rw, .lc_unlock = unlock_rw, #ifdef KDTRACE_HOOKS .lc_owner = owner_rw, #endif }; #ifdef ADAPTIVE_RWLOCKS static int rowner_retries = 10; static int rowner_loops = 10000; static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL, "rwlock debugging"); SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, ""); SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, ""); static struct lock_delay_config rw_delay = { .initial = 1000, .step = 500, .min = 100, .max = 5000, }; SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_initial, CTLFLAG_RW, &rw_delay.initial, 0, ""); SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_step, CTLFLAG_RW, &rw_delay.step, 0, ""); SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_min, CTLFLAG_RW, &rw_delay.min, 0, ""); SYSCTL_INT(_debug_rwlock, OID_AUTO, delay_max, CTLFLAG_RW, &rw_delay.max, 0, ""); static void rw_delay_sysinit(void *dummy) { rw_delay.initial = mp_ncpus * 25; rw_delay.step = (mp_ncpus * 25) / 2; rw_delay.min = mp_ncpus * 5; rw_delay.max = mp_ncpus * 25 * 10; } LOCK_DELAY_SYSINIT(rw_delay_sysinit); #endif /* * Return a pointer to the owning thread if the lock is write-locked or * NULL if the lock is unlocked or read-locked. */ #define rw_wowner(rw) \ ((rw)->rw_lock & RW_LOCK_READ ? NULL : \ (struct thread *)RW_OWNER((rw)->rw_lock)) /* * Returns whether the write owner is recursed. Write ownership is not assured * here and should be previously checked. */ #define rw_recursed(rw) ((rw)->rw_recurse != 0) /* * Return true if curthread holds the lock. */ #define rw_wlocked(rw) (rw_wowner((rw)) == curthread) /* * Return a pointer to the owning thread for this lock who should receive * any priority lent by threads that block on this lock. Currently this * is identical to rw_wowner().
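A minimal consumer sketch (hypothetical names, not part of this diff) of the rw API this file implements, for orientation: many concurrent readers, or one exclusive writer.

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/rwlock.h>

static struct rwlock example_rw;
static int example_state;

static void
example_setup(void)
{

	rw_init(&example_rw, "example rw");
}

static int
example_read(void)
{
	int v;

	rw_rlock(&example_rw);		/* shared: readers run concurrently */
	v = example_state;
	rw_runlock(&example_rw);
	return (v);
}

static void
example_write(int v)
{

	rw_wlock(&example_rw);		/* exclusive: a single writer */
	example_state = v;
	rw_wunlock(&example_rw);
}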
*/ #define rw_owner(rw) rw_wowner(rw) #ifndef INVARIANTS #define __rw_assert(c, what, file, line) #endif void assert_rw(const struct lock_object *lock, int what) { rw_assert((const struct rwlock *)lock, what); } void lock_rw(struct lock_object *lock, uintptr_t how) { struct rwlock *rw; rw = (struct rwlock *)lock; if (how) rw_rlock(rw); else rw_wlock(rw); } uintptr_t unlock_rw(struct lock_object *lock) { struct rwlock *rw; rw = (struct rwlock *)lock; rw_assert(rw, RA_LOCKED | LA_NOTRECURSED); if (rw->rw_lock & RW_LOCK_READ) { rw_runlock(rw); return (1); } else { rw_wunlock(rw); return (0); } } #ifdef KDTRACE_HOOKS int owner_rw(const struct lock_object *lock, struct thread **owner) { const struct rwlock *rw = (const struct rwlock *)lock; uintptr_t x = rw->rw_lock; *owner = rw_wowner(rw); return ((x & RW_LOCK_READ) != 0 ? (RW_READERS(x) != 0) : (*owner != NULL)); } #endif void _rw_init_flags(volatile uintptr_t *c, const char *name, int opts) { struct rwlock *rw; int flags; rw = rwlock2rw(c); MPASS((opts & ~(RW_DUPOK | RW_NOPROFILE | RW_NOWITNESS | RW_QUIET | RW_RECURSE | RW_NEW)) == 0); ASSERT_ATOMIC_LOAD_PTR(rw->rw_lock, ("%s: rw_lock not aligned for %s: %p", __func__, name, &rw->rw_lock)); flags = LO_UPGRADABLE; if (opts & RW_DUPOK) flags |= LO_DUPOK; if (opts & RW_NOPROFILE) flags |= LO_NOPROFILE; if (!(opts & RW_NOWITNESS)) flags |= LO_WITNESS; if (opts & RW_RECURSE) flags |= LO_RECURSABLE; if (opts & RW_QUIET) flags |= LO_QUIET; if (opts & RW_NEW) flags |= LO_NEW; lock_init(&rw->lock_object, &lock_class_rw, name, NULL, flags); rw->rw_lock = RW_UNLOCKED; rw->rw_recurse = 0; } void _rw_destroy(volatile uintptr_t *c) { struct rwlock *rw; rw = rwlock2rw(c); KASSERT(rw->rw_lock == RW_UNLOCKED, ("rw lock %p not unlocked", rw)); KASSERT(rw->rw_recurse == 0, ("rw lock %p still recursed", rw)); rw->rw_lock = RW_DESTROYED; lock_destroy(&rw->lock_object); } void rw_sysinit(void *arg) { struct rw_args *args = arg; rw_init((struct rwlock *)args->ra_rw, args->ra_desc); } void rw_sysinit_flags(void *arg) { struct rw_args_flags *args = arg; rw_init_flags((struct rwlock *)args->ra_rw, args->ra_desc, args->ra_flags); } int _rw_wowned(const volatile uintptr_t *c) { return (rw_wowner(rwlock2rw(c)) == curthread); } void _rw_wlock_cookie(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; if (SCHEDULER_STOPPED()) return; rw = rwlock2rw(c); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rw_wlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_wlock() of destroyed rwlock @ %s:%d", file, line)); WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); __rw_wlock(rw, curthread, file, line); LOCK_LOG_LOCK("WLOCK", &rw->lock_object, 0, rw->rw_recurse, file, line); WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_INC(curthread); } int __rw_try_wlock(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; int rval; if (SCHEDULER_STOPPED()) return (1); rw = rwlock2rw(c); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rw_try_wlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_try_wlock() of destroyed rwlock @ %s:%d", file, line)); if (rw_wlocked(rw) && (rw->lock_object.lo_flags & LO_RECURSABLE) != 0) { rw->rw_recurse++; rval = 1; } else rval = atomic_cmpset_acq_ptr(&rw->rw_lock, RW_UNLOCKED, (uintptr_t)curthread); LOCK_LOG_TRY("WLOCK", 
&rw->lock_object, 0, rval, file, line); if (rval) { WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); if (!rw_recursed(rw)) LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, 0, 0, file, line, LOCKSTAT_WRITER); TD_LOCKS_INC(curthread); } return (rval); } void _rw_wunlock_cookie(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; if (SCHEDULER_STOPPED()) return; rw = rwlock2rw(c); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_wunlock() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(c, RA_WLOCKED, file, line); WITNESS_UNLOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("WUNLOCK", &rw->lock_object, 0, rw->rw_recurse, file, line); __rw_wunlock(rw, curthread, file, line); TD_LOCKS_DEC(curthread); } /* * Determines whether a new reader can acquire a lock. Succeeds if the * reader already owns a read lock and the lock is locked for read to * prevent deadlock from reader recursion. Also succeeds if the lock * is unlocked and has no writer waiters or spinners. Failing otherwise * prioritizes writers before readers. */ #define RW_CAN_READ(_rw) \ ((curthread->td_rw_rlocks && (_rw) & RW_LOCK_READ) || ((_rw) & \ (RW_LOCK_READ | RW_LOCK_WRITE_WAITERS | RW_LOCK_WRITE_SPINNER)) == \ RW_LOCK_READ) void __rw_rlock(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; struct turnstile *ts; #ifdef ADAPTIVE_RWLOCKS volatile struct thread *owner; int spintries = 0; int i; #endif #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif uintptr_t v; #if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS uintptr_t state; u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif if (SCHEDULER_STOPPED()) return; -#if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS) +#if defined(ADAPTIVE_RWLOCKS) lock_delay_arg_init(&lda, &rw_delay); +#elif defined(KDTRACE_HOOKS) + lock_delay_arg_init(&lda, NULL); #endif rw = rwlock2rw(c); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rw_rlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_rlock() of destroyed rwlock @ %s:%d", file, line)); KASSERT(rw_wowner(rw) != curthread, ("rw_rlock: wlock already held for %s @ %s:%d", rw->lock_object.lo_name, file, line)); WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER, file, line, NULL); #ifdef KDTRACE_HOOKS all_time -= lockstat_nsecs(&rw->lock_object); state = rw->rw_lock; #endif for (;;) { /* * Handle the easy case. If no other thread has a write * lock, then try to bump up the count of read locks. Note * that we have to preserve the current state of the * RW_LOCK_WRITE_WAITERS flag. If we fail to acquire a * read lock, then rw_lock must have changed, so restart * the loop. Note that this handles the case of a * completely unlocked rwlock since such a lock is encoded * as a read lock with no waiters. */ v = rw->rw_lock; if (RW_CAN_READ(v)) { /* * The RW_LOCK_READ_WAITERS flag should only be set * if the lock has been unlocked and write waiters * were present. 
*/ if (atomic_cmpset_acq_ptr(&rw->rw_lock, v, v + RW_ONE_READER)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeed %p -> %p", __func__, rw, (void *)v, (void *)(v + RW_ONE_READER)); break; } continue; } #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&rw->lock_object, &contested, &waittime); #ifdef ADAPTIVE_RWLOCKS /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. */ if ((v & RW_LOCK_READ) == 0) { owner = (struct thread *)RW_OWNER(v); if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, rw, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); while ((struct thread*)RW_OWNER(rw->rw_lock) == owner && TD_IS_RUNNING(owner)) lock_delay(&lda); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } } else if (spintries < rowner_retries) { spintries++; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); for (i = 0; i < rowner_loops; i++) { v = rw->rw_lock; if ((v & RW_LOCK_READ) == 0 || RW_CAN_READ(v)) break; cpu_spinwait(); } #ifdef KDTRACE_HOOKS lda.spin_cnt += rowner_loops - i; #endif KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); if (i != rowner_loops) continue; } #endif /* * Okay, now it's the hard case. Some other thread already * has a write lock or there are write waiters present, * acquire the turnstile lock so we can begin the process * of blocking. */ ts = turnstile_trywait(&rw->lock_object); /* * The lock might have been released while we spun, so * recheck its state and restart the loop if needed. */ v = rw->rw_lock; if (RW_CAN_READ(v)) { turnstile_cancel(ts); continue; } #ifdef ADAPTIVE_RWLOCKS /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the turnstile * chain lock. If so, drop the turnstile lock and try * again. */ if ((v & RW_LOCK_READ) == 0) { owner = (struct thread *)RW_OWNER(v); if (TD_IS_RUNNING(owner)) { turnstile_cancel(ts); continue; } } #endif /* * The lock is held in write mode or it already has waiters. */ MPASS(!RW_CAN_READ(v)); /* * If the RW_LOCK_READ_WAITERS flag is already set, then * we can go ahead and block. If it is not set then try * to set it. If we fail to set it drop the turnstile * lock and restart the loop. */ if (!(v & RW_LOCK_READ_WAITERS)) { if (!atomic_cmpset_ptr(&rw->rw_lock, v, v | RW_LOCK_READ_WAITERS)) { turnstile_cancel(ts); continue; } if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set read waiters flag", __func__, rw); } /* * We were unable to acquire the lock and the read waiters * flag is set, so we must block on the turnstile. 
*/ if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__, rw); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&rw->lock_object); #endif turnstile_wait(ts, rw_owner(rw), TS_SHARED_QUEUE); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&rw->lock_object); sleep_cnt++; #endif if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from turnstile", __func__, rw); } #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&rw->lock_object); if (sleep_time) LOCKSTAT_RECORD4(rw__block, rw, sleep_time, LOCKSTAT_READER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state)); /* Record only the loops spinning and not sleeping. */ if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(rw__spin, rw, all_time - sleep_time, LOCKSTAT_READER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state)); #endif /* * TODO: acquire "owner of record" here. Here be turnstile dragons * however. turnstiles don't like owners changing between calls to * turnstile_wait() currently. */ LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, contested, waittime, file, line, LOCKSTAT_READER); LOCK_LOG_LOCK("RLOCK", &rw->lock_object, 0, 0, file, line); WITNESS_LOCK(&rw->lock_object, 0, file, line); TD_LOCKS_INC(curthread); curthread->td_rw_rlocks++; } int __rw_try_rlock(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; uintptr_t x; if (SCHEDULER_STOPPED()) return (1); rw = rwlock2rw(c); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rw_try_rlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); for (;;) { x = rw->rw_lock; KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_try_rlock() of destroyed rwlock @ %s:%d", file, line)); if (!(x & RW_LOCK_READ)) break; if (atomic_cmpset_acq_ptr(&rw->rw_lock, x, x + RW_ONE_READER)) { LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 1, file, line); WITNESS_LOCK(&rw->lock_object, LOP_TRYLOCK, file, line); LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, 0, 0, file, line, LOCKSTAT_READER); TD_LOCKS_INC(curthread); curthread->td_rw_rlocks++; return (1); } } LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 0, file, line); return (0); } void _rw_runlock_cookie(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; struct turnstile *ts; uintptr_t x, v, queue; if (SCHEDULER_STOPPED()) return; rw = rwlock2rw(c); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_runlock() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(c, RA_RLOCKED, file, line); WITNESS_UNLOCK(&rw->lock_object, 0, file, line); LOCK_LOG_LOCK("RUNLOCK", &rw->lock_object, 0, 0, file, line); /* TODO: drop "owner of record" here. */ for (;;) { /* * See if there is more than one read lock held. If so, * just drop one and return. */ x = rw->rw_lock; if (RW_READERS(x) > 1) { if (atomic_cmpset_rel_ptr(&rw->rw_lock, x, x - RW_ONE_READER)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeeded %p -> %p", __func__, rw, (void *)x, (void *)(x - RW_ONE_READER)); break; } continue; } /* * If there aren't any waiters for a write lock, then try * to drop it quickly. */ if (!(x & RW_LOCK_WAITERS)) { MPASS((x & ~RW_LOCK_WRITE_SPINNER) == RW_READERS_LOCK(1)); if (atomic_cmpset_rel_ptr(&rw->rw_lock, x, RW_UNLOCKED)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p last succeeded", __func__, rw); break; } continue; } /* * Ok, we know we have waiters and we think we are the * last reader, so grab the turnstile lock. 
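One common use of __rw_try_rlock() above, sketched with hypothetical names: opportunistically take the read lock where blocking could cause a lock order reversal, and let the caller retry otherwise.

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/lock.h>
#include <sys/rwlock.h>

static int
example_peek(struct rwlock *rwp, int *statep, int *vp)
{

	if (!rw_try_rlock(rwp))
		return (EWOULDBLOCK);	/* caller drops its locks and retries */
	*vp = *statep;
	rw_runlock(rwp);
	return (0);
}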
*/ turnstile_chain_lock(&rw->lock_object); v = rw->rw_lock & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER); MPASS(v & RW_LOCK_WAITERS); /* * Try to drop our lock leaving the lock in an unlocked * state. * * If you wanted to do explicit lock handoff you'd have to * do it here. You'd also want to use turnstile_signal() * and you'd have to handle the race where a higher * priority thread blocks on the write lock before the * thread you wake up actually runs and have the new thread * "steal" the lock. For now it's a lot simpler to just * wake up all of the waiters. * * As above, if we fail, then another thread might have * acquired a read lock, so drop the turnstile lock and * restart. */ x = RW_UNLOCKED; if (v & RW_LOCK_WRITE_WAITERS) { queue = TS_EXCLUSIVE_QUEUE; x |= (v & RW_LOCK_READ_WAITERS); } else queue = TS_SHARED_QUEUE; if (!atomic_cmpset_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v, x)) { turnstile_chain_unlock(&rw->lock_object); continue; } if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p last succeeded with waiters", __func__, rw); /* * Ok. The lock is released and all that's left is to * wake up the waiters. Note that the lock might not be * free anymore, but in that case the writers will just * block again if they run before the new lock holder(s) * release the lock. */ ts = turnstile_lookup(&rw->lock_object); MPASS(ts != NULL); turnstile_broadcast(ts, queue); turnstile_unpend(ts, TS_SHARED_LOCK); turnstile_chain_unlock(&rw->lock_object); break; } LOCKSTAT_PROFILE_RELEASE_RWLOCK(rw__release, rw, LOCKSTAT_READER); TD_LOCKS_DEC(curthread); curthread->td_rw_rlocks--; } /* * This function is called when we are unable to obtain a write lock on the * first try. This means that at least one other thread holds either a * read or write lock. */ void __rw_wlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file, int line) { struct rwlock *rw; struct turnstile *ts; #ifdef ADAPTIVE_RWLOCKS volatile struct thread *owner; int spintries = 0; int i; #endif uintptr_t v, x; #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif #if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS uintptr_t state; u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif if (SCHEDULER_STOPPED()) return; -#if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS) +#if defined(ADAPTIVE_RWLOCKS) lock_delay_arg_init(&lda, &rw_delay); +#elif defined(KDTRACE_HOOKS) + lock_delay_arg_init(&lda, NULL); #endif rw = rwlock2rw(c); if (rw_wlocked(rw)) { KASSERT(rw->lock_object.lo_flags & LO_RECURSABLE, ("%s: recursing but non-recursive rw %s @ %s:%d\n", __func__, rw->lock_object.lo_name, file, line)); rw->rw_recurse++; if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p recursing", __func__, rw); return; } if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__, rw->lock_object.lo_name, (void *)rw->rw_lock, file, line); #ifdef KDTRACE_HOOKS all_time -= lockstat_nsecs(&rw->lock_object); state = rw->rw_lock; #endif for (;;) { if (rw->rw_lock == RW_UNLOCKED && _rw_write_lock(rw, tid)) break; #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&rw->lock_object, &contested, &waittime); #ifdef ADAPTIVE_RWLOCKS /* * If the lock is write locked and the owner is * running on another CPU, spin until the owner stops * running or the state of the lock changes.
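* The spin is tempered by lock_delay(), which applies the rw_delay backoff configured earlier in this file; once the owner is seen descheduled, further spinning would be wasted work, so the loop exits and control falls through to the turnstile path below.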
*/ v = rw->rw_lock; owner = (struct thread *)RW_OWNER(v); if (!(v & RW_LOCK_READ) && TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, rw, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); while ((struct thread*)RW_OWNER(rw->rw_lock) == owner && TD_IS_RUNNING(owner)) lock_delay(&lda); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } if ((v & RW_LOCK_READ) && RW_READERS(v) && spintries < rowner_retries) { if (!(v & RW_LOCK_WRITE_SPINNER)) { if (!atomic_cmpset_ptr(&rw->rw_lock, v, v | RW_LOCK_WRITE_SPINNER)) { continue; } } spintries++; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); for (i = 0; i < rowner_loops; i++) { if ((rw->rw_lock & RW_LOCK_WRITE_SPINNER) == 0) break; cpu_spinwait(); } KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); #ifdef KDTRACE_HOOKS lda.spin_cnt += rowner_loops - i; #endif if (i != rowner_loops) continue; } #endif ts = turnstile_trywait(&rw->lock_object); v = rw->rw_lock; #ifdef ADAPTIVE_RWLOCKS /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the turnstile * chain lock. If so, drop the turnstile lock and try * again. */ if (!(v & RW_LOCK_READ)) { owner = (struct thread *)RW_OWNER(v); if (TD_IS_RUNNING(owner)) { turnstile_cancel(ts); continue; } } #endif /* * Check the waiters flags for this rwlock. * If the lock was released without leaving any pending * waiters queue, simply try to acquire it. * If a pending waiters queue is present, claim the lock * ownership and preserve the pending queue. */ x = v & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER); if ((v & ~x) == RW_UNLOCKED) { x &= ~RW_LOCK_WRITE_SPINNER; if (atomic_cmpset_acq_ptr(&rw->rw_lock, v, tid | x)) { if (x) turnstile_claim(ts); else turnstile_cancel(ts); break; } turnstile_cancel(ts); continue; } /* * If the RW_LOCK_WRITE_WAITERS flag isn't set, then try to * set it. If we fail to set it, then loop back and try * again. */ if (!(v & RW_LOCK_WRITE_WAITERS)) { if (!atomic_cmpset_ptr(&rw->rw_lock, v, v | RW_LOCK_WRITE_WAITERS)) { turnstile_cancel(ts); continue; } if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set write waiters flag", __func__, rw); } /* * We were unable to acquire the lock and the write waiters * flag is set, so we must block on the turnstile. */ if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__, rw); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&rw->lock_object); #endif turnstile_wait(ts, rw_owner(rw), TS_EXCLUSIVE_QUEUE); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&rw->lock_object); sleep_cnt++; #endif if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from turnstile", __func__, rw); #ifdef ADAPTIVE_RWLOCKS spintries = 0; #endif } #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&rw->lock_object); if (sleep_time) LOCKSTAT_RECORD4(rw__block, rw, sleep_time, LOCKSTAT_WRITER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state)); /* Record only the loops spinning and not sleeping. */ if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(rw__spin, rw, all_time - sleep_time, LOCKSTAT_WRITER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ?
0 : RW_READERS(state)); #endif LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, contested, waittime, file, line, LOCKSTAT_WRITER); } /* * This function is called if the first try at releasing a write lock failed. * This means that one of the 2 waiter bits must be set indicating that at * least one thread is waiting on this lock. */ void __rw_wunlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file, int line) { struct rwlock *rw; struct turnstile *ts; uintptr_t v; int queue; if (SCHEDULER_STOPPED()) return; rw = rwlock2rw(c); if (rw_wlocked(rw) && rw_recursed(rw)) { rw->rw_recurse--; if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, rw); return; } KASSERT(rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS), ("%s: neither of the waiter flags are set", __func__)); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p contested", __func__, rw); turnstile_chain_lock(&rw->lock_object); ts = turnstile_lookup(&rw->lock_object); MPASS(ts != NULL); /* * Use the same algo as sx locks for now. Prefer waking up shared * waiters if we have any over writers. This is probably not ideal. * * 'v' is the value we are going to write back to rw_lock. If we * have waiters on both queues, we need to preserve the state of * the waiter flag for the queue we don't wake up. For now this is * hardcoded for the algorithm mentioned above. * * In the case of both readers and writers waiting we wakeup the * readers but leave the RW_LOCK_WRITE_WAITERS flag set. If a * new writer comes in before a reader it will claim the lock up * above. There is probably a potential priority inversion in * there that could be worked around either by waking both queues * of waiters or doing some complicated lock handoff gymnastics. */ v = RW_UNLOCKED; if (rw->rw_lock & RW_LOCK_WRITE_WAITERS) { queue = TS_EXCLUSIVE_QUEUE; v |= (rw->rw_lock & RW_LOCK_READ_WAITERS); } else queue = TS_SHARED_QUEUE; /* Wake up all waiters for the specific queue. */ if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: %p waking up %s waiters", __func__, rw, queue == TS_SHARED_QUEUE ? "read" : "write"); turnstile_broadcast(ts, queue); atomic_store_rel_ptr(&rw->rw_lock, v); turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); turnstile_chain_unlock(&rw->lock_object); } /* * Attempt to do a non-blocking upgrade from a read lock to a write * lock. This will only succeed if this thread holds a single read * lock. Returns true if the upgrade succeeded and false otherwise. */ int __rw_try_upgrade(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; uintptr_t v, x, tid; struct turnstile *ts; int success; if (SCHEDULER_STOPPED()) return (1); rw = rwlock2rw(c); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_try_upgrade() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(c, RA_RLOCKED, file, line); /* * Attempt to switch from one reader to a writer. If there * are any write waiters, then we will have to lock the * turnstile first to prevent races with another writer * calling turnstile_wait() before we have claimed this * turnstile. So, do the simple case of no waiters first. */ tid = (uintptr_t)curthread; success = 0; for (;;) { v = rw->rw_lock; if (RW_READERS(v) > 1) break; if (!(v & RW_LOCK_WAITERS)) { success = atomic_cmpset_ptr(&rw->rw_lock, v, tid); if (!success) continue; break; } /* * Ok, we think we have waiters, so lock the turnstile. 
*/ ts = turnstile_trywait(&rw->lock_object); v = rw->rw_lock; if (RW_READERS(v) > 1) { turnstile_cancel(ts); break; } /* * Try to switch from one reader to a writer again. This time * we honor the current state of the waiters flags. * If we obtain the lock with the flags set, then claim * ownership of the turnstile. */ x = rw->rw_lock & RW_LOCK_WAITERS; success = atomic_cmpset_ptr(&rw->rw_lock, v, tid | x); if (success) { if (x) turnstile_claim(ts); else turnstile_cancel(ts); break; } turnstile_cancel(ts); } LOCK_LOG_TRY("WUPGRADE", &rw->lock_object, 0, success, file, line); if (success) { curthread->td_rw_rlocks--; WITNESS_UPGRADE(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); LOCKSTAT_RECORD0(rw__upgrade, rw); } return (success); } /* * Downgrade a write lock into a single read lock. */ void __rw_downgrade(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; struct turnstile *ts; uintptr_t tid, v; int rwait, wwait; if (SCHEDULER_STOPPED()) return; rw = rwlock2rw(c); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_downgrade() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(c, RA_WLOCKED | RA_NOTRECURSED, file, line); #ifndef INVARIANTS if (rw_recursed(rw)) panic("downgrade of a recursed lock"); #endif WITNESS_DOWNGRADE(&rw->lock_object, 0, file, line); /* * Convert from a writer to a single reader. First we handle * the easy case with no waiters. If there are any waiters, we * lock the turnstile and "disown" the lock. */ tid = (uintptr_t)curthread; if (atomic_cmpset_rel_ptr(&rw->rw_lock, tid, RW_READERS_LOCK(1))) goto out; /* * Ok, we think we have waiters, so lock the turnstile so we can * read the waiter flags without any races. */ turnstile_chain_lock(&rw->lock_object); v = rw->rw_lock & RW_LOCK_WAITERS; rwait = v & RW_LOCK_READ_WAITERS; wwait = v & RW_LOCK_WRITE_WAITERS; MPASS(rwait | wwait); /* * Downgrade from a write lock while preserving waiters flag * and give up ownership of the turnstile. */ ts = turnstile_lookup(&rw->lock_object); MPASS(ts != NULL); if (!wwait) v &= ~RW_LOCK_READ_WAITERS; atomic_store_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v); /* * Wake other readers if there are no writers pending. Otherwise they * won't be able to acquire the lock anyway. */ if (rwait && !wwait) { turnstile_broadcast(ts, TS_SHARED_QUEUE); turnstile_unpend(ts, TS_EXCLUSIVE_LOCK); } else turnstile_disown(ts); turnstile_chain_unlock(&rw->lock_object); out: curthread->td_rw_rlocks++; LOCK_LOG_LOCK("WDOWNGRADE", &rw->lock_object, 0, 0, file, line); LOCKSTAT_RECORD0(rw__downgrade, rw); } #ifdef INVARIANT_SUPPORT #ifndef INVARIANTS #undef __rw_assert #endif /* * In the non-WITNESS case, rw_assert() can only detect that at least * *some* thread owns an rlock, but it cannot guarantee that *this* * thread owns an rlock. */ void __rw_assert(const volatile uintptr_t *c, int what, const char *file, int line) { const struct rwlock *rw; if (panicstr != NULL) return; rw = rwlock2rw(c); switch (what) { case RA_LOCKED: case RA_LOCKED | RA_RECURSED: case RA_LOCKED | RA_NOTRECURSED: case RA_RLOCKED: case RA_RLOCKED | RA_RECURSED: case RA_RLOCKED | RA_NOTRECURSED: #ifdef WITNESS witness_assert(&rw->lock_object, what, file, line); #else /* * If some other thread has a write lock or we have one * and are asserting a read lock, fail. Also, if no one * has a lock at all, fail. 
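Sketch (hypothetical names) of the upgrade/downgrade dance the two functions above enable: look up under the shared lock, upgrade only when a modification turns out to be needed, and fall back to a full exclusive acquire if the upgrade loses a race.

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/rwlock.h>

static void
example_update(struct rwlock *rwp, int *statep, int v)
{

	rw_rlock(rwp);
	if (*statep != v) {
		if (!rw_try_upgrade(rwp)) {
			/* Lost the race: drop and retake exclusively. */
			rw_runlock(rwp);
			rw_wlock(rwp);
		}
		*statep = v;
		rw_downgrade(rwp);	/* back to shared for the final read */
	}
	rw_runlock(rwp);
}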
*/ if (rw->rw_lock == RW_UNLOCKED || (!(rw->rw_lock & RW_LOCK_READ) && (what & RA_RLOCKED || rw_wowner(rw) != curthread))) panic("Lock %s not %slocked @ %s:%d\n", rw->lock_object.lo_name, (what & RA_RLOCKED) ? "read " : "", file, line); if (!(rw->rw_lock & RW_LOCK_READ) && !(what & RA_RLOCKED)) { if (rw_recursed(rw)) { if (what & RA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); } else if (what & RA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); } #endif break; case RA_WLOCKED: case RA_WLOCKED | RA_RECURSED: case RA_WLOCKED | RA_NOTRECURSED: if (rw_wowner(rw) != curthread) panic("Lock %s not exclusively locked @ %s:%d\n", rw->lock_object.lo_name, file, line); if (rw_recursed(rw)) { if (what & RA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); } else if (what & RA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); break; case RA_UNLOCKED: #ifdef WITNESS witness_assert(&rw->lock_object, what, file, line); #else /* * If we hold a write lock fail. We can't reliably check * to see if we hold a read lock or not. */ if (rw_wowner(rw) == curthread) panic("Lock %s exclusively locked @ %s:%d\n", rw->lock_object.lo_name, file, line); #endif break; default: panic("Unknown rw lock assertion: %d @ %s:%d", what, file, line); } } #endif /* INVARIANT_SUPPORT */ #ifdef DDB void db_show_rwlock(const struct lock_object *lock) { const struct rwlock *rw; struct thread *td; rw = (const struct rwlock *)lock; db_printf(" state: "); if (rw->rw_lock == RW_UNLOCKED) db_printf("UNLOCKED\n"); else if (rw->rw_lock == RW_DESTROYED) { db_printf("DESTROYED\n"); return; } else if (rw->rw_lock & RW_LOCK_READ) db_printf("RLOCK: %ju locks\n", (uintmax_t)(RW_READERS(rw->rw_lock))); else { td = rw_wowner(rw); db_printf("WLOCK: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); if (rw_recursed(rw)) db_printf(" recursed: %u\n", rw->rw_recurse); } db_printf(" waiters: "); switch (rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)) { case RW_LOCK_READ_WAITERS: db_printf("readers\n"); break; case RW_LOCK_WRITE_WAITERS: db_printf("writers\n"); break; case RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS: db_printf("readers and writers\n"); break; default: db_printf("none\n"); break; } } #endif Index: user/alc/PQ_LAUNDRY/sys/kern/kern_sx.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/kern/kern_sx.c (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/kern/kern_sx.c (revision 303667) @@ -1,1289 +1,1293 @@ /*- * Copyright (c) 2007 Attilio Rao * Copyright (c) 2001 Jason Evans * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ /* * Shared/exclusive locks. This implementation attempts to ensure * deterministic lock granting behavior, so that slocks and xlocks are * interleaved. * * Priority propagation will not generally raise the priority of lock holders, * so should not be relied upon in combination with sx locks. */ #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_no_adaptive_sx.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) && !defined(NO_ADAPTIVE_SX) #include #endif #ifdef DDB #include #endif #if defined(SMP) && !defined(NO_ADAPTIVE_SX) #define ADAPTIVE_SX #endif CTASSERT((SX_NOADAPTIVE & LO_CLASSFLAGS) == SX_NOADAPTIVE); #ifdef HWPMC_HOOKS #include PMC_SOFT_DECLARE( , , lock, failed); #endif /* Handy macros for sleep queues. */ #define SQ_EXCLUSIVE_QUEUE 0 #define SQ_SHARED_QUEUE 1 /* * Variations on DROP_GIANT()/PICKUP_GIANT() for use in this file. We * drop Giant anytime we have to sleep or if we adaptively spin. */ #define GIANT_DECLARE \ int _giantcnt = 0; \ WITNESS_SAVE_DECL(Giant) \ #define GIANT_SAVE() do { \ if (mtx_owned(&Giant)) { \ WITNESS_SAVE(&Giant.lock_object, Giant); \ while (mtx_owned(&Giant)) { \ _giantcnt++; \ mtx_unlock(&Giant); \ } \ } \ } while (0) #define GIANT_RESTORE() do { \ if (_giantcnt > 0) { \ mtx_assert(&Giant, MA_NOTOWNED); \ while (_giantcnt--) \ mtx_lock(&Giant); \ WITNESS_RESTORE(&Giant.lock_object, Giant); \ } \ } while (0) /* * Returns true if an exclusive lock is recursed. It assumes * curthread currently has an exclusive lock. 
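A minimal consumer sketch (hypothetical names, not part of this diff) for the sx primitives below; the point of an sx lock is that, unlike a mutex, it may be held across a sleep such as an M_WAITOK allocation.

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/sx.h>

static MALLOC_DEFINE(M_EXAMPLE, "example", "example allocations");

static struct sx example_sx;
SX_SYSINIT(example_sx, &example_sx, "example sx");

static void *
example_alloc(size_t len)
{
	void *p;

	sx_xlock(&example_sx);
	p = malloc(len, M_EXAMPLE, M_WAITOK);	/* sleeping here is legal */
	sx_xunlock(&example_sx);
	return (p);
}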
*/ #define sx_recursed(sx) ((sx)->sx_recurse != 0) static void assert_sx(const struct lock_object *lock, int what); #ifdef DDB static void db_show_sx(const struct lock_object *lock); #endif static void lock_sx(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_sx(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_sx(struct lock_object *lock); struct lock_class lock_class_sx = { .lc_name = "sx", .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE, .lc_assert = assert_sx, #ifdef DDB .lc_ddb_show = db_show_sx, #endif .lc_lock = lock_sx, .lc_unlock = unlock_sx, #ifdef KDTRACE_HOOKS .lc_owner = owner_sx, #endif }; #ifndef INVARIANTS #define _sx_assert(sx, what, file, line) #endif #ifdef ADAPTIVE_SX static u_int asx_retries = 10; static u_int asx_loops = 10000; static SYSCTL_NODE(_debug, OID_AUTO, sx, CTLFLAG_RD, NULL, "sxlock debugging"); SYSCTL_UINT(_debug_sx, OID_AUTO, retries, CTLFLAG_RW, &asx_retries, 0, ""); SYSCTL_UINT(_debug_sx, OID_AUTO, loops, CTLFLAG_RW, &asx_loops, 0, ""); static struct lock_delay_config sx_delay = { .initial = 1000, .step = 500, .min = 100, .max = 5000, }; SYSCTL_INT(_debug_sx, OID_AUTO, delay_initial, CTLFLAG_RW, &sx_delay.initial, 0, ""); SYSCTL_INT(_debug_sx, OID_AUTO, delay_step, CTLFLAG_RW, &sx_delay.step, 0, ""); SYSCTL_INT(_debug_sx, OID_AUTO, delay_min, CTLFLAG_RW, &sx_delay.min, 0, ""); SYSCTL_INT(_debug_sx, OID_AUTO, delay_max, CTLFLAG_RW, &sx_delay.max, 0, ""); static void sx_delay_sysinit(void *dummy) { sx_delay.initial = mp_ncpus * 25; sx_delay.step = (mp_ncpus * 25) / 2; sx_delay.min = mp_ncpus * 5; sx_delay.max = mp_ncpus * 25 * 10; } LOCK_DELAY_SYSINIT(sx_delay_sysinit); #endif void assert_sx(const struct lock_object *lock, int what) { sx_assert((const struct sx *)lock, what); } void lock_sx(struct lock_object *lock, uintptr_t how) { struct sx *sx; sx = (struct sx *)lock; if (how) sx_slock(sx); else sx_xlock(sx); } uintptr_t unlock_sx(struct lock_object *lock) { struct sx *sx; sx = (struct sx *)lock; sx_assert(sx, SA_LOCKED | SA_NOTRECURSED); if (sx_xlocked(sx)) { sx_xunlock(sx); return (0); } else { sx_sunlock(sx); return (1); } } #ifdef KDTRACE_HOOKS int owner_sx(const struct lock_object *lock, struct thread **owner) { const struct sx *sx = (const struct sx *)lock; uintptr_t x = sx->sx_lock; *owner = (struct thread *)SX_OWNER(x); return ((x & SX_LOCK_SHARED) != 0 ? 
(SX_SHARERS(x) != 0) : (*owner != NULL)); } #endif void sx_sysinit(void *arg) { struct sx_args *sargs = arg; sx_init_flags(sargs->sa_sx, sargs->sa_desc, sargs->sa_flags); } void sx_init_flags(struct sx *sx, const char *description, int opts) { int flags; MPASS((opts & ~(SX_QUIET | SX_RECURSE | SX_NOWITNESS | SX_DUPOK | SX_NOPROFILE | SX_NOADAPTIVE | SX_NEW)) == 0); ASSERT_ATOMIC_LOAD_PTR(sx->sx_lock, ("%s: sx_lock not aligned for %s: %p", __func__, description, &sx->sx_lock)); flags = LO_SLEEPABLE | LO_UPGRADABLE; if (opts & SX_DUPOK) flags |= LO_DUPOK; if (opts & SX_NOPROFILE) flags |= LO_NOPROFILE; if (!(opts & SX_NOWITNESS)) flags |= LO_WITNESS; if (opts & SX_RECURSE) flags |= LO_RECURSABLE; if (opts & SX_QUIET) flags |= LO_QUIET; if (opts & SX_NEW) flags |= LO_NEW; flags |= opts & SX_NOADAPTIVE; lock_init(&sx->lock_object, &lock_class_sx, description, NULL, flags); sx->sx_lock = SX_LOCK_UNLOCKED; sx->sx_recurse = 0; } void sx_destroy(struct sx *sx) { KASSERT(sx->sx_lock == SX_LOCK_UNLOCKED, ("sx lock still held")); KASSERT(sx->sx_recurse == 0, ("sx lock still recursed")); sx->sx_lock = SX_LOCK_DESTROYED; lock_destroy(&sx->lock_object); } int _sx_slock(struct sx *sx, int opts, const char *file, int line) { int error = 0; if (SCHEDULER_STOPPED()) return (0); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("sx_slock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_slock() of destroyed sx @ %s:%d", file, line)); WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER, file, line, NULL); error = __sx_slock(sx, opts, file, line); if (!error) { LOCK_LOG_LOCK("SLOCK", &sx->lock_object, 0, 0, file, line); WITNESS_LOCK(&sx->lock_object, 0, file, line); TD_LOCKS_INC(curthread); } return (error); } int sx_try_slock_(struct sx *sx, const char *file, int line) { uintptr_t x; if (SCHEDULER_STOPPED()) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("sx_try_slock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); for (;;) { x = sx->sx_lock; KASSERT(x != SX_LOCK_DESTROYED, ("sx_try_slock() of destroyed sx @ %s:%d", file, line)); if (!(x & SX_LOCK_SHARED)) break; if (atomic_cmpset_acq_ptr(&sx->sx_lock, x, x + SX_ONE_SHARER)) { LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 1, file, line); WITNESS_LOCK(&sx->lock_object, LOP_TRYLOCK, file, line); LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, 0, 0, file, line, LOCKSTAT_READER); TD_LOCKS_INC(curthread); return (1); } } LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 0, file, line); return (0); } int _sx_xlock(struct sx *sx, int opts, const char *file, int line) { int error = 0; if (SCHEDULER_STOPPED()) return (0); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("sx_xlock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_xlock() of destroyed sx @ %s:%d", file, line)); WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); error = __sx_xlock(sx, curthread, opts, file, line); if (!error) { LOCK_LOG_LOCK("XLOCK", &sx->lock_object, 0, sx->sx_recurse, file, line); WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_INC(curthread); } return (error); } int sx_try_xlock_(struct sx *sx, const char *file, int line) { int rval; if (SCHEDULER_STOPPED()) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("sx_try_xlock() by idle thread %p on sx %s @ %s:%d", 
curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_try_xlock() of destroyed sx @ %s:%d", file, line)); if (sx_xlocked(sx) && (sx->lock_object.lo_flags & LO_RECURSABLE) != 0) { sx->sx_recurse++; atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED); rval = 1; } else rval = atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED, (uintptr_t)curthread); LOCK_LOG_TRY("XLOCK", &sx->lock_object, 0, rval, file, line); if (rval) { WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); if (!sx_recursed(sx)) LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, 0, 0, file, line, LOCKSTAT_WRITER); TD_LOCKS_INC(curthread); } return (rval); } void _sx_sunlock(struct sx *sx, const char *file, int line) { if (SCHEDULER_STOPPED()) return; KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_sunlock() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_SLOCKED, file, line); WITNESS_UNLOCK(&sx->lock_object, 0, file, line); LOCK_LOG_LOCK("SUNLOCK", &sx->lock_object, 0, 0, file, line); __sx_sunlock(sx, file, line); TD_LOCKS_DEC(curthread); } void _sx_xunlock(struct sx *sx, const char *file, int line) { if (SCHEDULER_STOPPED()) return; KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_xunlock() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_XLOCKED, file, line); WITNESS_UNLOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("XUNLOCK", &sx->lock_object, 0, sx->sx_recurse, file, line); __sx_xunlock(sx, curthread, file, line); TD_LOCKS_DEC(curthread); } /* * Try to do a non-blocking upgrade from a shared lock to an exclusive lock. * This will only succeed if this thread holds a single shared lock. * Return 1 if the upgrade succeeded, 0 otherwise. */ int sx_try_upgrade_(struct sx *sx, const char *file, int line) { uintptr_t x; int success; if (SCHEDULER_STOPPED()) return (1); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_try_upgrade() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_SLOCKED, file, line); /* * Try to switch from one shared lock to an exclusive lock. We need * to maintain the SX_LOCK_EXCLUSIVE_WAITERS flag if set so that * we will wake up the exclusive waiters when we drop the lock. */ x = sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS; success = atomic_cmpset_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | x, (uintptr_t)curthread | x); LOCK_LOG_TRY("XUPGRADE", &sx->lock_object, 0, success, file, line); if (success) { WITNESS_UPGRADE(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); LOCKSTAT_RECORD0(sx__upgrade, sx); } return (success); } /* * Downgrade an unrecursed exclusive lock into a single shared lock. */ void sx_downgrade_(struct sx *sx, const char *file, int line) { uintptr_t x; int wakeup_swapper; if (SCHEDULER_STOPPED()) return; KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_downgrade() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_XLOCKED | SA_NOTRECURSED, file, line); #ifndef INVARIANTS if (sx_recursed(sx)) panic("downgrade of a recursed lock"); #endif WITNESS_DOWNGRADE(&sx->lock_object, 0, file, line); /* * Try to switch from an exclusive lock with no shared waiters * to one sharer with no shared waiters. If there are * exclusive waiters, we don't need to lock the sleep queue so * long as we preserve the flag. We do one quick try and if * that fails we grab the sleepq lock to keep the flags from * changing and do it the slow way. * * We have to lock the sleep queue if there are shared waiters * so we can wake them up.
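sx_downgrade_() here supports the common publish-then-read pattern, sketched with hypothetical names: mutate under the exclusive lock, then atomically become a reader so late arrivals are not serialized behind a full release and re-acquire.

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/sx.h>

static void
example_publish(struct sx *sxp, int *statep, int v)
{

	sx_xlock(sxp);
	*statep = v;		/* exclusive: mutate */
	sx_downgrade(sxp);	/* atomically become one of the readers */
	/* ... read-only work under the shared lock ... */
	sx_sunlock(sxp);
}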
*/ x = sx->sx_lock; if (!(x & SX_LOCK_SHARED_WAITERS) && atomic_cmpset_rel_ptr(&sx->sx_lock, x, SX_SHARERS_LOCK(1) | (x & SX_LOCK_EXCLUSIVE_WAITERS))) { LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line); return; } /* * Lock the sleep queue so we can read the waiters bits * without any races and wakeup any shared waiters. */ sleepq_lock(&sx->lock_object); /* * Preserve SX_LOCK_EXCLUSIVE_WAITERS while downgraded to a single * shared lock. If there are any shared waiters, wake them up. */ wakeup_swapper = 0; x = sx->sx_lock; atomic_store_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | (x & SX_LOCK_EXCLUSIVE_WAITERS)); if (x & SX_LOCK_SHARED_WAITERS) wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, SQ_SHARED_QUEUE); sleepq_release(&sx->lock_object); LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line); LOCKSTAT_RECORD0(sx__downgrade, sx); if (wakeup_swapper) kick_proc0(); } /* * This function represents the so-called 'hard case' for sx_xlock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. */ int _sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts, const char *file, int line) { GIANT_DECLARE; #ifdef ADAPTIVE_SX volatile struct thread *owner; u_int i, spintries = 0; #endif uintptr_t x; #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif int error = 0; #if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS uintptr_t state; u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif if (SCHEDULER_STOPPED()) return (0); -#if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS) +#if defined(ADAPTIVE_SX) lock_delay_arg_init(&lda, &sx_delay); +#elif defined(KDTRACE_HOOKS) + lock_delay_arg_init(&lda, NULL); #endif /* If we already hold an exclusive lock, then recurse. */ if (sx_xlocked(sx)) { KASSERT((sx->lock_object.lo_flags & LO_RECURSABLE) != 0, ("_sx_xlock_hard: recursed on non-recursive sx %s @ %s:%d\n", sx->lock_object.lo_name, file, line)); sx->sx_recurse++; atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED); if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p recursing", __func__, sx); return (0); } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__, sx->lock_object.lo_name, (void *)sx->sx_lock, file, line); #ifdef KDTRACE_HOOKS all_time -= lockstat_nsecs(&sx->lock_object); state = sx->sx_lock; #endif for (;;) { if (sx->sx_lock == SX_LOCK_UNLOCKED && atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED, tid)) break; #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&sx->lock_object, &contested, &waittime); #ifdef ADAPTIVE_SX /* * If the lock is write locked and the owner is * running on another CPU, spin until the owner stops * running or the state of the lock changes. 
*/ x = sx->sx_lock; if ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { if ((x & SX_LOCK_SHARED) == 0) { x = SX_OWNER(x); owner = (struct thread *)x; if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, sx, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", sx->lock_object.lo_name); GIANT_SAVE(); while (SX_OWNER(sx->sx_lock) == x && TD_IS_RUNNING(owner)) lock_delay(&lda); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } } else if (SX_SHARERS(x) && spintries < asx_retries) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", sx->lock_object.lo_name); GIANT_SAVE(); spintries++; for (i = 0; i < asx_loops; i++) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: shared spinning on %p with %u and %u", __func__, sx, spintries, i); x = sx->sx_lock; if ((x & SX_LOCK_SHARED) == 0 || SX_SHARERS(x) == 0) break; cpu_spinwait(); #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif } KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); if (i != asx_loops) continue; } } #endif sleepq_lock(&sx->lock_object); x = sx->sx_lock; /* * If the lock was released while spinning on the * sleep queue chain lock, try again. */ if (x == SX_LOCK_UNLOCKED) { sleepq_release(&sx->lock_object); continue; } #ifdef ADAPTIVE_SX /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the sleep queue * chain lock. If so, drop the sleep queue lock and try * again. */ if (!(x & SX_LOCK_SHARED) && (sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { owner = (struct thread *)SX_OWNER(x); if (TD_IS_RUNNING(owner)) { sleepq_release(&sx->lock_object); continue; } } #endif /* * If an exclusive lock was released with both shared * and exclusive waiters and a shared waiter hasn't * woken up and acquired the lock yet, sx_lock will be * set to SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS. * If we see that value, try to acquire it once. Note * that we have to preserve SX_LOCK_EXCLUSIVE_WAITERS * as there are other exclusive waiters still. If we * fail, restart the loop. */ if (x == (SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS)) { if (atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS, tid | SX_LOCK_EXCLUSIVE_WAITERS)) { sleepq_release(&sx->lock_object); CTR2(KTR_LOCK, "%s: %p claimed by new writer", __func__, sx); break; } sleepq_release(&sx->lock_object); continue; } /* * Try to set the SX_LOCK_EXCLUSIVE_WAITERS. If we fail, * then loop back and retry. */ if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) { if (!atomic_cmpset_ptr(&sx->sx_lock, x, x | SX_LOCK_EXCLUSIVE_WAITERS)) { sleepq_release(&sx->lock_object); continue; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set excl waiters flag", __func__, sx); } /* * Since we have been unable to acquire the exclusive * lock and the exclusive waiters flag is set, we have * to sleep. */ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on sleep queue", __func__, sx); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&sx->lock_object); #endif GIANT_SAVE(); sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name, SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ?
SLEEPQ_INTERRUPTIBLE : 0), SQ_EXCLUSIVE_QUEUE); if (!(opts & SX_INTERRUPTIBLE)) sleepq_wait(&sx->lock_object, 0); else error = sleepq_wait_sig(&sx->lock_object, 0); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&sx->lock_object); sleep_cnt++; #endif if (error) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: interruptible sleep by %p suspended by signal", __func__, sx); break; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from sleep queue", __func__, sx); } #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&sx->lock_object); if (sleep_time) LOCKSTAT_RECORD4(sx__block, sx, sleep_time, LOCKSTAT_WRITER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(sx__spin, sx, all_time - sleep_time, LOCKSTAT_WRITER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); #endif if (!error) LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, contested, waittime, file, line, LOCKSTAT_WRITER); GIANT_RESTORE(); return (error); } /* * This function represents the so-called 'hard case' for sx_xunlock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. */ void _sx_xunlock_hard(struct sx *sx, uintptr_t tid, const char *file, int line) { uintptr_t x; int queue, wakeup_swapper; if (SCHEDULER_STOPPED()) return; MPASS(!(sx->sx_lock & SX_LOCK_SHARED)); /* If the lock is recursed, then unrecurse one level. */ if (sx_xlocked(sx) && sx_recursed(sx)) { if ((--sx->sx_recurse) == 0) atomic_clear_ptr(&sx->sx_lock, SX_LOCK_RECURSED); if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, sx); return; } MPASS(sx->sx_lock & (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)); if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p contested", __func__, sx); sleepq_lock(&sx->lock_object); x = SX_LOCK_UNLOCKED; /* * The wake up algorithm here is quite simple and probably not * ideal. It gives precedence to shared waiters if they are * present. For this condition, we have to preserve the * state of the exclusive waiters flag. * If interruptible sleeps left the shared queue empty, avoid * starvation of the threads sleeping on the exclusive queue by giving * them precedence and clearing the shared waiters bit anyway. */ if ((sx->sx_lock & SX_LOCK_SHARED_WAITERS) != 0 && sleepq_sleepcnt(&sx->lock_object, SQ_SHARED_QUEUE) != 0) { queue = SQ_SHARED_QUEUE; x |= (sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS); } else queue = SQ_EXCLUSIVE_QUEUE; /* Wake up all the waiters for the specific queue. */ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: %p waking up all threads on %s queue", __func__, sx, queue == SQ_SHARED_QUEUE ? "shared" : "exclusive"); atomic_store_rel_ptr(&sx->sx_lock, x); wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, queue); sleepq_release(&sx->lock_object); if (wakeup_swapper) kick_proc0(); } /* * This function represents the so-called 'hard case' for sx_slock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h.
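 *
 * For reference, the 'easy case' is a single compare-and-set that
 * bumps the sharer count while the lock is share-locked (a sketch of
 * the first loop iteration below):
 *
 *	x = sx->sx_lock;
 *	if ((x & SX_LOCK_SHARED) != 0 &&
 *	    atomic_cmpset_acq_ptr(&sx->sx_lock, x, x + SX_ONE_SHARER))
 *		return (0);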
*/ int _sx_slock_hard(struct sx *sx, int opts, const char *file, int line) { GIANT_DECLARE; #ifdef ADAPTIVE_SX volatile struct thread *owner; #endif #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif uintptr_t x; int error = 0; #if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS uintptr_t state; u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif if (SCHEDULER_STOPPED()) return (0); -#if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS) +#if defined(ADAPTIVE_SX) lock_delay_arg_init(&lda, &sx_delay); +#elif defined(KDTRACE_HOOKS) + lock_delay_arg_init(&lda, NULL); #endif #ifdef KDTRACE_HOOKS state = sx->sx_lock; all_time -= lockstat_nsecs(&sx->lock_object); #endif /* * As with rwlocks, we don't make any attempt to try to block * shared locks once there is an exclusive waiter. */ for (;;) { #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif x = sx->sx_lock; /* * If no other thread has an exclusive lock then try to bump up * the count of sharers. Since we have to preserve the state * of SX_LOCK_EXCLUSIVE_WAITERS, if we fail to acquire the * shared lock loop back and retry. */ if (x & SX_LOCK_SHARED) { MPASS(!(x & SX_LOCK_SHARED_WAITERS)); if (atomic_cmpset_acq_ptr(&sx->sx_lock, x, x + SX_ONE_SHARER)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeed %p -> %p", __func__, sx, (void *)x, (void *)(x + SX_ONE_SHARER)); break; } continue; } #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&sx->lock_object, &contested, &waittime); #ifdef ADAPTIVE_SX /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. */ if ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { x = SX_OWNER(x); owner = (struct thread *)x; if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, sx, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", sx->lock_object.lo_name); GIANT_SAVE(); while (SX_OWNER(sx->sx_lock) == x && TD_IS_RUNNING(owner)) lock_delay(&lda); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } } #endif /* * Some other thread already has an exclusive lock, so * start the process of blocking. */ sleepq_lock(&sx->lock_object); x = sx->sx_lock; /* * The lock could have been released while we spun. * In this case loop back and retry. */ if (x & SX_LOCK_SHARED) { sleepq_release(&sx->lock_object); continue; } #ifdef ADAPTIVE_SX /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. */ if (!(x & SX_LOCK_SHARED) && (sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { owner = (struct thread *)SX_OWNER(x); if (TD_IS_RUNNING(owner)) { sleepq_release(&sx->lock_object); continue; } } #endif /* * Try to set the SX_LOCK_SHARED_WAITERS flag. If we * fail to set it drop the sleep queue lock and loop * back. */ if (!(x & SX_LOCK_SHARED_WAITERS)) { if (!atomic_cmpset_ptr(&sx->sx_lock, x, x | SX_LOCK_SHARED_WAITERS)) { sleepq_release(&sx->lock_object); continue; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set shared waiters flag", __func__, sx); } /* * Since we have been unable to acquire the shared lock, * we have to sleep. 
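 * Because SX_LOCK_SHARED_WAITERS was set above while the sleepqueue
 * chain lock was held, a releasing writer must take that same chain
 * lock before waking anyone, so the wakeup cannot slip in between
 * the flag check above and sleepq_wait() below.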
*/ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on sleep queue", __func__, sx); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&sx->lock_object); #endif GIANT_SAVE(); sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name, SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ? SLEEPQ_INTERRUPTIBLE : 0), SQ_SHARED_QUEUE); if (!(opts & SX_INTERRUPTIBLE)) sleepq_wait(&sx->lock_object, 0); else error = sleepq_wait_sig(&sx->lock_object, 0); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&sx->lock_object); sleep_cnt++; #endif if (error) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: interruptible sleep by %p suspended by signal", __func__, sx); break; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from sleep queue", __func__, sx); } #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&sx->lock_object); if (sleep_time) LOCKSTAT_RECORD4(sx__block, sx, sleep_time, LOCKSTAT_READER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(sx__spin, sx, all_time - sleep_time, LOCKSTAT_READER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); #endif if (error == 0) LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, contested, waittime, file, line, LOCKSTAT_READER); GIANT_RESTORE(); return (error); } /* * This function represents the so-called 'hard case' for sx_sunlock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. */ void _sx_sunlock_hard(struct sx *sx, const char *file, int line) { uintptr_t x; int wakeup_swapper; if (SCHEDULER_STOPPED()) return; for (;;) { x = sx->sx_lock; /* * We should never have shared waiters while at least one * thread holds a shared lock. */ KASSERT(!(x & SX_LOCK_SHARED_WAITERS), ("%s: waiting sharers", __func__)); /* * See if there is more than one shared lock held. If * so, just drop one and return. */ if (SX_SHARERS(x) > 1) { if (atomic_cmpset_rel_ptr(&sx->sx_lock, x, x - SX_ONE_SHARER)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeeded %p -> %p", __func__, sx, (void *)x, (void *)(x - SX_ONE_SHARER)); break; } continue; } /* * If there aren't any waiters for an exclusive lock, * then try to drop it quickly. */ if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) { MPASS(x == SX_SHARERS_LOCK(1)); if (atomic_cmpset_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1), SX_LOCK_UNLOCKED)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p last succeeded", __func__, sx); break; } continue; } /* * At this point, there should just be one sharer with * exclusive waiters. */ MPASS(x == (SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS)); sleepq_lock(&sx->lock_object); /* * The wakeup semantics here are quite simple: just wake up * all the exclusive waiters. Note that the state of the lock * could have changed, so if the cmpset fails, loop back and * retry.
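 *
 * Schematically, the release is the single transition
 *
 *	SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS -> SX_LOCK_UNLOCKED
 *
 * performed under the sleepqueue chain lock, so no new waiter can be
 * missed by the broadcast that follows.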
*/ if (!atomic_cmpset_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS, SX_LOCK_UNLOCKED)) { sleepq_release(&sx->lock_object); continue; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p waking up all threads on " "exclusive queue", __func__, sx); wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, SQ_EXCLUSIVE_QUEUE); sleepq_release(&sx->lock_object); if (wakeup_swapper) kick_proc0(); break; } } #ifdef INVARIANT_SUPPORT #ifndef INVARIANTS #undef _sx_assert #endif /* * In the non-WITNESS case, sx_assert() can only detect that at least * *some* thread owns an slock, but it cannot guarantee that *this* * thread owns an slock. */ void _sx_assert(const struct sx *sx, int what, const char *file, int line) { #ifndef WITNESS int slocked = 0; #endif if (panicstr != NULL) return; switch (what) { case SA_SLOCKED: case SA_SLOCKED | SA_NOTRECURSED: case SA_SLOCKED | SA_RECURSED: #ifndef WITNESS slocked = 1; /* FALLTHROUGH */ #endif case SA_LOCKED: case SA_LOCKED | SA_NOTRECURSED: case SA_LOCKED | SA_RECURSED: #ifdef WITNESS witness_assert(&sx->lock_object, what, file, line); #else /* * If some other thread has an exclusive lock or we * have one and are asserting a shared lock, fail. * Also, if no one has a lock at all, fail. */ if (sx->sx_lock == SX_LOCK_UNLOCKED || (!(sx->sx_lock & SX_LOCK_SHARED) && (slocked || sx_xholder(sx) != curthread))) panic("Lock %s not %slocked @ %s:%d\n", sx->lock_object.lo_name, slocked ? "share " : "", file, line); if (!(sx->sx_lock & SX_LOCK_SHARED)) { if (sx_recursed(sx)) { if (what & SA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); } else if (what & SA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); } #endif break; case SA_XLOCKED: case SA_XLOCKED | SA_NOTRECURSED: case SA_XLOCKED | SA_RECURSED: if (sx_xholder(sx) != curthread) panic("Lock %s not exclusively locked @ %s:%d\n", sx->lock_object.lo_name, file, line); if (sx_recursed(sx)) { if (what & SA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); } else if (what & SA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); break; case SA_UNLOCKED: #ifdef WITNESS witness_assert(&sx->lock_object, what, file, line); #else /* * If we hold an exclusive lock, fail. We can't * reliably check to see if we hold a shared lock or * not.
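 * (The lock word only counts how many sharers exist; it does not
 * record which threads they are, so SA_UNLOCKED can only rule out
 * exclusive ownership here.)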
*/ if (sx_xholder(sx) == curthread) panic("Lock %s exclusively locked @ %s:%d\n", sx->lock_object.lo_name, file, line); #endif break; default: panic("Unknown sx lock assertion: %d @ %s:%d", what, file, line); } } #endif /* INVARIANT_SUPPORT */ #ifdef DDB static void db_show_sx(const struct lock_object *lock) { struct thread *td; const struct sx *sx; sx = (const struct sx *)lock; db_printf(" state: "); if (sx->sx_lock == SX_LOCK_UNLOCKED) db_printf("UNLOCKED\n"); else if (sx->sx_lock == SX_LOCK_DESTROYED) { db_printf("DESTROYED\n"); return; } else if (sx->sx_lock & SX_LOCK_SHARED) db_printf("SLOCK: %ju\n", (uintmax_t)SX_SHARERS(sx->sx_lock)); else { td = sx_xholder(sx); db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); if (sx_recursed(sx)) db_printf(" recursed: %d\n", sx->sx_recurse); } db_printf(" waiters: "); switch(sx->sx_lock & (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)) { case SX_LOCK_SHARED_WAITERS: db_printf("shared\n"); break; case SX_LOCK_EXCLUSIVE_WAITERS: db_printf("exclusive\n"); break; case SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS: db_printf("exclusive and shared\n"); break; default: db_printf("none\n"); } } /* * Check to see if a thread that is blocked on a sleep queue is actually * blocked on an sx lock. If so, output some details and return true. * If the lock has an exclusive owner, return that in *ownerp. */ int sx_chain(struct thread *td, struct thread **ownerp) { struct sx *sx; /* * Check to see if this thread is blocked on an sx lock. * First, we check the lock class. If that is ok, then we * compare the lock name against the wait message. */ sx = td->td_wchan; if (LOCK_CLASS(&sx->lock_object) != &lock_class_sx || sx->lock_object.lo_name != td->td_wmesg) return (0); /* We think we have an sx lock, so output some details. */ db_printf("blocked on sx \"%s\" ", td->td_wmesg); *ownerp = sx_xholder(sx); if (sx->sx_lock & SX_LOCK_SHARED) db_printf("SLOCK (count %ju)\n", (uintmax_t)SX_SHARERS(sx->sx_lock)); else db_printf("XLOCK\n"); return (1); } #endif Index: user/alc/PQ_LAUNDRY/sys/netinet/tcp_lro.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/netinet/tcp_lro.c (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/netinet/tcp_lro.c (revision 303667) @@ -1,876 +1,941 @@ /*- * Copyright (c) 2007, Myricom Inc. * Copyright (c) 2008, Intel Corporation. * Copyright (c) 2012 The FreeBSD Foundation * Copyright (c) 2016 Mellanox Technologies. * All rights reserved. * * Portions of this software were developed by Bjoern Zeeb * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures"); #define TCP_LRO_UPDATE_CSUM 1 #ifndef TCP_LRO_UPDATE_CSUM #define TCP_LRO_INVALID_CSUM 0x0000 #endif static void tcp_lro_rx_done(struct lro_ctrl *lc); +static int tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, + uint32_t csum, int use_hash); static __inline void -tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_entry *le) +tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket, + struct lro_entry *le) { LIST_INSERT_HEAD(&lc->lro_active, le, next); + LIST_INSERT_HEAD(bucket, le, hash_next); } static __inline void tcp_lro_active_remove(struct lro_entry *le) { - LIST_REMOVE(le, next); + LIST_REMOVE(le, next); /* active list */ + LIST_REMOVE(le, hash_next); /* hash bucket */ } int tcp_lro_init(struct lro_ctrl *lc) { return (tcp_lro_init_args(lc, NULL, TCP_LRO_ENTRIES, 0)); } int tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp, unsigned lro_entries, unsigned lro_mbufs) { struct lro_entry *le; size_t size; - unsigned i; + unsigned i, elements; lc->lro_bad_csum = 0; lc->lro_queued = 0; lc->lro_flushed = 0; lc->lro_cnt = 0; lc->lro_mbuf_count = 0; lc->lro_mbuf_max = lro_mbufs; lc->lro_cnt = lro_entries; lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX; lc->lro_length_lim = TCP_LRO_LENGTH_MAX; lc->ifp = ifp; LIST_INIT(&lc->lro_free); LIST_INIT(&lc->lro_active); + /* create hash table to accelerate entry lookup */ + if (lro_entries > lro_mbufs) + elements = lro_entries; + else + elements = lro_mbufs; + lc->lro_hash = phashinit_flags(elements, M_LRO, &lc->lro_hashsz, + HASH_NOWAIT); + if (lc->lro_hash == NULL) { + memset(lc, 0, sizeof(*lc)); + return (ENOMEM); + } + /* compute size to allocate */ size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) + (lro_entries * sizeof(*le)); lc->lro_mbuf_data = (struct lro_mbuf_sort *) malloc(size, M_LRO, M_NOWAIT | M_ZERO); /* check for out of memory */ if (lc->lro_mbuf_data == NULL) { memset(lc, 0, sizeof(*lc)); return (ENOMEM); } /* compute offset for LRO entries */ le = (struct lro_entry *) (lc->lro_mbuf_data + lro_mbufs); /* setup linked list */ for (i = 0; i != lro_entries; i++) LIST_INSERT_HEAD(&lc->lro_free, le + i, next); return (0); } void tcp_lro_free(struct lro_ctrl *lc) { struct lro_entry *le; unsigned x; /* reset LRO free list */ LIST_INIT(&lc->lro_free); /* free active mbufs, if any */ while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { tcp_lro_active_remove(le); m_freem(le->m_head); } + /* free hash table */ + if (lc->lro_hash != NULL) { + free(lc->lro_hash, M_LRO); + lc->lro_hash = NULL; + } + lc->lro_hashsz = 0; + /* free mbuf array, if any */ for (x = 0; x != lc->lro_mbuf_count; x++) m_freem(lc->lro_mbuf_data[x].mb); lc->lro_mbuf_count = 0; /* free allocated memory, if any */ free(lc->lro_mbuf_data, M_LRO); 
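/*
 * Clear the pointer below so that a repeated tcp_lro_free() is a
 * harmless no-op; free(9) accepts a NULL pointer.
 */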
lc->lro_mbuf_data = NULL; } #ifdef TCP_LRO_UPDATE_CSUM static uint16_t tcp_lro_csum_th(struct tcphdr *th) { uint32_t ch; uint16_t *p, l; ch = th->th_sum = 0x0000; l = th->th_off; p = (uint16_t *)th; while (l > 0) { ch += *p; p++; ch += *p; p++; l--; } while (ch > 0xffff) ch = (ch >> 16) + (ch & 0xffff); return (ch & 0xffff); } static uint16_t tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th, uint16_t tcp_data_len, uint16_t csum) { uint32_t c; uint16_t cs; c = csum; /* Remove length from checksum. */ switch (le->eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)l3hdr; if (le->append_cnt == 0) cs = ip6->ip6_plen; else { uint32_t cx; cx = ntohs(ip6->ip6_plen); cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0); } break; } #endif #ifdef INET case ETHERTYPE_IP: { struct ip *ip4; ip4 = (struct ip *)l3hdr; if (le->append_cnt == 0) cs = ip4->ip_len; else { cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4), IPPROTO_TCP); cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr, htons(cs)); } break; } #endif default: cs = 0; /* Keep compiler happy. */ } cs = ~cs; c += cs; /* Remove TCP header csum. */ cs = ~tcp_lro_csum_th(th); c += cs; while (c > 0xffff) c = (c >> 16) + (c & 0xffff); return (c & 0xffff); } #endif static void tcp_lro_rx_done(struct lro_ctrl *lc) { struct lro_entry *le; while ((le = LIST_FIRST(&lc->lro_active)) != NULL) { tcp_lro_active_remove(le); tcp_lro_flush(lc, le); } } void tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout) { struct lro_entry *le, *le_tmp; struct timeval tv; if (LIST_EMPTY(&lc->lro_active)) return; getmicrotime(&tv); timevalsub(&tv, timeout); LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) { if (timevalcmp(&tv, &le->mtime, >=)) { tcp_lro_active_remove(le); tcp_lro_flush(lc, le); } } } void tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le) { if (le->append_cnt > 0) { struct tcphdr *th; uint16_t p_len; p_len = htons(le->p_len); switch (le->eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: { struct ip6_hdr *ip6; ip6 = le->le_ip6; ip6->ip6_plen = p_len; th = (struct tcphdr *)(ip6 + 1); le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR; le->p_len += ETHER_HDR_LEN + sizeof(*ip6); break; } #endif #ifdef INET case ETHERTYPE_IP: { struct ip *ip4; #ifdef TCP_LRO_UPDATE_CSUM uint32_t cl; uint16_t c; #endif ip4 = le->le_ip4; #ifdef TCP_LRO_UPDATE_CSUM /* Fix IP header checksum for new length. */ c = ~ip4->ip_sum; cl = c; c = ~ip4->ip_len; cl += c + p_len; while (cl > 0xffff) cl = (cl >> 16) + (cl & 0xffff); c = cl; ip4->ip_sum = ~c; #else ip4->ip_sum = TCP_LRO_INVALID_CSUM; #endif ip4->ip_len = p_len; th = (struct tcphdr *)(ip4 + 1); le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID; le->p_len += ETHER_HDR_LEN; break; } #endif default: th = NULL; /* Keep compiler happy. */ } le->m_head->m_pkthdr.csum_data = 0xffff; le->m_head->m_pkthdr.len = le->p_len; /* Incorporate the latest ACK into the TCP header. */ th->th_ack = le->ack_seq; th->th_win = le->window; /* Incorporate latest timestamp into the TCP header. */ if (le->timestamp != 0) { uint32_t *ts_ptr; ts_ptr = (uint32_t *)(th + 1); ts_ptr[1] = htonl(le->tsval); ts_ptr[2] = le->tsecr; } #ifdef TCP_LRO_UPDATE_CSUM /* Update the TCP header checksum. 
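 * The running 32-bit accumulator is folded back into 16 bits using
 * the usual one's-complement reduction; as a standalone sketch (the
 * helper name is illustrative, not part of this file):
 *
 *	static uint16_t
 *	csum_fold(uint32_t sum)
 *	{
 *		while (sum > 0xffff)
 *			sum = (sum >> 16) + (sum & 0xffff);
 *		return (sum);
 *	}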
*/ le->ulp_csum += p_len; le->ulp_csum += tcp_lro_csum_th(th); while (le->ulp_csum > 0xffff) le->ulp_csum = (le->ulp_csum >> 16) + (le->ulp_csum & 0xffff); th->th_sum = (le->ulp_csum & 0xffff); th->th_sum = ~th->th_sum; #else th->th_sum = TCP_LRO_INVALID_CSUM; #endif } (*lc->ifp->if_input)(lc->ifp, le->m_head); lc->lro_queued += le->append_cnt + 1; lc->lro_flushed++; bzero(le, sizeof(*le)); LIST_INSERT_HEAD(&lc->lro_free, le, next); } #ifdef HAVE_INLINE_FLSLL #define tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1)) #else static inline uint64_t tcp_lro_msb_64(uint64_t x) { x |= (x >> 1); x |= (x >> 2); x |= (x >> 4); x |= (x >> 8); x |= (x >> 16); x |= (x >> 32); return (x & ~(x >> 1)); } #endif /* * The tcp_lro_sort() routine is comparable to qsort(), except it has * a worst case complexity limit of O(MIN(N,64)*N), where N is the * number of elements to sort and 64 is the number of sequence bits * available. The algorithm is bit-slicing the 64-bit sequence number, * sorting one bit at a time from the most significant bit until the * least significant one, skipping the constant bits. This is * typically called a radix sort. */ static void tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size) { struct lro_mbuf_sort temp; uint64_t ones; uint64_t zeros; uint32_t x; uint32_t y; repeat: /* for small arrays insertion sort is faster */ if (size <= 12) { for (x = 1; x < size; x++) { temp = parray[x]; for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--) parray[y] = parray[y - 1]; parray[y] = temp; } return; } /* compute sequence bits which are constant */ ones = 0; zeros = 0; for (x = 0; x != size; x++) { ones |= parray[x].seq; zeros |= ~parray[x].seq; } /* compute bits which are not constant into "ones" */ ones &= zeros; if (ones == 0) return; /* pick the most significant bit which is not constant */ ones = tcp_lro_msb_64(ones); /* * Move entries having cleared sequence bits to the beginning * of the array: */ for (x = y = 0; y != size; y++) { /* skip set bits */ if (parray[y].seq & ones) continue; /* swap entries */ temp = parray[x]; parray[x] = parray[y]; parray[y] = temp; x++; } KASSERT(x != 0 && x != size, ("Memory is corrupted\n")); /* sort zeros */ tcp_lro_sort(parray, x); /* sort ones */ parray += x; size -= x; goto repeat; } void tcp_lro_flush_all(struct lro_ctrl *lc) { uint64_t seq; uint64_t nseq; unsigned x; /* check if no mbufs to flush */ if (lc->lro_mbuf_count == 0) goto done; /* sort all mbufs according to stream */ tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count); /* input data into LRO engine, stream by stream */ seq = 0; for (x = 0; x != lc->lro_mbuf_count; x++) { struct mbuf *mb; /* get mbuf */ mb = lc->lro_mbuf_data[x].mb; /* get sequence number, masking away the packet index */ nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24); /* check for new stream */ if (seq != nseq) { seq = nseq; /* flush active streams */ tcp_lro_rx_done(lc); } /* add packet to LRO engine */ - if (tcp_lro_rx(lc, mb, 0) != 0) { + if (tcp_lro_rx2(lc, mb, 0, 0) != 0) { /* input packet to network layer */ (*lc->ifp->if_input)(lc->ifp, mb); lc->lro_queued++; lc->lro_flushed++; } } done: /* flush active streams */ tcp_lro_rx_done(lc); lc->lro_mbuf_count = 0; } #ifdef INET6 static int tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6, struct tcphdr **th) { /* XXX-BZ we should check the flow-label. */ /* XXX-BZ We do not yet support ext. hdrs. */ if (ip6->ip6_nxt != IPPROTO_TCP) return (TCP_LRO_NOT_SUPPORTED); /* Find the TCP header. 
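 * With extension headers ruled out above, it starts immediately
 * after the fixed 40-byte IPv6 header.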
*/ *th = (struct tcphdr *)(ip6 + 1); return (0); } #endif #ifdef INET static int tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4, struct tcphdr **th) { int csum_flags; uint16_t csum; if (ip4->ip_p != IPPROTO_TCP) return (TCP_LRO_NOT_SUPPORTED); /* Ensure there are no options. */ if ((ip4->ip_hl << 2) != sizeof (*ip4)) return (TCP_LRO_CANNOT); /* .. and the packet is not fragmented. */ if (ip4->ip_off & htons(IP_MF|IP_OFFMASK)) return (TCP_LRO_CANNOT); /* Legacy IP has a header checksum that needs to be correct. */ csum_flags = m->m_pkthdr.csum_flags; if (csum_flags & CSUM_IP_CHECKED) { if (__predict_false((csum_flags & CSUM_IP_VALID) == 0)) { lc->lro_bad_csum++; return (TCP_LRO_CANNOT); } } else { csum = in_cksum_hdr(ip4); if (__predict_false((csum) != 0)) { lc->lro_bad_csum++; return (TCP_LRO_CANNOT); } } /* Find the TCP header (we ensured there are no IP options). */ *th = (struct tcphdr *)(ip4 + 1); return (0); } #endif -int -tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) +static int +tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash) { struct lro_entry *le; struct ether_header *eh; #ifdef INET6 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ #endif #ifdef INET struct ip *ip4 = NULL; /* Keep compiler happy. */ #endif struct tcphdr *th; void *l3hdr = NULL; /* Keep compiler happy. */ uint32_t *ts_ptr; tcp_seq seq; int error, ip_len, l; uint16_t eh_type, tcp_data_len; + struct lro_head *bucket; /* We expect a contiguous header [eh, ip, tcp]. */ eh = mtod(m, struct ether_header *); eh_type = ntohs(eh->ether_type); switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: { CURVNET_SET(lc->ifp->if_vnet); if (V_ip6_forwarding != 0) { /* XXX-BZ stats but changing lro_ctrl is a problem. */ CURVNET_RESTORE(); return (TCP_LRO_CANNOT); } CURVNET_RESTORE(); l3hdr = ip6 = (struct ip6_hdr *)(eh + 1); error = tcp_lro_rx_ipv6(lc, m, ip6, &th); if (error != 0) return (error); tcp_data_len = ntohs(ip6->ip6_plen); ip_len = sizeof(*ip6) + tcp_data_len; break; } #endif #ifdef INET case ETHERTYPE_IP: { CURVNET_SET(lc->ifp->if_vnet); if (V_ipforwarding != 0) { /* XXX-BZ stats but changing lro_ctrl is a problem. */ CURVNET_RESTORE(); return (TCP_LRO_CANNOT); } CURVNET_RESTORE(); l3hdr = ip4 = (struct ip *)(eh + 1); error = tcp_lro_rx_ipv4(lc, m, ip4, &th); if (error != 0) return (error); ip_len = ntohs(ip4->ip_len); tcp_data_len = ip_len - sizeof(*ip4); break; } #endif /* XXX-BZ what happens in case of VLAN(s)? */ default: return (TCP_LRO_NOT_SUPPORTED); } /* * If the frame is padded beyond the end of the IP packet, then we must * trim the extra bytes off. */ l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len); if (l != 0) { if (l < 0) /* Truncated packet. */ return (TCP_LRO_CANNOT); m_adj(m, -l); } /* * Check TCP header constraints. */ /* Ensure no bits are set besides ACK or PSH. */ if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) return (TCP_LRO_CANNOT); /* XXX-BZ We lose an ACK|PUSH flag when concatenating multiple segments. */ /* XXX-BZ Ideally we'd flush on PUSH? */ /* * Check for timestamps. * Since the only option we handle is timestamps, we only have to * handle the simple case of aligned timestamps. */ l = (th->th_off << 2); tcp_data_len -= l; l -= sizeof(*th); ts_ptr = (uint32_t *)(th + 1); if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) || (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16| TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) return (TCP_LRO_CANNOT); /* If the driver did not pass in the checksum, set it now.
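 *
 * (The hash bucket selection just below is cheapest when the driver
 * has already attached a flow identifier to the mbuf; a sketch of
 * that driver-side step, assuming an RSS hash value in 'rsshash':
 *
 *	m->m_pkthdr.flowid = rsshash;
 *	M_HASHTYPE_SET(m, M_HASHTYPE_RSS_TCP_IPV4);
 *
 * Otherwise the code falls back to summing addresses and ports.)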
*/ if (csum == 0x0000) csum = th->th_sum; seq = ntohl(th->th_seq); + if (!use_hash) { + bucket = &lc->lro_hash[0]; + } else if (M_HASHTYPE_ISHASH(m)) { + bucket = &lc->lro_hash[m->m_pkthdr.flowid % lc->lro_hashsz]; + } else { + uint32_t hash; + + switch (eh_type) { +#ifdef INET + case ETHERTYPE_IP: + hash = ip4->ip_src.s_addr + ip4->ip_dst.s_addr; + break; +#endif +#ifdef INET6 + case ETHERTYPE_IPV6: + hash = ip6->ip6_src.s6_addr32[0] + + ip6->ip6_dst.s6_addr32[0]; + hash += ip6->ip6_src.s6_addr32[1] + + ip6->ip6_dst.s6_addr32[1]; + hash += ip6->ip6_src.s6_addr32[2] + + ip6->ip6_dst.s6_addr32[2]; + hash += ip6->ip6_src.s6_addr32[3] + + ip6->ip6_dst.s6_addr32[3]; + break; +#endif + default: + hash = 0; + break; + } + hash += th->th_sport + th->th_dport; + bucket = &lc->lro_hash[hash % lc->lro_hashsz]; + } + /* Try to find a matching previous segment. */ - LIST_FOREACH(le, &lc->lro_active, next) { + LIST_FOREACH(le, bucket, hash_next) { if (le->eh_type != eh_type) continue; if (le->source_port != th->th_sport || le->dest_port != th->th_dport) continue; switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: if (bcmp(&le->source_ip6, &ip6->ip6_src, sizeof(struct in6_addr)) != 0 || bcmp(&le->dest_ip6, &ip6->ip6_dst, sizeof(struct in6_addr)) != 0) continue; break; #endif #ifdef INET case ETHERTYPE_IP: if (le->source_ip4 != ip4->ip_src.s_addr || le->dest_ip4 != ip4->ip_dst.s_addr) continue; break; #endif } /* Flush now if appending will result in overflow. */ if (le->p_len > (lc->lro_length_lim - tcp_data_len)) { tcp_lro_active_remove(le); tcp_lro_flush(lc, le); break; } /* Try to append the new segment. */ if (__predict_false(seq != le->next_seq || (tcp_data_len == 0 && le->ack_seq == th->th_ack))) { /* Out of order packet or duplicate ACK. */ tcp_lro_active_remove(le); tcp_lro_flush(lc, le); return (TCP_LRO_CANNOT); } if (l != 0) { uint32_t tsval = ntohl(*(ts_ptr + 1)); /* Make sure timestamp values are increasing. */ /* XXX-BZ flip and use TSTMP_GEQ macro for this? */ if (__predict_false(le->tsval > tsval || *(ts_ptr + 2) == 0)) return (TCP_LRO_CANNOT); le->tsval = tsval; le->tsecr = *(ts_ptr + 2); } le->next_seq += tcp_data_len; le->ack_seq = th->th_ack; le->window = th->th_win; le->append_cnt++; #ifdef TCP_LRO_UPDATE_CSUM le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len, ~csum); #endif if (tcp_data_len == 0) { m_freem(m); /* * Flush this LRO entry, if this ACK should not * be further delayed. */ if (le->append_cnt >= lc->lro_ackcnt_lim) { tcp_lro_active_remove(le); tcp_lro_flush(lc, le); } return (0); } le->p_len += tcp_data_len; /* * Adjust the mbuf so that m_data points to the first byte of * the ULP payload. Adjust the mbuf to avoid complications and * append new segment to existing mbuf chain. */ m_adj(m, m->m_pkthdr.len - tcp_data_len); m_demote_pkthdr(m); le->m_tail->m_next = m; le->m_tail = m_last(m); /* * If a possible next full length packet would cause an * overflow, pro-actively flush now. */ if (le->p_len > (lc->lro_length_lim - lc->ifp->if_mtu)) { tcp_lro_active_remove(le); tcp_lro_flush(lc, le); } else getmicrotime(&le->mtime); return (0); } /* Try to find an empty slot. */ if (LIST_EMPTY(&lc->lro_free)) return (TCP_LRO_NO_ENTRIES); /* Start a new segment chain. */ le = LIST_FIRST(&lc->lro_free); LIST_REMOVE(le, next); - tcp_lro_active_insert(lc, le); + tcp_lro_active_insert(lc, bucket, le); getmicrotime(&le->mtime); /* Start filling in details. 
*/ switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: le->le_ip6 = ip6; le->source_ip6 = ip6->ip6_src; le->dest_ip6 = ip6->ip6_dst; le->eh_type = eh_type; le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6); break; #endif #ifdef INET case ETHERTYPE_IP: le->le_ip4 = ip4; le->source_ip4 = ip4->ip_src.s_addr; le->dest_ip4 = ip4->ip_dst.s_addr; le->eh_type = eh_type; le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN; break; #endif } le->source_port = th->th_sport; le->dest_port = th->th_dport; le->next_seq = seq + tcp_data_len; le->ack_seq = th->th_ack; le->window = th->th_win; if (l != 0) { le->timestamp = 1; le->tsval = ntohl(*(ts_ptr + 1)); le->tsecr = *(ts_ptr + 2); } #ifdef TCP_LRO_UPDATE_CSUM /* * Do not touch the csum of the first packet. However save the * "adjusted" checksum of just the source and destination addresses, * the next header and the TCP payload. The length and TCP header * parts may change, so we remove those from the saved checksum and * re-add with final values on tcp_lro_flush() if needed. */ KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n", __func__, le, le->ulp_csum)); le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len, ~csum); th->th_sum = csum; /* Restore checksum on first packet. */ #endif le->m_head = m; le->m_tail = m_last(m); return (0); +} + +int +tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum) +{ + + return tcp_lro_rx2(lc, m, csum, 1); } void tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb) { /* sanity checks */ if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL || lc->lro_mbuf_max == 0)) { /* packet drop */ m_freem(mb); return; } /* check if packet is not LRO capable */ if (__predict_false(mb->m_pkthdr.csum_flags == 0 || (lc->ifp->if_capenable & IFCAP_LRO) == 0)) { lc->lro_flushed++; lc->lro_queued++; /* input packet to network layer */ (*lc->ifp->if_input) (lc->ifp, mb); return; } /* check if array is full */ if (__predict_false(lc->lro_mbuf_count == lc->lro_mbuf_max)) tcp_lro_flush_all(lc); /* create sequence number */ lc->lro_mbuf_data[lc->lro_mbuf_count].seq = (((uint64_t)M_HASHTYPE_GET(mb)) << 56) | (((uint64_t)mb->m_pkthdr.flowid) << 24) | ((uint64_t)lc->lro_mbuf_count); /* enter mbuf */ lc->lro_mbuf_data[lc->lro_mbuf_count++].mb = mb; } /* end */ Index: user/alc/PQ_LAUNDRY/sys/netinet/tcp_lro.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/netinet/tcp_lro.h (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/netinet/tcp_lro.h (revision 303667) @@ -1,118 +1,121 @@ /*- * Copyright (c) 2006, Myricom Inc. * Copyright (c) 2008, Intel Corporation. * Copyright (c) 2016 Mellanox Technologies. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _TCP_LRO_H_ #define _TCP_LRO_H_ #include #ifndef TCP_LRO_ENTRIES /* Define default number of LRO entries per RX queue */ #define TCP_LRO_ENTRIES 8 #endif struct lro_entry { LIST_ENTRY(lro_entry) next; + LIST_ENTRY(lro_entry) hash_next; struct mbuf *m_head; struct mbuf *m_tail; union { struct ip *ip4; struct ip6_hdr *ip6; } leip; union { in_addr_t s_ip4; struct in6_addr s_ip6; } lesource; union { in_addr_t d_ip4; struct in6_addr d_ip6; } ledest; uint16_t source_port; uint16_t dest_port; uint16_t eh_type; /* EthernetHeader type. */ uint16_t append_cnt; uint32_t p_len; /* IP header payload length. */ uint32_t ulp_csum; /* TCP, etc. checksum. */ uint32_t next_seq; /* tcp_seq */ uint32_t ack_seq; /* tcp_seq */ uint32_t tsval; uint32_t tsecr; uint16_t window; uint16_t timestamp; /* flag, not a TCP hdr field. */ struct timeval mtime; }; LIST_HEAD(lro_head, lro_entry); #define le_ip4 leip.ip4 #define le_ip6 leip.ip6 #define source_ip4 lesource.s_ip4 #define dest_ip4 ledest.d_ip4 #define source_ip6 lesource.s_ip6 #define dest_ip6 ledest.d_ip6 struct lro_mbuf_sort { uint64_t seq; struct mbuf *mb; }; /* NB: This is part of driver structs. */ struct lro_ctrl { struct ifnet *ifp; struct lro_mbuf_sort *lro_mbuf_data; uint64_t lro_queued; uint64_t lro_flushed; uint64_t lro_bad_csum; unsigned lro_cnt; unsigned lro_mbuf_count; unsigned lro_mbuf_max; unsigned short lro_ackcnt_lim; /* max # of aggregated ACKs */ unsigned lro_length_lim; /* max len of aggregated data */ + u_long lro_hashsz; + struct lro_head *lro_hash; struct lro_head lro_active; struct lro_head lro_free; }; #define TCP_LRO_LENGTH_MAX 65535 #define TCP_LRO_ACKCNT_MAX 65535 /* unlimited */ int tcp_lro_init(struct lro_ctrl *); int tcp_lro_init_args(struct lro_ctrl *, struct ifnet *, unsigned, unsigned); void tcp_lro_free(struct lro_ctrl *); void tcp_lro_flush_inactive(struct lro_ctrl *, const struct timeval *); void tcp_lro_flush(struct lro_ctrl *, struct lro_entry *); void tcp_lro_flush_all(struct lro_ctrl *); int tcp_lro_rx(struct lro_ctrl *, struct mbuf *, uint32_t); void tcp_lro_queue_mbuf(struct lro_ctrl *, struct mbuf *); #define TCP_LRO_NO_ENTRIES -2 #define TCP_LRO_CANNOT -1 #define TCP_LRO_NOT_SUPPORTED 1 #endif /* _TCP_LRO_H_ */ Index: user/alc/PQ_LAUNDRY/sys/netinet6/ip6_output.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/netinet6/ip6_output.c (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/netinet6/ip6_output.c (revision 303667) @@ -1,3077 +1,3078 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_output.c,v 1.279 2002/01/26 06:12:30 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_sctp.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef IPSEC #include #include #include #include #endif /* IPSEC */ #ifdef SCTP #include #include #endif #include #include #ifdef FLOWTABLE #include #endif extern int in6_mcast_loop; struct ip6_exthdrs { struct mbuf *ip6e_ip6; struct mbuf *ip6e_hbh; struct mbuf *ip6e_dest1; struct mbuf *ip6e_rthdr; struct mbuf *ip6e_dest2; }; static MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options"); static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **, struct ucred *, int); static int ip6_pcbopts(struct ip6_pktopts **, struct mbuf *, struct socket *, struct sockopt *); static int ip6_getpcbopt(struct ip6_pktopts *, int, struct sockopt *); static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *, struct ucred *, int, int, int); static int ip6_copyexthdr(struct mbuf **, caddr_t, int); static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int, struct ip6_frag **); static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t); static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *); static int ip6_getpmtu(struct route_in6 *, int, struct ifnet *, const struct in6_addr *, u_long *, int *, u_int, u_int); static int ip6_calcmtu(struct ifnet *, const struct in6_addr *, u_long, u_long *, int *, u_int); static int ip6_getpmtu_ctl(u_int, const struct in6_addr *, u_long *); static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int); /* * Make an extension header from option data. hp is the source, and * mp is the destination. */ #define MAKE_EXTHDR(hp, mp) \ do { \ if (hp) { \ struct ip6_ext *eh = (struct ip6_ext *)(hp); \ error = ip6_copyexthdr((mp), (caddr_t)(hp), \ ((eh)->ip6e_len + 1) << 3); \ if (error) \ goto freehdrs; \ } \ } while (/*CONSTCOND*/ 0) /* * Form a chain of extension headers. * m is the extension header mbuf * mp is the previous mbuf in the chain * p is the next header * i is the type of option. */ #define MAKE_CHAIN(m, mp, p, i)\ do {\ if (m) {\ if (!hdrsplit) \ panic("assumption failed: hdr not split"); \ *mtod((m), u_char *) = *(p);\ *(p) = (i);\ p = mtod((m), u_char *);\ (m)->m_next = (mp)->m_next;\ (mp)->m_next = (m);\ (mp) = (m);\ }\ } while (/*CONSTCOND*/ 0) void in6_delayed_cksum(struct mbuf *m, uint32_t plen, u_short offset) { u_short csum; csum = in_cksum_skip(m, offset + plen, offset); if (m->m_pkthdr.csum_flags & CSUM_UDP_IPV6 && csum == 0) csum = 0xffff; offset += m->m_pkthdr.csum_data; /* checksum offset */ if (offset + sizeof(u_short) > m->m_len) { printf("%s: delayed m_pullup, m->len: %d plen %u off %u " "csum_flags=%b\n", __func__, m->m_len, plen, offset, (int)m->m_pkthdr.csum_flags, CSUM_BITS); /* * XXX this should not happen, but if it does, the correct * behavior may be to insert the checksum in the appropriate * next mbuf in the chain. 
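 * (One way to handle that case would be m_copyback(m, offset,
 * sizeof(csum), (caddr_t)&csum), which copes with a checksum field
 * that straddles mbufs at the cost of an extra copy.)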
*/ return; } *(u_short *)(m->m_data + offset) = csum; } int ip6_fragment(struct ifnet *ifp, struct mbuf *m0, int hlen, u_char nextproto, int mtu, uint32_t id) { struct mbuf *m, **mnext, *m_frgpart; struct ip6_hdr *ip6, *mhip6; struct ip6_frag *ip6f; int off; int error; int tlen = m0->m_pkthdr.len; m = m0; ip6 = mtod(m, struct ip6_hdr *); mnext = &m->m_nextpkt; for (off = hlen; off < tlen; off += mtu) { m = m_gethdr(M_NOWAIT, MT_DATA); if (!m) { IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } m->m_flags = m0->m_flags & M_COPYFLAGS; *mnext = m; mnext = &m->m_nextpkt; m->m_data += max_linkhdr; mhip6 = mtod(m, struct ip6_hdr *); *mhip6 = *ip6; m->m_len = sizeof(*mhip6); error = ip6_insertfraghdr(m0, m, hlen, &ip6f); if (error) { IP6STAT_INC(ip6s_odropped); return (error); } ip6f->ip6f_offlg = htons((u_short)((off - hlen) & ~7)); if (off + mtu >= tlen) mtu = tlen - off; else ip6f->ip6f_offlg |= IP6F_MORE_FRAG; mhip6->ip6_plen = htons((u_short)(mtu + hlen + sizeof(*ip6f) - sizeof(struct ip6_hdr))); if ((m_frgpart = m_copy(m0, off, mtu)) == NULL) { IP6STAT_INC(ip6s_odropped); return (ENOBUFS); } m_cat(m, m_frgpart); m->m_pkthdr.len = mtu + hlen + sizeof(*ip6f); m->m_pkthdr.fibnum = m0->m_pkthdr.fibnum; m->m_pkthdr.rcvif = NULL; ip6f->ip6f_reserved = 0; ip6f->ip6f_ident = id; ip6f->ip6f_nxt = nextproto; IP6STAT_INC(ip6s_ofragments); in6_ifstat_inc(ifp, ifs6_out_fragcreat); } return (0); } /* * IP6 output. The packet in mbuf chain m contains a skeletal IP6 * header (with pri, len, nxt, hlim, src, dst). * This function may modify ver and hlim only. * The mbuf chain containing the packet will be freed. * The mbuf opt, if present, will not be freed. * If route_in6 ro is present and has ro_rt initialized, route lookup would be * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL, * then result of route lookup is stored in ro->ro_rt. * * type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int, and * nd_ifinfo.linkmtu is u_int32_t. so we use u_long to hold largest one, * which is rt_mtu. * * ifpp - XXX: just for statistics */ /* * XXX TODO: no flowid is assigned for outbound flows? */ int ip6_output(struct mbuf *m0, struct ip6_pktopts *opt, struct route_in6 *ro, int flags, struct ip6_moptions *im6o, struct ifnet **ifpp, struct inpcb *inp) { struct ip6_hdr *ip6; struct ifnet *ifp, *origifp; struct mbuf *m = m0; struct mbuf *mprev = NULL; int hlen, tlen, len; struct route_in6 ip6route; struct rtentry *rt = NULL; struct sockaddr_in6 *dst, src_sa, dst_sa; struct in6_addr odst; int error = 0; struct in6_ifaddr *ia = NULL; u_long mtu; int alwaysfrag, dontfrag; u_int32_t optlen = 0, plen = 0, unfragpartlen = 0; struct ip6_exthdrs exthdrs; struct in6_addr src0, dst0; u_int32_t zone; struct route_in6 *ro_pmtu = NULL; int hdrsplit = 0; int sw_csum, tso; int needfiblookup; uint32_t fibnum; struct m_tag *fwd_tag = NULL; uint32_t id; if (inp != NULL) { M_SETFIB(m, inp->inp_inc.inc_fibnum); if ((flags & IP_NODEFAULTFLOWID) == 0) { /* unconditionally set flowid */ m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); } } bzero(&exthdrs, sizeof(exthdrs)); if (opt) { /* Hop-by-Hop options header */ MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh); /* Destination options header(1st part) */ if (opt->ip6po_rthdr) { /* * Destination options header(1st part) * This only makes sense with a routing header. * See Section 9.2 of RFC 3542. * Disabling this part just for MIP6 convenience is * a bad idea. 
We need to think carefully about a * way to make the advanced API coexist with MIP6 * options, which might automatically be inserted in * the kernel. */ MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1); } /* Routing header */ MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr); /* Destination options header(2nd part) */ MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2); } #ifdef IPSEC /* * IPSec checking which handles several cases. * FAST IPSEC: We re-injected the packet. * XXX: need scope argument. */ switch(ip6_ipsec_output(&m, inp, &error)) { case 1: /* Bad packet */ goto freehdrs; case -1: /* IPSec done */ goto done; case 0: /* No IPSec */ default: break; } #endif /* IPSEC */ /* * Calculate the total length of the extension header chain. * Keep the length of the unfragmentable part for fragmentation. */ optlen = 0; if (exthdrs.ip6e_hbh) optlen += exthdrs.ip6e_hbh->m_len; if (exthdrs.ip6e_dest1) optlen += exthdrs.ip6e_dest1->m_len; if (exthdrs.ip6e_rthdr) optlen += exthdrs.ip6e_rthdr->m_len; unfragpartlen = optlen + sizeof(struct ip6_hdr); /* NOTE: we don't add AH/ESP length here (done in ip6_ipsec_output) */ if (exthdrs.ip6e_dest2) optlen += exthdrs.ip6e_dest2->m_len; /* * If there is at least one extension header, * separate IP6 header from the payload. */ if (optlen && !hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } ip6 = mtod(m, struct ip6_hdr *); /* adjust mbuf packet header length */ m->m_pkthdr.len += optlen; plen = m->m_pkthdr.len - sizeof(*ip6); /* If this is a jumbo payload, insert a jumbo payload option. */ if (plen > IPV6_MAXPACKET) { if (!hdrsplit) { if ((error = ip6_splithdr(m, &exthdrs)) != 0) { m = NULL; goto freehdrs; } m = exthdrs.ip6e_ip6; hdrsplit++; } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0) goto freehdrs; ip6->ip6_plen = 0; } else ip6->ip6_plen = htons(plen); /* * Concatenate headers and fill in next header fields. * Here we have, on "m" * IPv6 payload * and we insert headers accordingly. Finally, we should be getting: * IPv6 hbh dest1 rthdr ah* [esp* dest2 payload] * * during the header composing process, "m" points to IPv6 header. * "mprev" points to an extension header prior to esp. */ u_char *nexthdrp = &ip6->ip6_nxt; mprev = m; /* * we treat dest2 specially. this makes IPsec processing * much easier. the goal here is to make mprev point the * mbuf prior to dest2. * * result: IPv6 dest2 payload * m and mprev will point to IPv6 header. */ if (exthdrs.ip6e_dest2) { if (!hdrsplit) panic("assumption failed: hdr not split"); exthdrs.ip6e_dest2->m_next = m->m_next; m->m_next = exthdrs.ip6e_dest2; *mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_DSTOPTS; } /* * result: IPv6 hbh dest1 rthdr dest2 payload * m will point to IPv6 header. mprev will point to the * extension header prior to dest2 (rthdr in the above case). */ MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS); MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp, IPPROTO_DSTOPTS); MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp, IPPROTO_ROUTING); /* * If there is a routing header, discard the packet. 
*/ if (exthdrs.ip6e_rthdr) { error = EINVAL; goto bad; } /* Source address validation */ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) && (flags & IPV6_UNSPECSRC) == 0) { error = EOPNOTSUPP; IP6STAT_INC(ip6s_badscope); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { error = EOPNOTSUPP; IP6STAT_INC(ip6s_badscope); goto bad; } IP6STAT_INC(ip6s_localout); /* * Route packet. */ if (ro == NULL) { ro = &ip6route; bzero((caddr_t)ro, sizeof(*ro)); } else ro->ro_flags |= RT_LLE_CACHE; ro_pmtu = ro; if (opt && opt->ip6po_rthdr) ro = &opt->ip6po_route; dst = (struct sockaddr_in6 *)&ro->ro_dst; #ifdef FLOWTABLE if (ro->ro_rt == NULL) (void )flowtable_lookup(AF_INET6, m, (struct route *)ro); #endif fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m); again: /* * if specified, try to fill in the traffic class field. * do not override if a non-zero value is already set. * we check the diffserv field and the ecn field separately. */ if (opt && opt->ip6po_tclass >= 0) { int mask = 0; if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0) mask |= 0xfc; if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0) mask |= 0x03; if (mask != 0) ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20); } /* fill in or override the hop limit field, if necessary. */ if (opt && opt->ip6po_hlim != -1) ip6->ip6_hlim = opt->ip6po_hlim & 0xff; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (im6o != NULL) ip6->ip6_hlim = im6o->im6o_multicast_hlim; else ip6->ip6_hlim = V_ip6_defmcasthlim; } /* * Validate route against routing table additions; * a better/more specific route might have been added. * Make sure address family is set in route. */ if (inp) { ro->ro_dst.sin6_family = AF_INET6; RT_VALIDATE((struct route *)ro, &inp->inp_rt_cookie, fibnum); } if (ro->ro_rt && fwd_tag == NULL && (ro->ro_rt->rt_flags & RTF_UP) && ro->ro_dst.sin6_family == AF_INET6 && IN6_ARE_ADDR_EQUAL(&ro->ro_dst.sin6_addr, &ip6->ip6_dst)) { rt = ro->ro_rt; ifp = ro->ro_rt->rt_ifp; } else { if (fwd_tag == NULL) { bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; } error = in6_selectroute_fib(&dst_sa, opt, im6o, ro, &ifp, &rt, fibnum); if (error != 0) { if (ifp != NULL) in6_ifstat_inc(ifp, ifs6_out_discard); goto bad; } } if (rt == NULL) { /* * If in6_selectroute() does not return a route entry, * dst may not have been updated. */ *dst = dst_sa; /* XXX */ } /* * then rt (for unicast) and ifp must be non-NULL valid values. */ if ((flags & IPV6_FORWARDING) == 0) { /* XXX: the FORWARDING flag can be set for mrouting. */ in6_ifstat_inc(ifp, ifs6_out_request); } if (rt != NULL) { ia = (struct in6_ifaddr *)(rt->rt_ifa); counter_u64_add(rt->rt_pksent, 1); } /* * The outgoing interface must be in the zone of source and * destination addresses. */ origifp = ifp; src0 = ip6->ip6_src; if (in6_setscope(&src0, origifp, &zone)) goto badscope; bzero(&src_sa, sizeof(src_sa)); src_sa.sin6_family = AF_INET6; src_sa.sin6_len = sizeof(src_sa); src_sa.sin6_addr = ip6->ip6_src; if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id) goto badscope; dst0 = ip6->ip6_dst; if (in6_setscope(&dst0, origifp, &zone)) goto badscope; /* re-initialize to be sure */ bzero(&dst_sa, sizeof(dst_sa)); dst_sa.sin6_family = AF_INET6; dst_sa.sin6_len = sizeof(dst_sa); dst_sa.sin6_addr = ip6->ip6_dst; if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id) { goto badscope; } /* We should use ia_ifp to support the case of * sending packets to an address of our own. 
*/ if (ia != NULL && ia->ia_ifp) ifp = ia->ia_ifp; /* scope check is done. */ goto routefound; badscope: IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(origifp, ifs6_out_discard); if (error == 0) error = EHOSTUNREACH; /* XXX */ goto bad; routefound: if (rt && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { if (opt && opt->ip6po_nextroute.ro_rt) { /* * The nexthop is explicitly specified by the * application. We assume the next hop is an IPv6 * address. */ dst = (struct sockaddr_in6 *)opt->ip6po_nexthop; } else if ((rt->rt_flags & RTF_GATEWAY)) dst = (struct sockaddr_in6 *)rt->rt_gateway; } if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */ } else { m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST; in6_ifstat_inc(ifp, ifs6_out_mcast); /* * Confirm that the outgoing interface supports multicast. */ if (!(ifp->if_flags & IFF_MULTICAST)) { IP6STAT_INC(ip6s_noroute); in6_ifstat_inc(ifp, ifs6_out_discard); error = ENETUNREACH; goto bad; } if ((im6o == NULL && in6_mcast_loop) || (im6o && im6o->im6o_multicast_loop)) { /* * Loop back multicast datagram if not expressly * forbidden to do so, even if we have not joined * the address; protocols will filter it later, * thus deferring a hash lookup and lock acquisition * at the expense of an m_copym(). */ ip6_mloopback(ifp, m); } else { /* * If we are acting as a multicast router, perform * multicast forwarding as if the packet had just * arrived on the interface to which we are about * to send. The multicast forwarding function * recursively calls this function, using the * IPV6_FORWARDING flag to prevent infinite recursion. * * Multicasts that are looped back by ip6_mloopback(), * above, will be forwarded by the ip6_input() routine, * if necessary. */ if (V_ip6_mrouter && (flags & IPV6_FORWARDING) == 0) { /* * XXX: ip6_mforward expects that rcvif is NULL * when it is called from the originating path. * However, it may not always be the case. */ m->m_pkthdr.rcvif = NULL; if (ip6_mforward(ip6, ifp, m) != 0) { m_freem(m); goto done; } } } /* * Multicasts with a hoplimit of zero may be looped back, * above, but must not be transmitted on a network. * Also, multicasts addressed to the loopback interface * are not sent -- the above call to ip6_mloopback() will * loop back a copy if this host actually belongs to the * destination group on the loopback interface. */ if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) || IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) { m_freem(m); goto done; } } /* * Fill in the outgoing interface to tell the upper layer * to increment per-interface statistics. */ if (ifpp) *ifpp = ifp; /* Determine path MTU. */ if ((error = ip6_getpmtu(ro_pmtu, ro != ro_pmtu, ifp, &ip6->ip6_dst, &mtu, &alwaysfrag, fibnum, *nexthdrp)) != 0) goto bad; /* * The caller of this function may specify to use the minimum MTU * in some cases. * An advanced API option (IPV6_USE_MIN_MTU) can also override MTU * setting. The logic is a bit complicated; by default, unicast * packets will follow path MTU while multicast packets will be sent at * the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets * including unicast ones will be sent at the minimum MTU. Multicast * packets will always be sent at the minimum MTU unless * IP6PO_MINMTU_DISABLE is explicitly specified. * See RFC 3542 for more details.
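 *
 * In tabular form (MC = multicast destination):
 *
 *	IPV6_MINMTU flag set                      -> IPV6_MMTU
 *	opt->ip6po_minmtu == IP6PO_MINMTU_ALL     -> IPV6_MMTU
 *	MC and minmtu != IP6PO_MINMTU_DISABLE     -> IPV6_MMTU
 *	otherwise                                 -> path MTU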
*/ if (mtu > IPV6_MMTU) { if ((flags & IPV6_MINMTU)) mtu = IPV6_MMTU; else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL) mtu = IPV6_MMTU; else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && (opt == NULL || opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) { mtu = IPV6_MMTU; } } /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); /* * If the outgoing packet contains a hop-by-hop options header, * it must be examined and processed even by the source node. * (RFC 2460, section 4.) */ if (exthdrs.ip6e_hbh) { struct ip6_hbh *hbh = mtod(exthdrs.ip6e_hbh, struct ip6_hbh *); u_int32_t dummy; /* XXX unused */ u_int32_t plen = 0; /* XXX: ip6_process will check the value */ #ifdef DIAGNOSTIC if ((hbh->ip6h_len + 1) << 3 > exthdrs.ip6e_hbh->m_len) panic("ip6e_hbh is not contiguous"); #endif /* * XXX: if we have to send an ICMPv6 error to the sender, * we need the M_LOOP flag since icmp6_error() expects * the IPv6 and the hop-by-hop options header are * contiguous unless the flag is set. */ m->m_flags |= M_LOOP; m->m_pkthdr.rcvif = ifp; if (ip6_process_hopopts(m, (u_int8_t *)(hbh + 1), ((hbh->ip6h_len + 1) << 3) - sizeof(struct ip6_hbh), &dummy, &plen) < 0) { /* m was already freed at this point */ error = EINVAL;/* better error? */ goto done; } m->m_flags &= ~M_LOOP; /* XXX */ m->m_pkthdr.rcvif = NULL; } /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet6_pfil_hook)) goto passout; odst = ip6->ip6_dst; /* Run through list of hooks for output packets. */ error = pfil_run_hooks(&V_inet6_pfil_hook, &m, ifp, PFIL_OUT, inp); if (error != 0 || m == NULL) goto done; /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); needfiblookup = 0; /* See if destination IP address was changed by packet filter. */ if (!IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst)) { m->m_flags |= M_SKIP_FIREWALL; /* If destination is now ourself drop to ip6_input(). */ if (in6_localip(&ip6->ip6_dst)) { m->m_flags |= M_FASTFWD_OURS; if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto done; } else { RO_RTFREE(ro); needfiblookup = 1; /* Redo the routing table lookup. */ } } /* See if fib was changed by packet filter. */ if (fibnum != M_GETFIB(m)) { m->m_flags |= M_SKIP_FIREWALL; fibnum = M_GETFIB(m); RO_RTFREE(ro); needfiblookup = 1; } if (needfiblookup) goto again; /* See if local, if yes, send it to netisr. */ if (m->m_flags & M_FASTFWD_OURS) { if (m->m_pkthdr.rcvif == NULL) m->m_pkthdr.rcvif = V_loif; if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { m->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; m->m_pkthdr.csum_data = 0xffff; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; #endif error = netisr_queue(NETISR_IPV6, m); goto done; } /* Or forward to some other address? 
*/ if ((m->m_flags & M_IP6_NEXTHOP) && (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) { dst = (struct sockaddr_in6 *)&ro->ro_dst; bcopy((fwd_tag+1), &dst_sa, sizeof(struct sockaddr_in6)); m->m_flags |= M_SKIP_FIREWALL; m->m_flags &= ~M_IP6_NEXTHOP; m_tag_delete(m, fwd_tag); goto again; } passout: /* * Send the packet to the outgoing interface. * If necessary, do IPv6 fragmentation before sending. * * the logic here is rather complex: * 1: normal case (dontfrag == 0, alwaysfrag == 0) * 1-a: send as is if tlen <= path mtu * 1-b: fragment if tlen > path mtu * * 2: if user asks us not to fragment (dontfrag == 1) * 2-a: send as is if tlen <= interface mtu * 2-b: error if tlen > interface mtu * * 3: if we always need to attach fragment header (alwaysfrag == 1) * always fragment * * 4: if dontfrag == 1 && alwaysfrag == 1 * error, as we cannot handle this conflicting request */ sw_csum = m->m_pkthdr.csum_flags; if (!hdrsplit) { tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0; sw_csum &= ~ifp->if_hwassist; } else tso = 0; /* * If we added extension headers, we will not do TSO and calculate the * checksums ourselves for now. * XXX-BZ Need a framework to know when the NIC can handle it, even * with ext. hdrs. */ if (sw_csum & CSUM_DELAY_DATA_IPV6) { sw_csum &= ~CSUM_DELAY_DATA_IPV6; in6_delayed_cksum(m, plen, sizeof(struct ip6_hdr)); } #ifdef SCTP if (sw_csum & CSUM_SCTP_IPV6) { sw_csum &= ~CSUM_SCTP_IPV6; sctp_delayed_cksum(m, sizeof(struct ip6_hdr)); } #endif m->m_pkthdr.csum_flags &= ifp->if_hwassist; tlen = m->m_pkthdr.len; if ((opt && (opt->ip6po_flags & IP6PO_DONTFRAG)) || tso) dontfrag = 1; else dontfrag = 0; if (dontfrag && alwaysfrag) { /* case 4 */ /* conflicting request - can't transmit */ error = EMSGSIZE; goto bad; } if (dontfrag && tlen > IN6_LINKMTU(ifp) && !tso) { /* case 2-b */ /* * Even if the DONTFRAG option is specified, we cannot send the * packet when the data length is larger than the MTU of the * outgoing interface. * Notify the error by sending IPV6_PATHMTU ancillary data if * application wanted to know the MTU value. Also return an * error code (this is not described in the API spec). */ if (inp != NULL) ip6_notify_pmtu(inp, &dst_sa, (u_int32_t)mtu); error = EMSGSIZE; goto bad; } /* * transmit packet without fragmentation */ if (dontfrag || (!alwaysfrag && tlen <= mtu)) { /* case 1-a and 2-a */ struct in6_ifaddr *ia6; ip6 = mtod(m, struct ip6_hdr *); ia6 = in6_ifawithifp(ifp, &ip6->ip6_src); if (ia6) { /* Record statistics for this interface address. */ counter_u64_add(ia6->ia_ifa.ifa_opackets, 1); counter_u64_add(ia6->ia_ifa.ifa_obytes, m->m_pkthdr.len); ifa_free(&ia6->ia_ifa); } error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro); goto done; } /* * try to fragment the packet. case 1-b and 3 */ if (mtu < IPV6_MMTU) { /* path MTU cannot be less than IPV6_MMTU */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else if (ip6->ip6_plen == 0) { /* jumbo payload cannot be fragmented */ error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } else { u_char nextproto; /* * Too large for the destination or interface; * fragment if possible. * Must be able to put at least 8 bytes per fragment. */ hlen = unfragpartlen; if (mtu > IPV6_MAXPACKET) mtu = IPV6_MAXPACKET; len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7; if (len < 8) { error = EMSGSIZE; in6_ifstat_inc(ifp, ifs6_out_fragfail); goto bad; } /* * If the interface will not calculate checksums on * fragmented packets, then do it here. 
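 *
 * (Editor's aside, not in the original source: the four cases
 * enumerated at the "passout" label above reduce to the following
 * illustrative sketch, with case 4 first, then 2-a/2-b, then 3, and
 * finally 1-a/1-b:
 *
 *	enum frag_action { SEND_AS_IS, DO_FRAGMENT, FAIL_EMSGSIZE };
 *
 *	static enum frag_action
 *	frag_decide(int dontfrag, int alwaysfrag, u_long tlen,
 *	    u_long pmtu, u_long ifmtu)
 *	{
 *		if (dontfrag && alwaysfrag)
 *			return (FAIL_EMSGSIZE);
 *		if (dontfrag)
 *			return (tlen <= ifmtu ? SEND_AS_IS :
 *			    FAIL_EMSGSIZE);
 *		if (alwaysfrag)
 *			return (DO_FRAGMENT);
 *		return (tlen <= pmtu ? SEND_AS_IS : DO_FRAGMENT);
 *	}
 *
 * The TSO handling in the code is the one refinement the sketch
 * leaves out: a TSO-capable interface sends oversized packets as is
 * and lets the hardware segment them.)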
* XXX-BZ handle the hw offloading case. Need flags. */ if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { in6_delayed_cksum(m, plen, hlen); m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; } #ifdef SCTP if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) { sctp_delayed_cksum(m, hlen); m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6; } #endif /* * Change the next header field of the last header in the * unfragmentable part. */ if (exthdrs.ip6e_rthdr) { nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *); *mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_dest1) { nextproto = *mtod(exthdrs.ip6e_dest1, u_char *); *mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT; } else if (exthdrs.ip6e_hbh) { nextproto = *mtod(exthdrs.ip6e_hbh, u_char *); *mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT; } else { nextproto = ip6->ip6_nxt; ip6->ip6_nxt = IPPROTO_FRAGMENT; } /* * Loop through the length of the segment after the first fragment, * make a new header and copy the data of each part, linking them onto * a chain. */ m0 = m; id = htonl(ip6_randomid()); if ((error = ip6_fragment(ifp, m, hlen, nextproto, len, id))) goto sendorfree; in6_ifstat_inc(ifp, ifs6_out_fragok); } /* * Remove the leading garbage. */ sendorfree: m = m0->m_nextpkt; m0->m_nextpkt = 0; m_freem(m0); for (m0 = m; m; m = m0) { m0 = m->m_nextpkt; m->m_nextpkt = 0; if (error == 0) { /* Record statistics for this interface address. */ if (ia) { counter_u64_add(ia->ia_ifa.ifa_opackets, 1); counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len); } error = nd6_output_ifp(ifp, origifp, m, dst, (struct route *)ro); } else m_freem(m); } if (error == 0) IP6STAT_INC(ip6s_fragmented); done: /* * Release the route if using our private route, or if * (with flowtable) we don't have our own reference. */ - if (ro == &ip6route || ro->ro_flags & RT_NORTREF) + if (ro == &ip6route || + (ro != NULL && ro->ro_flags & RT_NORTREF)) RO_RTFREE(ro); return (error); freehdrs: m_freem(exthdrs.ip6e_hbh); /* m_freem will check if mbuf is 0 */ m_freem(exthdrs.ip6e_dest1); m_freem(exthdrs.ip6e_rthdr); m_freem(exthdrs.ip6e_dest2); /* FALLTHROUGH */ bad: if (m) m_freem(m); goto done; } static int ip6_copyexthdr(struct mbuf **mp, caddr_t hdr, int hlen) { struct mbuf *m; if (hlen > MCLBYTES) return (ENOBUFS); /* XXX */ if (hlen > MLEN) m = m_getcl(M_NOWAIT, MT_DATA, 0); else m = m_get(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); m->m_len = hlen; if (hdr) bcopy(hdr, mtod(m, caddr_t), hlen); *mp = m; return (0); } /* * Insert jumbo payload option. */ static int ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen) { struct mbuf *mopt; u_char *optbuf; u_int32_t v; #define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */ /* * If there is no hop-by-hop options header, allocate a new one. * If there is one but it doesn't have enough space to store the * jumbo payload option, allocate a cluster to store the whole options. * Otherwise, use it to store the options. */ if (exthdrs->ip6e_hbh == NULL) { mopt = m_get(M_NOWAIT, MT_DATA); if (mopt == NULL) return (ENOBUFS); mopt->m_len = JUMBOOPTLEN; optbuf = mtod(mopt, u_char *); optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */ exthdrs->ip6e_hbh = mopt; } else { struct ip6_hbh *hbh; mopt = exthdrs->ip6e_hbh; if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) { /* * XXX assumption: * - exthdrs->ip6e_hbh is not referenced from places * other than exthdrs. * - exthdrs->ip6e_hbh is not an mbuf chain.
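 *
 * (Editor's aside, not in the original source: JUMBOOPTLEN is 8
 * because RFC 2675 gives the jumbo payload option a 4n + 2 alignment
 * requirement; the 8-byte block finished below is laid out as
 *
 *	byte 0..1: hbh next-header and length fields for a freshly
 *	           allocated header, or a 2-byte pad when appending
 *	byte 2:    IP6OPT_JUMBO, the option type
 *	byte 3:    4, the option data length
 *	byte 4..7: the 32-bit payload length in network byte order
 *
 * so a packet whose payload exceeds 65535 bytes carries its true
 * length here while the fixed header's ip6_plen stays 0.)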
*/ int oldoptlen = mopt->m_len; struct mbuf *n; /* * XXX: give up if the whole (new) hbh header does * not fit even in an mbuf cluster. */ if (oldoptlen + JUMBOOPTLEN > MCLBYTES) return (ENOBUFS); /* * As a consequence, we must always prepare a cluster * at this point. */ n = m_getcl(M_NOWAIT, MT_DATA, 0); if (n == NULL) return (ENOBUFS); n->m_len = oldoptlen + JUMBOOPTLEN; bcopy(mtod(mopt, caddr_t), mtod(n, caddr_t), oldoptlen); optbuf = mtod(n, caddr_t) + oldoptlen; m_freem(mopt); mopt = exthdrs->ip6e_hbh = n; } else { optbuf = mtod(mopt, u_char *) + mopt->m_len; mopt->m_len += JUMBOOPTLEN; } optbuf[0] = IP6OPT_PADN; optbuf[1] = 1; /* * Adjust the header length according to the pad and * the jumbo payload option. */ hbh = mtod(mopt, struct ip6_hbh *); hbh->ip6h_len += (JUMBOOPTLEN >> 3); } /* fill in the option. */ optbuf[2] = IP6OPT_JUMBO; optbuf[3] = 4; v = (u_int32_t)htonl(plen + JUMBOOPTLEN); bcopy(&v, &optbuf[4], sizeof(u_int32_t)); /* finally, adjust the packet header length */ exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN; return (0); #undef JUMBOOPTLEN } /* * Insert fragment header and copy unfragmentable header portions. */ static int ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen, struct ip6_frag **frghdrp) { struct mbuf *n, *mlast; if (hlen > sizeof(struct ip6_hdr)) { n = m_copym(m0, sizeof(struct ip6_hdr), hlen - sizeof(struct ip6_hdr), M_NOWAIT); if (n == NULL) return (ENOBUFS); m->m_next = n; } else n = m; /* Search for the last mbuf of unfragmentable part. */ for (mlast = n; mlast->m_next; mlast = mlast->m_next) ; if (M_WRITABLE(mlast) && M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) { /* use the trailing space of the last mbuf for the fragment hdr */ *frghdrp = (struct ip6_frag *)(mtod(mlast, caddr_t) + mlast->m_len); mlast->m_len += sizeof(struct ip6_frag); m->m_pkthdr.len += sizeof(struct ip6_frag); } else { /* allocate a new mbuf for the fragment header */ struct mbuf *mfrg; mfrg = m_get(M_NOWAIT, MT_DATA); if (mfrg == NULL) return (ENOBUFS); mfrg->m_len = sizeof(struct ip6_frag); *frghdrp = mtod(mfrg, struct ip6_frag *); mlast->m_next = mfrg; } return (0); } /* * Calculates IPv6 path mtu for destination @dst. * Resulting MTU is stored in @mtup. * * Returns 0 on success. */ static int ip6_getpmtu_ctl(u_int fibnum, const struct in6_addr *dst, u_long *mtup) { struct nhop6_extended nh6; struct in6_addr kdst; uint32_t scopeid; struct ifnet *ifp; u_long mtu; int error; in6_splitscope(dst, &kdst, &scopeid); if (fib6_lookup_nh_ext(fibnum, &kdst, scopeid, NHR_REF, 0, &nh6) != 0) return (EHOSTUNREACH); ifp = nh6.nh_ifp; mtu = nh6.nh_mtu; error = ip6_calcmtu(ifp, dst, mtu, mtup, NULL, 0); fib6_free_nh_ext(fibnum, &nh6); return (error); } /* * Calculates IPv6 path MTU for @dst based on transmit @ifp, * and cached data in @ro_pmtu. * MTU from (successful) route lookup is saved (along with dst) * inside @ro_pmtu to avoid subsequent route lookups after packet * filter processing. * * Stores mtu and always-frag value into @mtup and @alwaysfragp. * Returns 0 on success. */ static int ip6_getpmtu(struct route_in6 *ro_pmtu, int do_lookup, struct ifnet *ifp, const struct in6_addr *dst, u_long *mtup, int *alwaysfragp, u_int fibnum, u_int proto) { struct nhop6_basic nh6; struct in6_addr kdst; uint32_t scopeid; struct sockaddr_in6 *sa6_dst; u_long mtu; mtu = 0; if (do_lookup) { /* * Here ro_pmtu has final destination address, while * ro might represent immediate destination. * Use ro_pmtu destination since mtu might differ. 
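 *
 * (Editor's aside, not in the original source: combined with
 * ip6_calcmtu() below, the MTU precedence works out to roughly the
 * following illustrative sketch, where hc_mtu is the TCP hostcache
 * value -- skipped for TCP itself, which tracks path MTU on its own
 * -- rt_mtu is the route MTU and link_mtu is IN6_LINKMTU(ifp):
 *
 *	static u_long
 *	effective_mtu(u_long hc_mtu, u_long rt_mtu, u_long link_mtu,
 *	    int *alwaysfrag)
 *	{
 *		u_long mtu;
 *
 *		if (rt_mtu == 0)
 *			return (link_mtu);
 *		mtu = hc_mtu != 0 ? ulmin(hc_mtu, rt_mtu) : rt_mtu;
 *		if (mtu < IPV6_MMTU) {
 *			*alwaysfrag = 1;
 *			mtu = IPV6_MMTU;
 *		}
 *		return (mtu);
 *	}
 *
 * The alwaysfrag case implements RFC 2460 section 5: honor a
 * too-big report below 1280 by keeping packets at 1280 but adding a
 * fragment header so translators can identify them.)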
*/ sa6_dst = (struct sockaddr_in6 *)&ro_pmtu->ro_dst; if (!IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst)) ro_pmtu->ro_mtu = 0; if (ro_pmtu->ro_mtu == 0) { bzero(sa6_dst, sizeof(*sa6_dst)); sa6_dst->sin6_family = AF_INET6; sa6_dst->sin6_len = sizeof(struct sockaddr_in6); sa6_dst->sin6_addr = *dst; in6_splitscope(dst, &kdst, &scopeid); if (fib6_lookup_nh_basic(fibnum, &kdst, scopeid, 0, 0, &nh6) == 0) ro_pmtu->ro_mtu = nh6.nh_mtu; } mtu = ro_pmtu->ro_mtu; } if (ro_pmtu->ro_rt) mtu = ro_pmtu->ro_rt->rt_mtu; return (ip6_calcmtu(ifp, dst, mtu, mtup, alwaysfragp, proto)); } /* * Calculate MTU based on transmit @ifp, route mtu @rt_mtu and * hostcache data for @dst. * Stores mtu and always-frag value into @mtup and @alwaysfragp. * * Returns 0 on success. */ static int ip6_calcmtu(struct ifnet *ifp, const struct in6_addr *dst, u_long rt_mtu, u_long *mtup, int *alwaysfragp, u_int proto) { u_long mtu = 0; int alwaysfrag = 0; int error = 0; if (rt_mtu > 0) { u_int32_t ifmtu; struct in_conninfo inc; bzero(&inc, sizeof(inc)); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; ifmtu = IN6_LINKMTU(ifp); /* TCP is known to react to pmtu changes so skip hc */ if (proto != IPPROTO_TCP) mtu = tcp_hc_getmtu(&inc); if (mtu) mtu = min(mtu, rt_mtu); else mtu = rt_mtu; if (mtu == 0) mtu = ifmtu; else if (mtu < IPV6_MMTU) { /* * RFC2460 section 5, last paragraph: * if we record an ICMPv6 too big message with * mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU * or smaller, with a fragment header attached. * (the fragment header is needed regardless of the * packet size, for translators to identify packets) */ alwaysfrag = 1; mtu = IPV6_MMTU; } } else if (ifp) { mtu = IN6_LINKMTU(ifp); } else error = EHOSTUNREACH; /* XXX */ *mtup = mtu; if (alwaysfragp) *alwaysfragp = alwaysfrag; return (error); } /* * IP6 socket option processing. */ int ip6_ctloutput(struct socket *so, struct sockopt *sopt) { int optdatalen, uproto; void *optdata; struct inpcb *in6p = sotoinpcb(so); int error, optval; int level, op, optname; int optlen; struct thread *td; #ifdef RSS uint32_t rss_bucket; int retval; #endif level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; td = sopt->sopt_td; error = 0; optval = 0; uproto = (int)so->so_proto->pr_protocol; if (level != IPPROTO_IPV6) { error = EINVAL; if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case SO_REUSEADDR: INP_WLOCK(in6p); if ((so->so_options & SO_REUSEADDR) != 0) in6p->inp_flags2 |= INP_REUSEADDR; else in6p->inp_flags2 &= ~INP_REUSEADDR; INP_WUNLOCK(in6p); error = 0; break; case SO_REUSEPORT: INP_WLOCK(in6p); if ((so->so_options & SO_REUSEPORT) != 0) in6p->inp_flags2 |= INP_REUSEPORT; else in6p->inp_flags2 &= ~INP_REUSEPORT; INP_WUNLOCK(in6p); error = 0; break; case SO_SETFIB: INP_WLOCK(in6p); in6p->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(in6p); error = 0; break; default: break; } } } else { /* level == IPPROTO_IPV6 */ switch (op) { case SOPT_SET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif { struct mbuf *m; error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; error = ip6_pcbopts(&in6p->in6p_outputopts, m, so, sopt); m_freem(m); /* XXX */ break; } /* * Use of some Hop-by-Hop options or some * Destination options might require special * privilege.
That is, normal applications * (without special privilege) might be forbidden * from setting certain options in outgoing packets, * and might never see certain options in received * packets. [RFC 2292 Section 6] * KAME specific note: * KAME prevents non-privileged users from sending or * receiving ANY hbh/dst options in order to avoid * overhead of parsing options in the kernel. */ case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) break; } /* FALLTHROUGH */ case IPV6_UNICAST_HOPS: case IPV6_HOPLIMIT: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_RECVTCLASS: case IPV6_RECVFLOWID: #ifdef RSS case IPV6_RECVRSSBUCKETID: #endif case IPV6_V6ONLY: case IPV6_AUTOFLOWLABEL: case IPV6_BINDANY: case IPV6_BINDMULTI: #ifdef RSS case IPV6_RSS_LISTEN_BUCKET: #endif if (optname == IPV6_BINDANY && td != NULL) { error = priv_check(td, PRIV_NETINET_BINDANY); if (error) break; } if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_UNICAST_HOPS: if (optval < -1 || optval >= 256) error = EINVAL; else { /* -1 = kernel default */ in6p->in6p_hops = optval; if ((in6p->inp_vflag & INP_IPV4) != 0) in6p->inp_ip_ttl = optval; } break; #define OPTSET(bit) \ do { \ INP_WLOCK(in6p); \ if (optval) \ in6p->inp_flags |= (bit); \ else \ in6p->inp_flags &= ~(bit); \ INP_WUNLOCK(in6p); \ } while (/*CONSTCOND*/ 0) #define OPTSET2292(bit) \ do { \ INP_WLOCK(in6p); \ in6p->inp_flags |= IN6P_RFC2292; \ if (optval) \ in6p->inp_flags |= (bit); \ else \ in6p->inp_flags &= ~(bit); \ INP_WUNLOCK(in6p); \ } while (/*CONSTCOND*/ 0) #define OPTBIT(bit) (in6p->inp_flags & (bit) ? 1 : 0) #define OPTSET2(bit, val) do { \ INP_WLOCK(in6p); \ if (val) \ in6p->inp_flags2 |= bit; \ else \ in6p->inp_flags2 &= ~bit; \ INP_WUNLOCK(in6p); \ } while (0) #define OPTBIT2(bit) (in6p->inp_flags2 & (bit) ? 1 : 0) case IPV6_RECVPKTINFO: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_PKTINFO); break; case IPV6_HOPLIMIT: { struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } optp = &in6p->in6p_outputopts; error = ip6_pcbopt(IPV6_HOPLIMIT, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); break; } case IPV6_RECVHOPLIMIT: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_HOPLIMIT); break; case IPV6_RECVHOPOPTS: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_RTHDRDSTOPTS); break; case IPV6_RECVRTHDR: /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: /* * We ignore this option for TCP * sockets. * (RFC3542 leaves this case * unspecified.) 
*/ if (uproto != IPPROTO_TCP) OPTSET(IN6P_MTU); break; case IPV6_RECVFLOWID: OPTSET2(INP_RECVFLOWID, optval); break; #ifdef RSS case IPV6_RECVRSSBUCKETID: OPTSET2(INP_RECVRSSBUCKETID, optval); break; #endif case IPV6_V6ONLY: /* * make setsockopt(IPV6_V6ONLY) * available only prior to bind(2). * see ipng mailing list, Jun 22 2001. */ if (in6p->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr)) { error = EINVAL; break; } OPTSET(IN6P_IPV6_V6ONLY); if (optval) in6p->inp_vflag &= ~INP_IPV4; else in6p->inp_vflag |= INP_IPV4; break; case IPV6_RECVTCLASS: /* cannot mix with RFC2292 XXX */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } OPTSET(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: OPTSET(IN6P_AUTOFLOWLABEL); break; case IPV6_BINDANY: OPTSET(INP_BINDANY); break; case IPV6_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; #ifdef RSS case IPV6_RSS_LISTEN_BUCKET: if ((optval >= 0) && (optval < rss_getnumbuckets())) { in6p->inp_rss_listen_bucket = optval; OPTSET2(INP_RSS_BUCKET_SET, 1); } else { error = EINVAL; } break; #endif } break; case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: if (optlen != sizeof(optval)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; { struct ip6_pktopts **optp; optp = &in6p->in6p_outputopts; error = ip6_pcbopt(optname, (u_char *)&optval, sizeof(optval), optp, (td != NULL) ? td->td_ucred : NULL, uproto); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: /* RFC 2292 */ if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; switch (optname) { case IPV6_2292PKTINFO: OPTSET2292(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: OPTSET2292(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: /* * Check super-user privilege. * See comments for IPV6_RECVHOPOPTS. */ if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) return (error); } OPTSET2292(IN6P_HOPOPTS); break; case IPV6_2292DSTOPTS: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SETHDROPTS); if (error) return (error); } OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */ break; case IPV6_2292RTHDR: OPTSET2292(IN6P_RTHDR); break; } break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: { /* new advanced API (RFC3542) */ u_char *optbuf; u_char optbuf_storage[MCLBYTES]; int optlen; struct ip6_pktopts **optp; /* cannot mix with RFC2292 */ if (OPTBIT(IN6P_RFC2292)) { error = EINVAL; break; } /* * We only ensure valsize is not too large * here. Further validation will be done * later. */ error = sooptcopyin(sopt, optbuf_storage, sizeof(optbuf_storage), 0); if (error) break; optlen = sopt->sopt_valsize; optbuf = optbuf_storage; optp = &in6p->in6p_outputopts; error = ip6_pcbopt(optname, optbuf, optlen, optp, (td != NULL) ? 
td->td_ucred : NULL, uproto); break; } #undef OPTSET case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: case IPV6_MSFILTER: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: error = ip6_setmoptions(in6p, sopt); break; case IPV6_PORTRANGE: error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) break; INP_WLOCK(in6p); switch (optval) { case IPV6_PORTRANGE_DEFAULT: in6p->inp_flags &= ~(INP_LOWPORT); in6p->inp_flags &= ~(INP_HIGHPORT); break; case IPV6_PORTRANGE_HIGH: in6p->inp_flags &= ~(INP_LOWPORT); in6p->inp_flags |= INP_HIGHPORT; break; case IPV6_PORTRANGE_LOW: in6p->inp_flags &= ~(INP_HIGHPORT); in6p->inp_flags |= INP_LOWPORT; break; default: error = EINVAL; break; } INP_WUNLOCK(in6p); break; #ifdef IPSEC case IPV6_IPSEC_POLICY: { caddr_t req; struct mbuf *m; if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */ break; if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */ break; req = mtod(m, caddr_t); error = ipsec_set_policy(in6p, optname, req, m->m_len, (sopt->sopt_td != NULL) ? sopt->sopt_td->td_ucred : NULL); m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; case SOPT_GET: switch (optname) { case IPV6_2292PKTOPTIONS: #ifdef IPV6_PKTOPTIONS case IPV6_PKTOPTIONS: #endif /* * RFC3542 (effectively) deprecated the * semantics of the 2292-style pktoptions. * Since it was not reliable in nature (i.e., * applications had to expect the lack of some * information after all), it would make sense * to simplify this part by always returning * empty data. */ sopt->sopt_valsize = 0; break; case IPV6_RECVHOPOPTS: case IPV6_RECVDSTOPTS: case IPV6_RECVRTHDRDSTOPTS: case IPV6_UNICAST_HOPS: case IPV6_RECVPKTINFO: case IPV6_RECVHOPLIMIT: case IPV6_RECVRTHDR: case IPV6_RECVPATHMTU: case IPV6_V6ONLY: case IPV6_PORTRANGE: case IPV6_RECVTCLASS: case IPV6_AUTOFLOWLABEL: case IPV6_BINDANY: case IPV6_FLOWID: case IPV6_FLOWTYPE: case IPV6_RECVFLOWID: #ifdef RSS case IPV6_RSSBUCKETID: case IPV6_RECVRSSBUCKETID: #endif case IPV6_BINDMULTI: switch (optname) { case IPV6_RECVHOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_RECVDSTOPTS: optval = OPTBIT(IN6P_DSTOPTS); break; case IPV6_RECVRTHDRDSTOPTS: optval = OPTBIT(IN6P_RTHDRDSTOPTS); break; case IPV6_UNICAST_HOPS: optval = in6p->in6p_hops; break; case IPV6_RECVPKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_RECVHOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_RECVRTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_RECVPATHMTU: optval = OPTBIT(IN6P_MTU); break; case IPV6_V6ONLY: optval = OPTBIT(IN6P_IPV6_V6ONLY); break; case IPV6_PORTRANGE: { int flags; flags = in6p->inp_flags; if (flags & INP_HIGHPORT) optval = IPV6_PORTRANGE_HIGH; else if (flags & INP_LOWPORT) optval = IPV6_PORTRANGE_LOW; else optval = 0; break; } case IPV6_RECVTCLASS: optval = OPTBIT(IN6P_TCLASS); break; case IPV6_AUTOFLOWLABEL: optval = OPTBIT(IN6P_AUTOFLOWLABEL); break; case IPV6_BINDANY: optval = OPTBIT(INP_BINDANY); break; case IPV6_FLOWID: optval = in6p->inp_flowid; break; case IPV6_FLOWTYPE: optval = in6p->inp_flowtype; break; case IPV6_RECVFLOWID: optval = OPTBIT2(INP_RECVFLOWID); break; #ifdef RSS case IPV6_RSSBUCKETID: retval = rss_hash2bucket(in6p->inp_flowid, in6p->inp_flowtype, &rss_bucket); if (retval == 0) optval = rss_bucket; else error = EINVAL; break; case IPV6_RECVRSSBUCKETID: optval = OPTBIT2(INP_RECVRSSBUCKETID); 
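/*
 * (Editor's aside, not in the original source: the OPTBIT()/OPTBIT2()
 * reads in this SOPT_GET path are the counterpart of the OPTSET() and
 * OPTSET2() macros defined in the SOPT_SET path above; every boolean
 * option is simply one bit in inp_flags or inp_flags2.  Expanded by
 * hand, OPTSET(IN6P_HOPLIMIT) performs
 *
 *	INP_WLOCK(in6p);
 *	if (optval)
 *		in6p->inp_flags |= IN6P_HOPLIMIT;
 *	else
 *		in6p->inp_flags &= ~IN6P_HOPLIMIT;
 *	INP_WUNLOCK(in6p);
 *
 * and OPTBIT(IN6P_HOPLIMIT) reads it back as
 * (in6p->inp_flags & IN6P_HOPLIMIT) ? 1 : 0.)
 */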
break; #endif case IPV6_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); break; } if (error) break; error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PATHMTU: { u_long pmtu = 0; struct ip6_mtuinfo mtuinfo; if (!(so->so_state & SS_ISCONNECTED)) return (ENOTCONN); /* * XXX: we do not consider the case of source * routing, or optional information to specify * the outgoing interface. */ error = ip6_getpmtu_ctl(so->so_fibnum, &in6p->in6p_faddr, &pmtu); if (error) break; if (pmtu > IPV6_MAXPACKET) pmtu = IPV6_MAXPACKET; bzero(&mtuinfo, sizeof(mtuinfo)); mtuinfo.ip6m_mtu = (u_int32_t)pmtu; optdata = (void *)&mtuinfo; optdatalen = sizeof(mtuinfo); error = sooptcopyout(sopt, optdata, optdatalen); break; } case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292HOPOPTS: case IPV6_2292RTHDR: case IPV6_2292DSTOPTS: switch (optname) { case IPV6_2292PKTINFO: optval = OPTBIT(IN6P_PKTINFO); break; case IPV6_2292HOPLIMIT: optval = OPTBIT(IN6P_HOPLIMIT); break; case IPV6_2292HOPOPTS: optval = OPTBIT(IN6P_HOPOPTS); break; case IPV6_2292RTHDR: optval = OPTBIT(IN6P_RTHDR); break; case IPV6_2292DSTOPTS: optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); break; } error = sooptcopyout(sopt, &optval, sizeof optval); break; case IPV6_PKTINFO: case IPV6_HOPOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_NEXTHOP: case IPV6_TCLASS: case IPV6_DONTFRAG: case IPV6_USE_MIN_MTU: case IPV6_PREFER_TEMPADDR: error = ip6_getpcbopt(in6p->in6p_outputopts, optname, sopt); break; case IPV6_MULTICAST_IF: case IPV6_MULTICAST_HOPS: case IPV6_MULTICAST_LOOP: case IPV6_MSFILTER: error = ip6_getmoptions(in6p, sopt); break; #ifdef IPSEC case IPV6_IPSEC_POLICY: { caddr_t req = NULL; size_t len = 0; struct mbuf *m = NULL; struct mbuf **mp = &m; size_t ovalsize = sopt->sopt_valsize; caddr_t oval = (caddr_t)sopt->sopt_val; error = soopt_getm(sopt, &m); /* XXX */ if (error != 0) break; error = soopt_mcopyin(sopt, m); /* XXX */ if (error != 0) break; sopt->sopt_valsize = ovalsize; sopt->sopt_val = oval; if (m) { req = mtod(m, caddr_t); len = m->m_len; } error = ipsec_get_policy(in6p, req, len, mp); if (error == 0) error = soopt_mcopyout(sopt, m); /* XXX */ if (error == 0 && m) m_freem(m); break; } #endif /* IPSEC */ default: error = ENOPROTOOPT; break; } break; } } return (error); } int ip6_raw_ctloutput(struct socket *so, struct sockopt *sopt) { int error = 0, optval, optlen; const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum); struct inpcb *in6p = sotoinpcb(so); int level, op, optname; level = sopt->sopt_level; op = sopt->sopt_dir; optname = sopt->sopt_name; optlen = sopt->sopt_valsize; if (level != IPPROTO_IPV6) { return (EINVAL); } switch (optname) { case IPV6_CHECKSUM: /* * For ICMPv6 sockets, no modification of the checksum offset * is allowed; permit "no change" values to help existing apps. * * RFC3542 says: "An attempt to set IPV6_CHECKSUM * for an ICMPv6 socket will fail." * The current behavior does not meet RFC3542.
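 *
 * (Editor's aside, not in the original source: the option lets raw
 * sockets for protocols other than ICMPv6 delegate checksumming to
 * the kernel.  A hedged userland sketch for a hypothetical protocol
 * whose 16-bit checksum sits at byte offset 2 -- 253 is a protocol
 * number reserved for experimentation:
 *
 *	int s = socket(AF_INET6, SOCK_RAW, 253);
 *	int off = 2;
 *
 *	if (setsockopt(s, IPPROTO_IPV6, IPV6_CHECKSUM,
 *	    &off, sizeof(off)) == -1)
 *		err(1, "IPV6_CHECKSUM");
 *
 * The offset must be even, matching the (optval % 2) check below.)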
*/ switch (op) { case SOPT_SET: if (optlen != sizeof(int)) { error = EINVAL; break; } error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); if (error) break; if ((optval % 2) != 0) { /* the API assumes even offset values */ error = EINVAL; } else if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) { if (optval != icmp6off) error = EINVAL; } else in6p->in6p_cksum = optval; break; case SOPT_GET: if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) optval = icmp6off; else optval = in6p->in6p_cksum; error = sooptcopyout(sopt, &optval, sizeof(optval)); break; default: error = EINVAL; break; } break; default: error = ENOPROTOOPT; break; } return (error); } /* * Set up IP6 options in pcb for insertion in output packets or * specifying behavior of outgoing packets. */ static int ip6_pcbopts(struct ip6_pktopts **pktopt, struct mbuf *m, struct socket *so, struct sockopt *sopt) { struct ip6_pktopts *opt = *pktopt; int error = 0; struct thread *td = sopt->sopt_td; /* turn off any old options. */ if (opt) { #ifdef DIAGNOSTIC if (opt->ip6po_pktinfo || opt->ip6po_nexthop || opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 || opt->ip6po_rhinfo.ip6po_rhi_rthdr) printf("ip6_pcbopts: all specified options are cleared.\n"); #endif ip6_clearpktopts(opt, -1); } else opt = malloc(sizeof(*opt), M_IP6OPT, M_WAITOK); *pktopt = NULL; if (!m || m->m_len == 0) { /* * Only turning off any previous options, regardless of * whether the opt is just created or given. */ free(opt, M_IP6OPT); return (0); } /* set options specified by user. */ if ((error = ip6_setpktopts(m, opt, NULL, (td != NULL) ? td->td_ucred : NULL, so->so_proto->pr_protocol)) != 0) { ip6_clearpktopts(opt, -1); /* XXX: discard all options */ free(opt, M_IP6OPT); return (error); } *pktopt = opt; return (0); } /* * initialize ip6_pktopts. beware that there are non-zero default values in * the struct. */ void ip6_initpktopts(struct ip6_pktopts *opt) { bzero(opt, sizeof(*opt)); opt->ip6po_hlim = -1; /* -1 means default hop limit */ opt->ip6po_tclass = -1; /* -1 means default traffic class */ opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY; opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM; } static int ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt, struct ucred *cred, int uproto) { struct ip6_pktopts *opt; if (*pktopt == NULL) { *pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT, M_WAITOK); ip6_initpktopts(*pktopt); } opt = *pktopt; return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto)); } static int ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt) { void *optdata = NULL; int optdatalen = 0; struct ip6_ext *ip6e; int error = 0; struct in6_pktinfo null_pktinfo; int deftclass = 0, on; int defminmtu = IP6PO_MINMTU_MCASTONLY; int defpreftemp = IP6PO_TEMPADDR_SYSTEM; switch (optname) { case IPV6_PKTINFO: optdata = (void *)&null_pktinfo; if (pktopt && pktopt->ip6po_pktinfo) { bcopy(pktopt->ip6po_pktinfo, &null_pktinfo, sizeof(null_pktinfo)); in6_clearscope(&null_pktinfo.ipi6_addr); } else { /* XXX: we don't have to do this every time... 
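 *
 * (Editor's aside, not in the original source: per RFC 3542 an
 * application can also clear a single sticky option by setting it
 * again with zero-length data; a hedged userland sketch:
 *
 *	setsockopt(s, IPPROTO_IPV6, IPV6_HOPOPTS, NULL, 0);
 *
 * arrives at ip6_pcbopt() above with len 0 and is turned into
 * ip6_clearpktopts(opt, IPV6_HOPOPTS) by ip6_setpktopt() below.)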
*/ bzero(&null_pktinfo, sizeof(null_pktinfo)); } optdatalen = sizeof(struct in6_pktinfo); break; case IPV6_TCLASS: if (pktopt && pktopt->ip6po_tclass >= 0) optdata = (void *)&pktopt->ip6po_tclass; else optdata = (void *)&deftclass; optdatalen = sizeof(int); break; case IPV6_HOPOPTS: if (pktopt && pktopt->ip6po_hbh) { optdata = (void *)pktopt->ip6po_hbh; ip6e = (struct ip6_ext *)pktopt->ip6po_hbh; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_RTHDR: if (pktopt && pktopt->ip6po_rthdr) { optdata = (void *)pktopt->ip6po_rthdr; ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_RTHDRDSTOPTS: if (pktopt && pktopt->ip6po_dest1) { optdata = (void *)pktopt->ip6po_dest1; ip6e = (struct ip6_ext *)pktopt->ip6po_dest1; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_DSTOPTS: if (pktopt && pktopt->ip6po_dest2) { optdata = (void *)pktopt->ip6po_dest2; ip6e = (struct ip6_ext *)pktopt->ip6po_dest2; optdatalen = (ip6e->ip6e_len + 1) << 3; } break; case IPV6_NEXTHOP: if (pktopt && pktopt->ip6po_nexthop) { optdata = (void *)pktopt->ip6po_nexthop; optdatalen = pktopt->ip6po_nexthop->sa_len; } break; case IPV6_USE_MIN_MTU: if (pktopt) optdata = (void *)&pktopt->ip6po_minmtu; else optdata = (void *)&defminmtu; optdatalen = sizeof(int); break; case IPV6_DONTFRAG: if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG)) on = 1; else on = 0; optdata = (void *)&on; optdatalen = sizeof(on); break; case IPV6_PREFER_TEMPADDR: if (pktopt) optdata = (void *)&pktopt->ip6po_prefer_tempaddr; else optdata = (void *)&defpreftemp; optdatalen = sizeof(int); break; default: /* should not happen */ #ifdef DIAGNOSTIC panic("ip6_getpcbopt: unexpected option\n"); #endif return (ENOPROTOOPT); } error = sooptcopyout(sopt, optdata, optdatalen); return (error); } void ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname) { if (pktopt == NULL) return; if (optname == -1 || optname == IPV6_PKTINFO) { if (pktopt->ip6po_pktinfo) free(pktopt->ip6po_pktinfo, M_IP6OPT); pktopt->ip6po_pktinfo = NULL; } if (optname == -1 || optname == IPV6_HOPLIMIT) pktopt->ip6po_hlim = -1; if (optname == -1 || optname == IPV6_TCLASS) pktopt->ip6po_tclass = -1; if (optname == -1 || optname == IPV6_NEXTHOP) { if (pktopt->ip6po_nextroute.ro_rt) { RTFREE(pktopt->ip6po_nextroute.ro_rt); pktopt->ip6po_nextroute.ro_rt = NULL; } if (pktopt->ip6po_nexthop) free(pktopt->ip6po_nexthop, M_IP6OPT); pktopt->ip6po_nexthop = NULL; } if (optname == -1 || optname == IPV6_HOPOPTS) { if (pktopt->ip6po_hbh) free(pktopt->ip6po_hbh, M_IP6OPT); pktopt->ip6po_hbh = NULL; } if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) { if (pktopt->ip6po_dest1) free(pktopt->ip6po_dest1, M_IP6OPT); pktopt->ip6po_dest1 = NULL; } if (optname == -1 || optname == IPV6_RTHDR) { if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr) free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT); pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL; if (pktopt->ip6po_route.ro_rt) { RTFREE(pktopt->ip6po_route.ro_rt); pktopt->ip6po_route.ro_rt = NULL; } } if (optname == -1 || optname == IPV6_DSTOPTS) { if (pktopt->ip6po_dest2) free(pktopt->ip6po_dest2, M_IP6OPT); pktopt->ip6po_dest2 = NULL; } } #define PKTOPT_EXTHDRCPY(type) \ do {\ if (src->type) {\ int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\ dst->type = malloc(hlen, M_IP6OPT, canwait);\ if (dst->type == NULL && canwait == M_NOWAIT)\ goto bad;\ bcopy(src->type, dst->type, hlen);\ }\ } while (/*CONSTCOND*/ 0) static int copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int 
canwait) { if (dst == NULL || src == NULL) { printf("copypktopts: invalid argument\n"); return (EINVAL); } dst->ip6po_hlim = src->ip6po_hlim; dst->ip6po_tclass = src->ip6po_tclass; dst->ip6po_flags = src->ip6po_flags; dst->ip6po_minmtu = src->ip6po_minmtu; dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr; if (src->ip6po_pktinfo) { dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo), M_IP6OPT, canwait); if (dst->ip6po_pktinfo == NULL) goto bad; *dst->ip6po_pktinfo = *src->ip6po_pktinfo; } if (src->ip6po_nexthop) { dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len, M_IP6OPT, canwait); if (dst->ip6po_nexthop == NULL) goto bad; bcopy(src->ip6po_nexthop, dst->ip6po_nexthop, src->ip6po_nexthop->sa_len); } PKTOPT_EXTHDRCPY(ip6po_hbh); PKTOPT_EXTHDRCPY(ip6po_dest1); PKTOPT_EXTHDRCPY(ip6po_dest2); PKTOPT_EXTHDRCPY(ip6po_rthdr); /* do not copy the cached route */ return (0); bad: ip6_clearpktopts(dst, -1); return (ENOBUFS); } #undef PKTOPT_EXTHDRCPY struct ip6_pktopts * ip6_copypktopts(struct ip6_pktopts *src, int canwait) { int error; struct ip6_pktopts *dst; dst = malloc(sizeof(*dst), M_IP6OPT, canwait); if (dst == NULL) return (NULL); ip6_initpktopts(dst); if ((error = copypktopts(dst, src, canwait)) != 0) { free(dst, M_IP6OPT); return (NULL); } return (dst); } void ip6_freepcbopts(struct ip6_pktopts *pktopt) { if (pktopt == NULL) return; ip6_clearpktopts(pktopt, -1); free(pktopt, M_IP6OPT); } /* * Set IPv6 outgoing packet options based on advanced API. */ int ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt, struct ip6_pktopts *stickyopt, struct ucred *cred, int uproto) { struct cmsghdr *cm = NULL; if (control == NULL || opt == NULL) return (EINVAL); ip6_initpktopts(opt); if (stickyopt) { int error; /* * If stickyopt is provided, make a local copy of the options * for this particular packet, then override them by ancillary * objects. * XXX: copypktopts() does not copy the cached route to a next * hop (if any). This is not very good in terms of efficiency, * but we can allow this since this option should be rarely * used. */ if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0) return (error); } /* * XXX: Currently, we assume all the optional information is stored * in a single mbuf. */ if (control->m_next) return (EINVAL); for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), control->m_len -= CMSG_ALIGN(cm->cmsg_len)) { int error; if (control->m_len < CMSG_LEN(0)) return (EINVAL); cm = mtod(control, struct cmsghdr *); if (cm->cmsg_len == 0 || cm->cmsg_len > control->m_len) return (EINVAL); if (cm->cmsg_level != IPPROTO_IPV6) continue; error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm), cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto); if (error) return (error); } return (0); } /* * Set a particular packet option, as a sticky option or an ancillary data * item. "len" can be 0 only when it's a sticky option. * We have 4 cases of combination of "sticky" and "cmsg": * "sticky=0, cmsg=0": impossible * "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data * "sticky=1, cmsg=0": RFC3542 socket option * "sticky=1, cmsg=1": RFC2292 socket option */ static int ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt, struct ucred *cred, int sticky, int cmsg, int uproto) { int minmtupolicy, preftemp; int error; if (!sticky && !cmsg) { #ifdef DIAGNOSTIC printf("ip6_setpktopt: impossible case\n"); #endif return (EINVAL); } /* * IPV6_2292xxx is for backward compatibility to RFC2292, and should * not be specified in the context of RFC3542.
Conversely, * RFC3542 types should not be specified in the context of RFC2292. */ if (!cmsg) { switch (optname) { case IPV6_2292PKTINFO: case IPV6_2292HOPLIMIT: case IPV6_2292NEXTHOP: case IPV6_2292HOPOPTS: case IPV6_2292DSTOPTS: case IPV6_2292RTHDR: case IPV6_2292PKTOPTIONS: return (ENOPROTOOPT); } } if (sticky && cmsg) { switch (optname) { case IPV6_PKTINFO: case IPV6_HOPLIMIT: case IPV6_NEXTHOP: case IPV6_HOPOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_USE_MIN_MTU: case IPV6_DONTFRAG: case IPV6_TCLASS: case IPV6_PREFER_TEMPADDR: /* XXX: not an RFC3542 option */ return (ENOPROTOOPT); } } switch (optname) { case IPV6_2292PKTINFO: case IPV6_PKTINFO: { struct ifnet *ifp = NULL; struct in6_pktinfo *pktinfo; if (len != sizeof(struct in6_pktinfo)) return (EINVAL); pktinfo = (struct in6_pktinfo *)buf; /* * An application can clear any sticky IPV6_PKTINFO option by * doing a "regular" setsockopt with ipi6_addr being * in6addr_any and ipi6_ifindex being zero. * [RFC 3542, Section 6] */ if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo && pktinfo->ipi6_ifindex == 0 && IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { ip6_clearpktopts(opt, optname); break; } if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO && sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { return (EINVAL); } if (IN6_IS_ADDR_MULTICAST(&pktinfo->ipi6_addr)) return (EINVAL); /* validate the interface index if specified. */ if (pktinfo->ipi6_ifindex > V_if_index) return (ENXIO); if (pktinfo->ipi6_ifindex) { ifp = ifnet_byindex(pktinfo->ipi6_ifindex); if (ifp == NULL) return (ENXIO); } if (ifp != NULL && (ifp->if_afdata[AF_INET6] == NULL || (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) != 0)) return (ENETDOWN); if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) { struct in6_ifaddr *ia; in6_setscope(&pktinfo->ipi6_addr, ifp, NULL); ia = in6ifa_ifpwithaddr(ifp, &pktinfo->ipi6_addr); if (ia == NULL) return (EADDRNOTAVAIL); ifa_free(&ia->ia_ifa); } /* * We store the address anyway, and let in6_selectsrc() * validate the specified address. This is because ipi6_addr * may not have enough information about its scope zone, and * we may need additional information (such as outgoing * interface or the scope zone of a destination address) to * disambiguate the scope. * XXX: the delay of the validation may confuse the * application when it is used as a sticky option. */ if (opt->ip6po_pktinfo == NULL) { opt->ip6po_pktinfo = malloc(sizeof(*pktinfo), M_IP6OPT, M_NOWAIT); if (opt->ip6po_pktinfo == NULL) return (ENOBUFS); } bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo)); break; } case IPV6_2292HOPLIMIT: case IPV6_HOPLIMIT: { int *hlimp; /* * RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT * to simplify the ordering among hoplimit options. 
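 *
 * (Editor's aside, not in the original source: a per-packet hop
 * limit therefore travels as RFC 3542 ancillary data; a hedged
 * userland sketch of the sender side, with msg_name and msg_iov
 * setup omitted:
 *
 *	struct msghdr msg;
 *	struct cmsghdr *cm;
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *	int hlim = 64;
 *
 *	memset(&msg, 0, sizeof(msg));
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof(cbuf);
 *	cm = CMSG_FIRSTHDR(&msg);
 *	cm->cmsg_level = IPPROTO_IPV6;
 *	cm->cmsg_type = IPV6_HOPLIMIT;
 *	cm->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cm), &hlim, sizeof(hlim));
 *
 * sendmsg() with this control buffer reaches ip6_setpktopts() above
 * with cmsg = 1 and sticky = 0, the one combination this case still
 * accepts for IPV6_HOPLIMIT.)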
*/ if (optname == IPV6_HOPLIMIT && sticky) return (ENOPROTOOPT); if (len != sizeof(int)) return (EINVAL); hlimp = (int *)buf; if (*hlimp < -1 || *hlimp > 255) return (EINVAL); opt->ip6po_hlim = *hlimp; break; } case IPV6_TCLASS: { int tclass; if (len != sizeof(int)) return (EINVAL); tclass = *(int *)buf; if (tclass < -1 || tclass > 255) return (EINVAL); opt->ip6po_tclass = tclass; break; } case IPV6_2292NEXTHOP: case IPV6_NEXTHOP: if (cred != NULL) { error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS, 0); if (error) return (error); } if (len == 0) { /* just remove the option */ ip6_clearpktopts(opt, IPV6_NEXTHOP); break; } /* check if cmsg_len is large enough for sa_len */ if (len < sizeof(struct sockaddr) || len < *buf) return (EINVAL); switch (((struct sockaddr *)buf)->sa_family) { case AF_INET6: { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf; int error; if (sa6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) { return (EINVAL); } if ((error = sa6_embedscope(sa6, V_ip6_use_defzone)) != 0) { return (error); } break; } case AF_LINK: /* should eventually be supported */ default: return (EAFNOSUPPORT); } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_NEXTHOP); opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT); if (opt->ip6po_nexthop == NULL) return (ENOBUFS); bcopy(buf, opt->ip6po_nexthop, *buf); break; case IPV6_2292HOPOPTS: case IPV6_HOPOPTS: { struct ip6_hbh *hbh; int hbhlen; /* * XXX: We don't allow a non-privileged user to set ANY HbH * options, since per-option restriction has too much * overhead. */ if (cred != NULL) { error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS, 0); if (error) return (error); } if (len == 0) { ip6_clearpktopts(opt, IPV6_HOPOPTS); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_hbh)) return (EINVAL); hbh = (struct ip6_hbh *)buf; hbhlen = (hbh->ip6h_len + 1) << 3; if (len != hbhlen) return (EINVAL); /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, IPV6_HOPOPTS); opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_hbh == NULL) return (ENOBUFS); bcopy(hbh, opt->ip6po_hbh, hbhlen); break; } case IPV6_2292DSTOPTS: case IPV6_DSTOPTS: case IPV6_RTHDRDSTOPTS: { struct ip6_dest *dest, **newdest = NULL; int destlen; if (cred != NULL) { /* XXX: see the comment for IPV6_HOPOPTS */ error = priv_check_cred(cred, PRIV_NETINET_SETHDROPTS, 0); if (error) return (error); } if (len == 0) { ip6_clearpktopts(opt, optname); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_dest)) return (EINVAL); dest = (struct ip6_dest *)buf; destlen = (dest->ip6d_len + 1) << 3; if (len != destlen) return (EINVAL); /* * Determine the position that the destination options header * should be inserted; before or after the routing header. */ switch (optname) { case IPV6_2292DSTOPTS: /* * The old advanced API is ambiguous on this point. * Our approach is to determine the position * according to the existence of a routing header. * Note, however, that this depends on the order of the * extension headers in the ancillary data; the 1st * part of the destination options header must appear * before the routing header in the ancillary data, * too. * RFC3542 solved the ambiguity by introducing * separate ancillary data or option types.
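 *
 * (Editor's aside, not in the original source: the length checks in
 * the surrounding cases all rely on the extension header encoding
 * rule that a header's length byte counts 8-octet units beyond the
 * first, i.e.
 *
 *	total_bytes = (ext_len_field + 1) << 3;
 *
 * so a type 0 routing header with ip6r_len 2 is 24 bytes and carries
 * a single 16-byte address, which is why the IPV6_RTHDR case below
 * requires ip6r_len / 2 == ip6r_segleft for a freshly set option.)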
*/ if (opt->ip6po_rthdr == NULL) newdest = &opt->ip6po_dest1; else newdest = &opt->ip6po_dest2; break; case IPV6_RTHDRDSTOPTS: newdest = &opt->ip6po_dest1; break; case IPV6_DSTOPTS: newdest = &opt->ip6po_dest2; break; } /* turn off the previous option, then set the new option. */ ip6_clearpktopts(opt, optname); *newdest = malloc(destlen, M_IP6OPT, M_NOWAIT); if (*newdest == NULL) return (ENOBUFS); bcopy(dest, *newdest, destlen); break; } case IPV6_2292RTHDR: case IPV6_RTHDR: { struct ip6_rthdr *rth; int rthlen; if (len == 0) { ip6_clearpktopts(opt, IPV6_RTHDR); break; /* just remove the option */ } /* message length validation */ if (len < sizeof(struct ip6_rthdr)) return (EINVAL); rth = (struct ip6_rthdr *)buf; rthlen = (rth->ip6r_len + 1) << 3; if (len != rthlen) return (EINVAL); switch (rth->ip6r_type) { case IPV6_RTHDR_TYPE_0: if (rth->ip6r_len == 0) /* must contain one addr */ return (EINVAL); if (rth->ip6r_len % 2) /* length must be even */ return (EINVAL); if (rth->ip6r_len / 2 != rth->ip6r_segleft) return (EINVAL); break; default: return (EINVAL); /* not supported */ } /* turn off the previous option */ ip6_clearpktopts(opt, IPV6_RTHDR); opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT); if (opt->ip6po_rthdr == NULL) return (ENOBUFS); bcopy(rth, opt->ip6po_rthdr, rthlen); break; } case IPV6_USE_MIN_MTU: if (len != sizeof(int)) return (EINVAL); minmtupolicy = *(int *)buf; if (minmtupolicy != IP6PO_MINMTU_MCASTONLY && minmtupolicy != IP6PO_MINMTU_DISABLE && minmtupolicy != IP6PO_MINMTU_ALL) { return (EINVAL); } opt->ip6po_minmtu = minmtupolicy; break; case IPV6_DONTFRAG: if (len != sizeof(int)) return (EINVAL); if (uproto == IPPROTO_TCP || *(int *)buf == 0) { /* * we ignore this option for TCP sockets. * (RFC3542 leaves this case unspecified.) */ opt->ip6po_flags &= ~IP6PO_DONTFRAG; } else opt->ip6po_flags |= IP6PO_DONTFRAG; break; case IPV6_PREFER_TEMPADDR: if (len != sizeof(int)) return (EINVAL); preftemp = *(int *)buf; if (preftemp != IP6PO_TEMPADDR_SYSTEM && preftemp != IP6PO_TEMPADDR_NOTPREFER && preftemp != IP6PO_TEMPADDR_PREFER) { return (EINVAL); } opt->ip6po_prefer_tempaddr = preftemp; break; default: return (ENOPROTOOPT); } /* end of switch */ return (0); } /* * Routine called from ip6_output() to loop back a copy of an IP6 multicast * packet to the input queue of a specified interface. Note that this * calls the output routine of the loopback "driver", but with an interface * pointer that might NOT be &loif -- easier than replicating that code here. */ void ip6_mloopback(struct ifnet *ifp, struct mbuf *m) { struct mbuf *copym; struct ip6_hdr *ip6; copym = m_copy(m, 0, M_COPYALL); if (copym == NULL) return; /* * Make sure to deep-copy IPv6 header portion in case the data * is in an mbuf cluster, so that we can safely override the IPv6 * header portion later. */ if (!M_WRITABLE(copym) || copym->m_len < sizeof(struct ip6_hdr)) { copym = m_pullup(copym, sizeof(struct ip6_hdr)); if (copym == NULL) return; } ip6 = mtod(copym, struct ip6_hdr *); /* * clear embedded scope identifiers if necessary. * in6_clearscope will touch the addresses only when necessary. */ in6_clearscope(&ip6->ip6_src); in6_clearscope(&ip6->ip6_dst); if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { copym->m_pkthdr.csum_flags |= CSUM_DATA_VALID_IPV6 | CSUM_PSEUDO_HDR; copym->m_pkthdr.csum_data = 0xffff; } if_simloop(ifp, copym, AF_INET6, 0); } /* * Chop IPv6 header off from the payload. 
*/ static int ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs) { struct mbuf *mh; struct ip6_hdr *ip6; ip6 = mtod(m, struct ip6_hdr *); if (m->m_len > sizeof(*ip6)) { mh = m_gethdr(M_NOWAIT, MT_DATA); if (mh == NULL) { m_freem(m); return ENOBUFS; } m_move_pkthdr(mh, m); M_ALIGN(mh, sizeof(*ip6)); m->m_len -= sizeof(*ip6); m->m_data += sizeof(*ip6); mh->m_next = m; m = mh; m->m_len = sizeof(*ip6); bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(*ip6)); } exthdrs->ip6e_ip6 = m; return 0; } /* * Compute IPv6 extension header length. */ int ip6_optlen(struct inpcb *in6p) { int len; if (!in6p->in6p_outputopts) return 0; len = 0; #define elen(x) \ (((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0) len += elen(in6p->in6p_outputopts->ip6po_hbh); if (in6p->in6p_outputopts->ip6po_rthdr) /* dest1 is valid with rthdr only */ len += elen(in6p->in6p_outputopts->ip6po_dest1); len += elen(in6p->in6p_outputopts->ip6po_rthdr); len += elen(in6p->in6p_outputopts->ip6po_dest2); return len; #undef elen } Index: user/alc/PQ_LAUNDRY/sys/riscv/conf/GENERIC =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/conf/GENERIC (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/conf/GENERIC (revision 303667) @@ -1,112 +1,113 @@ # # GENERIC -- Generic kernel configuration file for FreeBSD/RISC-V # # For more information on this file, please read the config(5) manual page, # and/or the handbook section on Kernel Configuration Files: # # http://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html # # The handbook is also available locally in /usr/share/doc/handbook # if you've installed the doc distribution, otherwise always see the # FreeBSD World Wide Web server (http://www.FreeBSD.org/) for the # latest information. # # An exhaustive list of options and more detailed explanations of the # device lines is also present in the ../../conf/NOTES and NOTES files. # If you are in doubt as to the purpose or necessity of a line, check first # in NOTES. # # $FreeBSD$ cpu RISCV ident GENERIC makeoptions DEBUG=-g # Build kernel with gdb(1) debug symbols # makeoptions WITH_CTF=1 # Run ctfconvert(1) for DTrace support # FIXME: linker error. "--relax and -r may not be used together" makeoptions WITHOUT_MODULES="usb otusfw mwlfw ispfw mwlfw ralfw rtwnfw urtwnfw" +# makeoptions NO_MODULES options SCHED_ULE # ULE scheduler options PREEMPTION # Enable kernel thread preemption options INET # InterNETworking options INET6 # IPv6 communications protocols options IPSEC # IP (v4/v6) security options TCP_OFFLOAD # TCP offload options SCTP # Stream Control Transmission Protocol options FFS # Berkeley Fast Filesystem options SOFTUPDATES # Enable FFS soft updates support options UFS_ACL # Support for access control lists options UFS_DIRHASH # Improve performance on big directories options UFS_GJOURNAL # Enable gjournal-based UFS journaling options QUOTA # Enable disk quotas for UFS -options MD_ROOT # MD is a potential root device options NFSCL # Network Filesystem Client options NFSD # Network Filesystem Server options NFSLOCKD # Network Lock Manager options NFS_ROOT # NFS usable as /, requires NFSCL options MSDOSFS # MSDOS Filesystem options CD9660 # ISO 9660 Filesystem options PROCFS # Process filesystem (requires PSEUDOFS) options PSEUDOFS # Pseudo-filesystem framework options GEOM_PART_GPT # GUID Partition Tables. # options GEOM_RAID # Soft RAID functionality. 
options GEOM_LABEL # Provides labelization options SCSI_DELAY=5000 # Delay (in ms) before probing SCSI options KTRACE # ktrace(1) support # options STACK # stack(9) support options SYSVSHM # SYSV-style shared memory options SYSVMSG # SYSV-style message queues options SYSVSEM # SYSV-style semaphores options _KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions options PRINTF_BUFR_SIZE=128 # Prevent printf output being interspersed. options KBD_INSTALL_CDEV # install a CDEV entry in /dev # options HWPMC_HOOKS # Necessary kernel hooks for hwpmc(4) options AUDIT # Security event auditing options CAPABILITY_MODE # Capsicum capability mode options CAPABILITIES # Capsicum capabilities options MAC # TrustedBSD MAC Framework options KDTRACE_FRAME # Ensure frames are compiled in options KDTRACE_HOOKS # Kernel DTrace hooks # options VFP # Floating-point support options RACCT # Resource accounting framework options RACCT_DEFAULT_TO_DISABLED # Set kern.racct.enable=0 by default options RCTL # Resource limits options SMP # Uncomment for memory disk # options MD_ROOT # options MD_ROOT_SIZE=8192 # 8MB ram disk # makeoptions MFS_IMAGE=/path/to/img # options ROOTDEVNAME=\"ufs:/dev/md0\" # Debugging support. Always need this: options KDB # Enable kernel debugger support. options KDB_TRACE # Print a stack trace for a panic. # For full debugger support use (turn off in stable branch): options DDB # Support DDB. # options GDB # Support remote GDB. options DEADLKRES # Enable the deadlock resolver options INVARIANTS # Enable calls of extra sanity checking options INVARIANT_SUPPORT # Extra sanity checks of internal structures, required by INVARIANTS # options WITNESS # Enable checks to detect deadlocks and cycles # options WITNESS_SKIPSPIN # Don't run witness on spinlocks for speed options MALLOC_DEBUG_MAXZONES=8 # Separate malloc(9) zones # options EARLY_PRINTF +# options VERBOSE_SYSINIT # Pseudo devices. device loop # Network loopback device random # Entropy device device ether # Ethernet support device vlan # 802.1Q VLAN support device tun # Packet tunnel. device md # Memory "disks" device gif # IPv6 and IPv4 tunneling device firmware # firmware assist module # The `bpf' device enables the Berkeley Packet Filter. # Be aware of the administrative consequences of enabling this! # Note that 'bpf' is required for DHCP. device bpf # Berkeley packet filter options FDT Index: user/alc/PQ_LAUNDRY/sys/riscv/htif/htif.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/htif/htif.c (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/htif/htif.c (revision 303667) @@ -1,283 +1,278 @@ /*- * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "htif.h" static struct resource_spec htif_spec[] = { { SYS_RES_IRQ, 0, RF_ACTIVE | RF_SHAREABLE}, { -1, 0 } }; struct intr_entry { void (*func) (void *, uint64_t); void *arg; }; struct intr_entry intrs[HTIF_NDEV]; uint64_t htif_command(uint64_t arg) { return (machine_command(ECALL_HTIF_CMD, arg)); } int htif_setup_intr(int id, void *func, void *arg) { if (id >= HTIF_NDEV) return (-1); intrs[id].func = func; intrs[id].arg = arg; return (0); } static void htif_handle_entry(struct htif_softc *sc) { uint64_t entry; uint8_t devcmd; uint8_t devid; entry = machine_command(ECALL_HTIF_GET_ENTRY, 0); while (entry) { devid = HTIF_DEV_ID(entry); devcmd = HTIF_DEV_CMD(entry); if (devcmd == HTIF_CMD_IDENTIFY) { /* Enumeration interrupt */ if (devid == sc->identify_id) sc->identify_done = 1; } else { /* Device interrupt */ if (intrs[devid].func != NULL) intrs[devid].func(intrs[devid].arg, entry); } entry = machine_command(ECALL_HTIF_GET_ENTRY, 0); } } static int htif_intr(void *arg) { struct htif_softc *sc; sc = arg; csr_clear(sip, SIP_SSIP); htif_handle_entry(sc); return (FILTER_HANDLED); } static int htif_add_device(struct htif_softc *sc, int i, char *id, char *name) { struct htif_dev_ivars *di; di = malloc(sizeof(struct htif_dev_ivars), M_DEVBUF, M_WAITOK | M_ZERO); di->sc = sc; di->index = i; di->id = malloc(HTIF_ID_LEN, M_DEVBUF, M_WAITOK | M_ZERO); memcpy(di->id, id, HTIF_ID_LEN); di->dev = device_add_child(sc->dev, name, -1); device_set_ivars(di->dev, di); return (0); } static int htif_enumerate(struct htif_softc *sc) { char id[HTIF_ID_LEN] __aligned(HTIF_ALIGN); uint64_t paddr; uint64_t data; uint64_t cmd; int len; int i; device_printf(sc->dev, "Enumerating devices\n"); for (i = 0; i < HTIF_NDEV; i++) { paddr = pmap_kextract((vm_offset_t)&id); data = (paddr << IDENTIFY_PADDR_SHIFT); data |= IDENTIFY_IDENT; sc->identify_id = i; sc->identify_done = 0; cmd = i; cmd <<= HTIF_DEV_ID_SHIFT; cmd |= (HTIF_CMD_IDENTIFY << HTIF_CMD_SHIFT); cmd |= data; htif_command(cmd); - /* Do poll as interrupts are disabled yet */ - while (sc->identify_done == 0) { - htif_handle_entry(sc); - } - len = strnlen(id, sizeof(id)); if (len <= 0) break; if (bootverbose) printf(" %d %s\n", i, id); if (strncmp(id, "disk", 4) == 0) htif_add_device(sc, i, id, "htif_blk"); else if (strncmp(id, "bcd", 3) == 0) htif_add_device(sc, i, id, "htif_console"); else if 
(strncmp(id, "syscall_proxy", 13) == 0) htif_add_device(sc, i, id, "htif_syscall_proxy"); } return (bus_generic_attach(sc->dev)); } int htif_read_ivar(device_t dev, device_t child, int which, uintptr_t *result) { struct htif_dev_ivars *ivars; ivars = device_get_ivars(child); switch (which) { case HTIF_IVAR_INDEX: *result = ivars->index; break; case HTIF_IVAR_ID: *result = (uintptr_t)ivars->id; default: return (EINVAL); } return (0); } static int htif_probe(device_t dev) { if (!ofw_bus_status_okay(dev)) return (ENXIO); if (!ofw_bus_is_compatible(dev, "riscv,htif")) return (ENXIO); device_set_desc(dev, "HTIF bus device"); return (BUS_PROBE_DEFAULT); } static int htif_attach(device_t dev) { struct htif_softc *sc; int error; sc = device_get_softc(dev); sc->dev = dev; if (bus_alloc_resources(dev, htif_spec, sc->res)) { device_printf(dev, "could not allocate resources\n"); return (ENXIO); } /* Setup IRQs handler */ error = bus_setup_intr(dev, sc->res[0], INTR_TYPE_CLK, htif_intr, NULL, sc, &sc->ihl[0]); if (error) { device_printf(dev, "Unable to alloc int resource.\n"); return (ENXIO); } csr_set(sie, SIE_SSIE); return (htif_enumerate(sc)); } static device_method_t htif_methods[] = { DEVMETHOD(device_probe, htif_probe), DEVMETHOD(device_attach, htif_attach), /* Bus interface */ DEVMETHOD(bus_read_ivar, htif_read_ivar), DEVMETHOD_END }; static driver_t htif_driver = { "htif", htif_methods, sizeof(struct htif_softc) }; static devclass_t htif_devclass; DRIVER_MODULE(htif, simplebus, htif_driver, htif_devclass, 0, 0); Index: user/alc/PQ_LAUNDRY/sys/riscv/htif/htif_block.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/htif/htif_block.c (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/htif/htif_block.c (revision 303667) @@ -1,297 +1,299 @@ /*- * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "htif.h" #define SECTOR_SIZE_SHIFT (9) #define SECTOR_SIZE (1 << SECTOR_SIZE_SHIFT) #define HTIF_BLK_LOCK(_sc) mtx_lock(&(_sc)->sc_mtx) #define HTIF_BLK_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_mtx) #define HTIF_BLK_LOCK_INIT(_sc) \ mtx_init(&_sc->sc_mtx, device_get_nameunit(_sc->dev), \ "htif_blk", MTX_DEF) #define HTIF_BLK_LOCK_DESTROY(_sc) mtx_destroy(&_sc->sc_mtx); #define HTIF_BLK_ASSERT_LOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_OWNED); #define HTIF_BLK_ASSERT_UNLOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_NOTOWNED); static void htif_blk_task(void *arg); static disk_open_t htif_blk_open; static disk_close_t htif_blk_close; static disk_strategy_t htif_blk_strategy; struct htif_blk_softc { device_t dev; struct disk *disk; struct mtx htif_io_mtx; struct mtx sc_mtx; struct proc *p; struct bio_queue_head bio_queue; int running; int intr_chan; int cmd_done; int index; uint16_t curtag; }; struct htif_blk_request { uint64_t addr; uint64_t offset; /* offset in bytes */ uint64_t size; /* length in bytes */ uint64_t tag; }; static void htif_blk_intr(void *arg, uint64_t entry) { struct htif_blk_softc *sc; uint64_t devcmd; uint64_t data; sc = arg; devcmd = HTIF_DEV_CMD(entry); data = HTIF_DEV_DATA(entry); if (sc->curtag == data) { wmb(); sc->cmd_done = 1; wakeup(&sc->intr_chan); } else { device_printf(sc->dev, "Unexpected tag %d (should be %d)\n", data, sc->curtag); } } static int htif_blk_probe(device_t dev) { return (0); } static int htif_blk_attach(device_t dev) { struct htif_blk_softc *sc; char prefix[] = " size="; char *str; long size; sc = device_get_softc(dev); sc->dev = dev; mtx_init(&sc->htif_io_mtx, device_get_nameunit(dev), "htif_blk", MTX_DEF); HTIF_BLK_LOCK_INIT(sc); str = strstr(htif_get_id(dev), prefix); size = strtol((str + 6), NULL, 10); if (size == 0) { return (ENXIO); } sc->index = htif_get_index(dev); if (sc->index < 0) return (EINVAL); htif_setup_intr(sc->index, htif_blk_intr, sc); sc->disk = disk_alloc(); sc->disk->d_drv1 = sc; sc->disk->d_maxsize = 4096; /* Max transfer */ sc->disk->d_name = "htif_blk"; sc->disk->d_open = htif_blk_open; sc->disk->d_close = htif_blk_close; sc->disk->d_strategy = htif_blk_strategy; sc->disk->d_unit = 0; sc->disk->d_sectorsize = SECTOR_SIZE; sc->disk->d_mediasize = size; disk_create(sc->disk, DISK_VERSION); bioq_init(&sc->bio_queue); sc->running = 1; kproc_create(&htif_blk_task, sc, &sc->p, 0, 0, "%s: transfer", device_get_nameunit(dev)); return (0); } static int htif_blk_open(struct disk *dp) { return (0); } static int htif_blk_close(struct disk *dp) { return (0); } static void htif_blk_task(void *arg) { struct htif_blk_request req __aligned(HTIF_ALIGN); struct htif_blk_softc *sc; uint64_t req_paddr; struct bio *bp; uint64_t paddr; + uint64_t resp; uint64_t cmd; int i; sc = (struct htif_blk_softc *)arg; while 
(1) { HTIF_BLK_LOCK(sc); do { bp = bioq_takefirst(&sc->bio_queue); if (bp == NULL) msleep(sc, &sc->sc_mtx, PRIBIO, "jobqueue", 0); } while (bp == NULL); HTIF_BLK_UNLOCK(sc); if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) { HTIF_BLK_LOCK(sc); rmb(); req.offset = (bp->bio_pblkno * sc->disk->d_sectorsize); req.size = bp->bio_bcount; paddr = vtophys(bp->bio_data); KASSERT(paddr != 0, ("paddr is 0")); req.addr = paddr; sc->curtag++; req.tag = sc->curtag; cmd = sc->index; cmd <<= HTIF_DEV_ID_SHIFT; if (bp->bio_cmd == BIO_READ) cmd |= (HTIF_CMD_READ << HTIF_CMD_SHIFT); else cmd |= (HTIF_CMD_WRITE << HTIF_CMD_SHIFT); req_paddr = vtophys(&req); KASSERT(req_paddr != 0, ("req_paddr is 0")); cmd |= req_paddr; sc->cmd_done = 0; - htif_command(cmd); + resp = htif_command(cmd); + htif_blk_intr(sc, resp); /* Wait for interrupt */ i = 0; while (sc->cmd_done == 0) { msleep(&sc->intr_chan, &sc->sc_mtx, PRIBIO, "intr", hz/2); if (i++ > 2) { /* TODO: try to re-issue operation on timeout ? */ bp->bio_error = EIO; bp->bio_flags |= BIO_ERROR; disk_err(bp, "hard error", -1, 1); break; } } HTIF_BLK_UNLOCK(sc); biodone(bp); } else { printf("unknown op %d\n", bp->bio_cmd); } } } static void htif_blk_strategy(struct bio *bp) { struct htif_blk_softc *sc; sc = bp->bio_disk->d_drv1; HTIF_BLK_LOCK(sc); if (sc->running > 0) { bioq_disksort(&sc->bio_queue, bp); HTIF_BLK_UNLOCK(sc); wakeup(sc); } else { HTIF_BLK_UNLOCK(sc); biofinish(bp, NULL, ENXIO); } } static device_method_t htif_blk_methods[] = { DEVMETHOD(device_probe, htif_blk_probe), DEVMETHOD(device_attach, htif_blk_attach), }; static driver_t htif_blk_driver = { "htif_blk", htif_blk_methods, sizeof(struct htif_blk_softc) }; static devclass_t htif_blk_devclass; DRIVER_MODULE(htif_blk, htif, htif_blk_driver, htif_blk_devclass, 0, 0); Index: user/alc/PQ_LAUNDRY/sys/riscv/htif/htif_console.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/htif/htif_console.c (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/htif/htif_console.c (revision 303667) @@ -1,396 +1,349 @@ /*- * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
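htif_blk_task() above packs the device index, the command code, and the physical address of the request record into a single 64-bit HTIF command word, and with this revision the ecall's immediate return value is fed straight to htif_blk_intr(), so a matching tag completes the bio without waiting for an asynchronous entry. A minimal sketch of the packing, assuming the payload always fits below HTIF_CMD_SHIFT:

    /* Sketch: the command-word layout used by htif_blk_task(). */
    static uint64_t
    htif_blk_build_cmd(int index, uint64_t devcmd, uint64_t req_paddr)
    {
    	uint64_t cmd;

    	cmd = (uint64_t)index << HTIF_DEV_ID_SHIFT;
    	cmd |= devcmd << HTIF_CMD_SHIFT;
    	cmd |= req_paddr;	/* assumed to fit below HTIF_CMD_SHIFT */

    	return (cmd);
    }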
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "htif.h" #include #include -extern uint64_t console_intr; - static tsw_outwakeup_t riscvtty_outwakeup; static struct ttydevsw riscv_ttydevsw = { .tsw_flags = TF_NOPREFIX, .tsw_outwakeup = riscvtty_outwakeup, }; static int polltime; static struct callout riscv_callout; static struct tty *tp = NULL; #if defined(KDB) static int alt_break_state; #endif static void riscv_timeout(void *); static cn_probe_t riscv_cnprobe; static cn_init_t riscv_cninit; static cn_term_t riscv_cnterm; static cn_getc_t riscv_cngetc; static cn_putc_t riscv_cnputc; static cn_grab_t riscv_cngrab; static cn_ungrab_t riscv_cnungrab; CONSOLE_DRIVER(riscv); #define MAX_BURST_LEN 1 #define QUEUE_SIZE 256 #define CONSOLE_DEFAULT_ID 1ul #define SPIN_IN_MACHINE_MODE 1 struct queue_entry { uint64_t data; uint64_t used; struct queue_entry *next; }; struct queue_entry cnqueue[QUEUE_SIZE]; struct queue_entry *entry_last; struct queue_entry *entry_served; static void -htif_putc(int c) +riscv_putc(int c) { uint64_t cmd; cmd = (HTIF_CMD_WRITE << HTIF_CMD_SHIFT); cmd |= (CONSOLE_DEFAULT_ID << HTIF_DEV_ID_SHIFT); cmd |= c; -#ifdef SPIN_IN_MACHINE_MODE - machine_command(ECALL_HTIF_LOWPUTC, cmd); -#else - htif_command(cmd); -#endif - + machine_command(ECALL_HTIF_CMD, cmd); } -static uint8_t -htif_getc(void) -{ - uint64_t cmd; - uint8_t res; - - cmd = (HTIF_CMD_READ << HTIF_CMD_SHIFT); - cmd |= (CONSOLE_DEFAULT_ID << HTIF_DEV_ID_SHIFT); - - res = htif_command(cmd); - - return (res); -} - -static void -riscv_putc(int c) -{ - uint64_t counter; - uint64_t *cc; - uint64_t val; - - val = 0; - counter = 0; - - cc = (uint64_t*)&console_intr; - *cc = 0; - - htif_putc(c); - -#ifndef SPIN_IN_MACHINE_MODE - /* Wait for an interrupt */ - __asm __volatile( - "li %0, 1\n" /* counter = 1 */ - "slli %0, %0, 12\n" /* counter <<= 12 */ - "1:" - "addi %0, %0, -1\n" /* counter -= 1 */ - "beqz %0, 2f\n" /* counter == 0 ? finish */ - "ld %1, 0(%2)\n" /* val = *cc */ - "beqz %1, 1b\n" /* val == 0 ? 
repeat */ - "2:" - : "=&r"(counter), "=&r"(val) : "r"(cc) - ); -#endif -} - #ifdef EARLY_PRINTF early_putc_t *early_putc = riscv_putc; #endif static void cn_drvinit(void *unused) { if (riscv_consdev.cn_pri != CN_DEAD && riscv_consdev.cn_name[0] != '\0') { tp = tty_alloc(&riscv_ttydevsw, NULL); tty_init_console(tp, 0); tty_makedev(tp, NULL, "%s", "rcons"); polltime = 1; callout_init(&riscv_callout, 1); callout_reset(&riscv_callout, polltime, riscv_timeout, NULL); } } SYSINIT(cndev, SI_SUB_CONFIGURE, SI_ORDER_MIDDLE, cn_drvinit, NULL); static void riscvtty_outwakeup(struct tty *tp) { u_char buf[MAX_BURST_LEN]; int len; int i; for (;;) { len = ttydisc_getc(tp, buf, sizeof(buf)); if (len == 0) break; KASSERT(len == 1, ("tty error")); for (i = 0; i < len; i++) riscv_putc(buf[i]); } } static void riscv_timeout(void *v) { int c; tty_lock(tp); while ((c = riscv_cngetc(NULL)) != -1) ttydisc_rint(tp, c, 0); ttydisc_rint_done(tp); tty_unlock(tp); callout_reset(&riscv_callout, polltime, riscv_timeout, NULL); } static void riscv_cnprobe(struct consdev *cp) { cp->cn_pri = CN_NORMAL; } static void riscv_cninit(struct consdev *cp) { int i; strcpy(cp->cn_name, "rcons"); for (i = 0; i < QUEUE_SIZE; i++) { if (i == (QUEUE_SIZE - 1)) cnqueue[i].next = &cnqueue[0]; else cnqueue[i].next = &cnqueue[i+1]; cnqueue[i].data = 0; cnqueue[i].used = 0; } entry_last = &cnqueue[0]; entry_served = &cnqueue[0]; } static void riscv_cnterm(struct consdev *cp) { } static void riscv_cngrab(struct consdev *cp) { } static void riscv_cnungrab(struct consdev *cp) { } static int riscv_cngetc(struct consdev *cp) { #if defined(KDB) uint64_t devcmd; uint64_t entry; uint64_t devid; #endif + uint64_t cmd; uint8_t data; int ch; - htif_getc(); + cmd = (HTIF_CMD_READ << HTIF_CMD_SHIFT); + cmd |= (CONSOLE_DEFAULT_ID << HTIF_DEV_ID_SHIFT); + machine_command(ECALL_HTIF_CMD_REQ, cmd); + #if defined(KDB) if (kdb_active) { - entry = machine_command(ECALL_HTIF_GET_ENTRY, 0); + + entry = machine_command(ECALL_HTIF_CMD_RESP, 0); while (entry) { devid = HTIF_DEV_ID(entry); devcmd = HTIF_DEV_CMD(entry); data = HTIF_DEV_DATA(entry); if (devid == CONSOLE_DEFAULT_ID && devcmd == 0) { entry_last->data = data; entry_last->used = 1; entry_last = entry_last->next; } else { printf("Lost interrupt: devid %d\n", devid); } - entry = machine_command(ECALL_HTIF_GET_ENTRY, 0); + entry = machine_command(ECALL_HTIF_CMD_RESP, 0); } } #endif if (entry_served->used == 1) { data = entry_served->data; entry_served->used = 0; entry_served = entry_served->next; ch = (data & 0xff); if (ch > 0 && ch < 0xff) { #if defined(KDB) kdb_alt_break(ch, &alt_break_state); #endif return (ch); } } return (-1); } static void riscv_cnputc(struct consdev *cp, int c) { riscv_putc(c); } /* * Bus interface. 
*/ struct htif_console_softc { device_t dev; int running; int intr_chan; int cmd_done; int curtag; int index; }; static void htif_console_intr(void *arg, uint64_t entry) { struct htif_console_softc *sc; uint8_t devcmd; uint64_t data; sc = arg; devcmd = HTIF_DEV_CMD(entry); data = HTIF_DEV_DATA(entry); if (devcmd == 0) { entry_last->data = data; entry_last->used = 1; entry_last = entry_last->next; } } static int htif_console_probe(device_t dev) { return (0); } static int htif_console_attach(device_t dev) { struct htif_console_softc *sc; sc = device_get_softc(dev); sc->dev = dev; sc->index = htif_get_index(dev); if (sc->index < 0) return (EINVAL); htif_setup_intr(sc->index, htif_console_intr, sc); return (0); } static device_method_t htif_console_methods[] = { DEVMETHOD(device_probe, htif_console_probe), DEVMETHOD(device_attach, htif_console_attach), DEVMETHOD_END }; static driver_t htif_console_driver = { "htif_console", htif_console_methods, sizeof(struct htif_console_softc) }; static devclass_t htif_console_devclass; DRIVER_MODULE(htif_console, htif, htif_console_driver, htif_console_devclass, 0, 0); Index: user/alc/PQ_LAUNDRY/sys/riscv/include/cpu.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/include/cpu.h (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/include/cpu.h (revision 303667) @@ -1,95 +1,94 @@ /*- - * Copyright (c) 2015 Ruslan Bukin + * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
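riscv_cninit() links cnqueue[] into a ring: the interrupt paths fill entries at entry_last and riscv_cngetc() drains them at entry_served, with the used flag handing each slot from producer to consumer. Condensed from the code above (no new names except the two helper functions):

    /* Sketch: the console's single-producer/single-consumer ring. */
    static void
    cnqueue_put(uint64_t data)
    {
    	entry_last->data = data;
    	entry_last->used = 1;
    	entry_last = entry_last->next;
    }

    static int
    cnqueue_get(void)
    {
    	uint64_t data;

    	if (entry_served->used == 0)
    		return (-1);
    	data = entry_served->data;
    	entry_served->used = 0;
    	entry_served = entry_served->next;
    	return ((int)(data & 0xff));
    }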
* * $FreeBSD$ */ #ifndef _MACHINE_CPU_H_ #define _MACHINE_CPU_H_ #include #include #define TRAPF_PC(tfp) ((tfp)->tf_ra) #define TRAPF_USERMODE(tfp) (((tfp)->tf_sepc & (1ul << 63)) == 0) #define cpu_getstack(td) ((td)->td_frame->tf_sp) #define cpu_setstack(td, sp) ((td)->td_frame->tf_sp = (sp)) #define cpu_spinwait() /* nothing */ #ifdef _KERNEL /* * 0x0000 CPU ID unimplemented * 0x0001 UC Berkeley Rocket repo * 0x0002­0x7FFE Reserved for open-source repos * 0x7FFF Reserved for extension * 0x8000 Reserved for anonymous source * 0x8001­0xFFFE Reserved for proprietary implementations * 0xFFFF Reserved for extension */ #define CPU_IMPL_SHIFT 0 #define CPU_IMPL_MASK (0xffff << CPU_IMPL_SHIFT) #define CPU_IMPL(mimpid) ((mimpid & CPU_IMPL_MASK) >> CPU_IMPL_SHIFT) #define CPU_IMPL_UNIMPLEMEN 0x0 #define CPU_IMPL_UCB_ROCKET 0x1 #define CPU_PART_SHIFT 62 #define CPU_PART_MASK (0x3ul << CPU_PART_SHIFT) -#define CPU_PART(mcpuid) ((mcpuid & CPU_PART_MASK) >> CPU_PART_SHIFT) -#define CPU_PART_RV32I 0x0 -#define CPU_PART_RV32E 0x1 -#define CPU_PART_RV64I 0x2 -#define CPU_PART_RV128I 0x3 +#define CPU_PART(misa) ((misa & CPU_PART_MASK) >> CPU_PART_SHIFT) +#define CPU_PART_RV32 0x1 +#define CPU_PART_RV64 0x2 +#define CPU_PART_RV128 0x3 extern char btext[]; extern char etext[]; void cpu_halt(void) __dead2; void cpu_reset(void) __dead2; void fork_trampoline(void); void identify_cpu(void); void swi_vm(void *v); static __inline uint64_t get_cyclecount(void) { /* TODO: This is bogus */ return (1); } #endif #endif /* !_MACHINE_CPU_H_ */ Index: user/alc/PQ_LAUNDRY/sys/riscv/include/cpufunc.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/include/cpufunc.h (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/include/cpufunc.h (revision 303667) @@ -1,123 +1,124 @@ /*- - * Copyright (c) 2015 Ruslan Bukin + * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
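CPU_PART() now decodes the base-ISA width from the top two bits of misa instead of the retired mcpuid register. A hypothetical identify_cpu() fragment using the new macros:

    /* Sketch: decoding the part field of misa. */
    static void
    print_cpu_part(uint64_t misa)
    {
    	switch (CPU_PART(misa)) {
    	case CPU_PART_RV32:
    		printf("RV32\n");
    		break;
    	case CPU_PART_RV64:
    		printf("RV64\n");
    		break;
    	case CPU_PART_RV128:
    		printf("RV128\n");
    		break;
    	}
    }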
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_CPUFUNC_H_ #define _MACHINE_CPUFUNC_H_ #ifdef _KERNEL #include static __inline void breakpoint(void) { __asm("ebreak"); } static __inline register_t intr_disable(void) { uint64_t ret; __asm __volatile( - "csrrci %0, sstatus, 1" - : "=&r" (ret) + "csrrci %0, sstatus, %1" + : "=&r" (ret) : "i" (SSTATUS_SIE) ); - return (ret & SSTATUS_IE); + return (ret & (SSTATUS_SIE)); } static __inline void intr_restore(register_t s) { __asm __volatile( "csrs sstatus, %0" :: "r" (s) ); } static __inline void intr_enable(void) { __asm __volatile( - "csrsi sstatus, 1" + "csrsi sstatus, %0" + :: "i" (SSTATUS_SIE) ); } static __inline register_t machine_command(uint64_t cmd, uint64_t arg) { uint64_t res; __asm __volatile( "mv t5, %2\n" "mv t6, %1\n" "ecall\n" "mv %0, t6" : "=&r"(res) : "r"(arg), "r"(cmd) ); return (res); } #define cpu_nullop() riscv_nullop() #define cpufunc_nullop() riscv_nullop() #define cpu_setttb(a) riscv_setttb(a) #define cpu_tlb_flushID() riscv_tlb_flushID() #define cpu_tlb_flushID_SE(e) riscv_tlb_flushID_SE(e) #define cpu_dcache_wbinv_range(a, s) riscv_dcache_wbinv_range((a), (s)) #define cpu_dcache_inv_range(a, s) riscv_dcache_inv_range((a), (s)) #define cpu_dcache_wb_range(a, s) riscv_dcache_wb_range((a), (s)) #define cpu_idcache_wbinv_range(a, s) riscv_idcache_wbinv_range((a), (s)) #define cpu_icache_sync_range(a, s) riscv_icache_sync_range((a), (s)) void riscv_nullop(void); void riscv_setttb(vm_offset_t); void riscv_tlb_flushID(void); void riscv_tlb_flushID_SE(vm_offset_t); void riscv_icache_sync_range(vm_offset_t, vm_size_t); void riscv_idcache_wbinv_range(vm_offset_t, vm_size_t); void riscv_dcache_wbinv_range(vm_offset_t, vm_size_t); void riscv_dcache_inv_range(vm_offset_t, vm_size_t); void riscv_dcache_wb_range(vm_offset_t, vm_size_t); #endif /* _KERNEL */ #endif /* _MACHINE_CPUFUNC_H_ */ Index: user/alc/PQ_LAUNDRY/sys/riscv/include/db_machdep.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/include/db_machdep.h (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/include/db_machdep.h (revision 303667) @@ -1,91 +1,91 @@ /*- * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
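machine_command() fixes the ecall ABI used by every ECALL_* request in this tree: the request code travels in t5, its argument in t6, and the machine-mode handler leaves the result in t6 (the supervisor_call dispatcher in exception.S below compares t5 against the ECALL_* constants). A usage sketch:

    /* Usage sketch: fetch the raw misa CSR via the machine-mode proxy. */
    static uint64_t
    read_misa_via_ecall(void)
    {
    	/* cmd -> t5, arg -> t6, result <- t6. */
    	return (machine_command(ECALL_MCPUID_GET, 0));
    }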
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_DB_MACHDEP_H_ #define _MACHINE_DB_MACHDEP_H_ #include #include #include -#define T_BREAKPOINT (EXCP_INSTR_BREAKPOINT) +#define T_BREAKPOINT (EXCP_BREAKPOINT) #define T_WATCHPOINT (0) typedef vm_offset_t db_addr_t; typedef long db_expr_t; #define PC_REGS() ((db_addr_t)kdb_thrctx->pcb_sepc) #define BKPT_INST (0x00100073) #define BKPT_SIZE (INSN_SIZE) #define BKPT_SET(inst) (BKPT_INST) #define BKPT_SKIP do { \ kdb_frame->tf_sepc += BKPT_SIZE; \ } while (0) #define db_clear_single_step kdb_cpu_clear_singlestep #define db_set_single_step kdb_cpu_set_singlestep #define IS_BREAKPOINT_TRAP(type, code) (type == T_BREAKPOINT) #define IS_WATCHPOINT_TRAP(type, code) (type == T_WATCHPOINT) #define inst_trap_return(ins) (ins == 0x10000073) /* eret */ #define inst_return(ins) (ins == 0x00008067) /* ret */ #define inst_call(ins) (((ins) & 0x7f) == 111 || \ ((ins) & 0x7f) == 103) /* jal, jalr */ #define inst_load(ins) ({ \ uint32_t tmp_instr = db_get_value(PC_REGS(), sizeof(uint32_t), FALSE); \ is_load_instr(tmp_instr); \ }) #define inst_store(ins) ({ \ uint32_t tmp_instr = db_get_value(PC_REGS(), sizeof(uint32_t), FALSE); \ is_store_instr(tmp_instr); \ }) #define is_load_instr(ins) (((ins) & 0x7f) == 3) #define is_store_instr(ins) (((ins) & 0x7f) == 35) #define next_instr_address(pc, bd) ((bd) ? (pc) : ((pc) + 4)) #define DB_SMALL_VALUE_MAX (0x7fffffff) #define DB_SMALL_VALUE_MIN (-0x40001) #define DB_ELFSIZE 64 #endif /* !_MACHINE_DB_MACHDEP_H_ */ Index: user/alc/PQ_LAUNDRY/sys/riscv/include/intr.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/include/intr.h (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/include/intr.h (revision 303667) @@ -1,68 +1,80 @@ /*- * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
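The decode macros above classify instructions by the low seven opcode bits (3 = load, 35 = store, 103/111 = jalr/jal), and BKPT_INST is the uncompressed ebreak encoding; T_BREAKPOINT now tracks the renamed EXCP_BREAKPOINT from riscvreg.h. A one-line helper makes the shared pattern explicit:

    /* Sketch: the opcode field that is_load_instr()/is_store_instr()
     * test; rv_opcode(0x00100073) == 0x73 (SYSTEM, i.e. ebreak). */
    static __inline int
    rv_opcode(uint32_t insn)
    {
    	return (insn & 0x7f);
    }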
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_INTR_MACHDEP_H_ #define _MACHINE_INTR_MACHDEP_H_ struct trapframe; void riscv_init_interrupts(void); int riscv_teardown_intr(void *); int riscv_config_intr(u_int, enum intr_trigger, enum intr_polarity); int riscv_setup_intr(const char *, driver_filter_t *, driver_intr_t *, void *, int, int, void **); void riscv_cpu_intr(struct trapframe *); typedef unsigned long * riscv_intrcnt_t; riscv_intrcnt_t riscv_intrcnt_create(const char *); void riscv_intrcnt_setname(riscv_intrcnt_t, const char *); #ifdef SMP void riscv_setup_ipihandler(driver_filter_t *); void riscv_unmask_ipi(void); #endif enum { - IRQ_SOFTWARE, - IRQ_TIMER, - IRQ_HTIF, + IRQ_SOFTWARE_USER, + IRQ_SOFTWARE_SUPERVISOR, + IRQ_SOFTWARE_HYPERVISOR, + IRQ_SOFTWARE_MACHINE, + IRQ_TIMER_USER, + IRQ_TIMER_SUPERVISOR, + IRQ_TIMER_HYPERVISOR, + IRQ_TIMER_MACHINE, + IRQ_EXTERNAL_USER, + IRQ_EXTERNAL_SUPERVISOR, + IRQ_EXTERNAL_HYPERVISOR, + IRQ_EXTERNAL_MACHINE, +#if 0 + /* lowRISC TODO */ IRQ_COP, /* lowRISC only */ IRQ_UART, /* lowRISC only */ +#endif NIRQS }; #endif /* !_MACHINE_INTR_MACHDEP_H_ */ Index: user/alc/PQ_LAUNDRY/sys/riscv/include/pte.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/include/pte.h (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/include/pte.h (revision 303667) @@ -1,101 +1,90 @@ /*- * Copyright (c) 2014 Andrew Turner - * Copyright (c) 2015 Ruslan Bukin + * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
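The rewritten enum lists the twelve standard per-privilege interrupt sources in cause-register order (software 0-3, timer 4-7, external 8-11), so an asynchronous scause value maps onto IRQ_* by masking alone. A sketch, assuming EXCP_INTR and EXCP_MASK from riscvreg.h:

    /* Sketch: classify a trap cause into the IRQ_* numbering above. */
    static int
    riscv_cause_to_irq(uint64_t cause)
    {
    	if ((cause & EXCP_INTR) == 0)
    		return (-1);		/* synchronous exception */
    	return (cause & EXCP_MASK);	/* e.g. 5 == IRQ_TIMER_SUPERVISOR */
    }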
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_PTE_H_ #define _MACHINE_PTE_H_ #ifndef LOCORE typedef uint64_t pd_entry_t; /* page directory entry */ typedef uint64_t pt_entry_t; /* page table entry */ typedef uint64_t pn_t; /* page number */ #endif /* Level 0 table, 512GiB per entry */ #define L0_SHIFT 39 /* Level 1 table, 1GiB per entry */ #define L1_SHIFT 30 #define L1_SIZE (1 << L1_SHIFT) #define L1_OFFSET (L1_SIZE - 1) /* Level 2 table, 2MiB per entry */ #define L2_SHIFT 21 #define L2_SIZE (1 << L2_SHIFT) #define L2_OFFSET (L2_SIZE - 1) /* Level 3 table, 4KiB per entry */ #define L3_SHIFT 12 #define L3_SIZE (1 << L3_SHIFT) #define L3_OFFSET (L3_SIZE - 1) #define Ln_ENTRIES (1 << 9) #define Ln_ADDR_MASK (Ln_ENTRIES - 1) /* Bits 9:7 are reserved for software */ -#define PTE_SW_MANAGED (1 << 8) -#define PTE_SW_WIRED (1 << 7) -#define PTE_DIRTY (1 << 6) /* Virtual page is written */ -#define PTE_REF (1 << 5) /* Virtual page is referenced */ -#define PTE_VALID (1 << 0) /* Virtual page is valid */ -#define PTE_TYPE_S 1 -#define PTE_TYPE_M (0xf << PTE_TYPE_S) -#define PTE_TYPE_PTR 0 -#define PTE_TYPE_PTR_G 1 -#define PTE_TYPE_SROURX 2 /* Supervisor read-only, user read-execute page. */ -#define PTE_TYPE_SRWURWX 3 /* Supervisor read-write, user read-write-execute page. */ -#define PTE_TYPE_SURO 4 /* Supervisor and user read-only page. */ -#define PTE_TYPE_SURW 5 /* Supervisor and user read-write page. */ -#define PTE_TYPE_SURX 6 /* Supervisor and user read-execute page. */ -#define PTE_TYPE_SURWX 7 /* Supervisor and User Read Write Execute */ -#define PTE_TYPE_SRO 8 /* Supervisor read-only page. */ -#define PTE_TYPE_SRW 9 /* Supervisor read-write page. */ -#define PTE_TYPE_SRX 10 /* Supervisor read-execute page. */ -#define PTE_TYPE_SRWX 11 /* Supervisor read-write-execute page. */ -#define PTE_TYPE_SRO_G 12 /* Supervisor read-only page--global mapping. */ -#define PTE_TYPE_SRW_G 13 /* Supervisor read-write page--global mapping. */ -#define PTE_TYPE_SRX_G 14 /* Supervisor read-execute page--global mapping. 
*/ -#define PTE_TYPE_SRWX_G 15 /* Supervisor Read Write Execute Global */ +#define PTE_SW_MANAGED (1 << 9) +#define PTE_SW_WIRED (1 << 8) +#define PTE_D (1 << 7) /* Dirty */ +#define PTE_A (1 << 6) /* Accessed */ +#define PTE_G (1 << 5) /* Global */ +#define PTE_U (1 << 4) /* User */ +#define PTE_X (1 << 3) /* Execute */ +#define PTE_W (1 << 2) /* Write */ +#define PTE_R (1 << 1) /* Read */ +#define PTE_V (1 << 0) /* Valid */ +#define PTE_RWX (PTE_R | PTE_W | PTE_X) +#define PTE_RX (PTE_R | PTE_X) #define PTE_PPN0_S 10 #define PTE_PPN1_S 19 #define PTE_PPN2_S 28 #define PTE_PPN3_S 37 #define PTE_SIZE 8 #endif /* !_MACHINE_PTE_H_ */ /* End of pte.h */ Index: user/alc/PQ_LAUNDRY/sys/riscv/include/riscvreg.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/include/riscvreg.h (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/include/riscvreg.h (revision 303667) @@ -1,171 +1,210 @@ /*- * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
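The encoded PTE_TYPE_* page types are gone: permissions are now the independent Sv39-style R/W/X bits, a PTE is a leaf whenever any of them is set, and the physical page number starts at PTE_PPN0_S. A sketch composing a supervisor read-write leaf entry under the new layout:

    /* Sketch: a writable kernel leaf PTE (pn is the physical page number). */
    static pt_entry_t
    make_kernel_rw_pte(pn_t pn)
    {
    	return ((pn << PTE_PPN0_S) |
    	    PTE_A | PTE_D | PTE_W | PTE_R | PTE_V);
    }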
* * $FreeBSD$ */ #ifndef _MACHINE_RISCVREG_H_ #define _MACHINE_RISCVREG_H_ /* Machine mode requests */ #define ECALL_MTIMECMP 0x01 -#define ECALL_CLEAR_PENDING 0x02 -#define ECALL_HTIF_CMD 0x03 -#define ECALL_HTIF_GET_ENTRY 0x04 -#define ECALL_MCPUID_GET 0x05 -#define ECALL_MIMPID_GET 0x06 -#define ECALL_SEND_IPI 0x07 -#define ECALL_CLEAR_IPI 0x08 -#define ECALL_HTIF_LOWPUTC 0x09 -#define ECALL_MIE_SET 0x0a -#define ECALL_IO_IRQ_MASK 0x0b +#define ECALL_HTIF_GET_ENTRY 0x02 +#define ECALL_MCPUID_GET 0x03 +#define ECALL_MIMPID_GET 0x04 +#define ECALL_SEND_IPI 0x05 +#define ECALL_CLEAR_IPI 0x06 +#define ECALL_MIE_SET 0x07 +#define ECALL_IO_IRQ_MASK 0x08 +#define ECALL_HTIF_CMD 0x09 +#define ECALL_HTIF_CMD_REQ 0x0a +#define ECALL_HTIF_CMD_RESP 0x0b #define EXCP_SHIFT 0 #define EXCP_MASK (0xf << EXCP_SHIFT) -#define EXCP_INSTR_ADDR_MISALIGNED 0 -#define EXCP_INSTR_ACCESS_FAULT 1 -#define EXCP_INSTR_ILLEGAL 2 -#define EXCP_INSTR_BREAKPOINT 3 -#define EXCP_LOAD_ADDR_MISALIGNED 4 -#define EXCP_LOAD_ACCESS_FAULT 5 -#define EXCP_STORE_ADDR_MISALIGNED 6 -#define EXCP_STORE_ACCESS_FAULT 7 -#define EXCP_UMODE_ENV_CALL 8 -#define EXCP_SMODE_ENV_CALL 9 -#define EXCP_HMODE_ENV_CALL 10 -#define EXCP_MMODE_ENV_CALL 11 -#define EXCP_INTR (1 << 31) +#define EXCP_MISALIGNED_FETCH 0 +#define EXCP_FAULT_FETCH 1 +#define EXCP_ILLEGAL_INSTRUCTION 2 +#define EXCP_BREAKPOINT 3 +#define EXCP_MISALIGNED_LOAD 4 +#define EXCP_FAULT_LOAD 5 +#define EXCP_MISALIGNED_STORE 6 +#define EXCP_FAULT_STORE 7 +#define EXCP_USER_ECALL 8 +#define EXCP_SUPERVISOR_ECALL 9 +#define EXCP_HYPERVISOR_ECALL 10 +#define EXCP_MACHINE_ECALL 11 +#define EXCP_INTR (1ul << 63) #define EXCP_INTR_SOFTWARE 0 #define EXCP_INTR_TIMER 1 #define EXCP_INTR_HTIF 2 -#define SSTATUS_IE (1 << 0) -#define SSTATUS_PIE (1 << 3) -#define SSTATUS_PS (1 << 4) +#define SSTATUS_UIE (1 << 0) +#define SSTATUS_SIE (1 << 1) +#define SSTATUS_UPIE (1 << 4) +#define SSTATUS_SPIE (1 << 5) +#define SSTATUS_SPIE_SHIFT 5 +#define SSTATUS_SPP (1 << 8) +#define SSTATUS_SPP_SHIFT 8 +#define SSTATUS_FS_MASK 0x3 +#define SSTATUS_FS_SHIFT 13 +#define SSTATUS_XS_MASK 0x3 +#define SSTATUS_XS_SHIFT 15 +#define SSTATUS_PUM (1 << 18) +#define SSTATUS32_SD (1 << 63) +#define SSTATUS64_SD (1 << 31) -#define MSTATUS_MPRV (1 << 16) -#define MSTATUS_PRV_SHIFT 1 -#define MSTATUS_PRV1_SHIFT 4 -#define MSTATUS_PRV2_SHIFT 7 -#define MSTATUS_PRV_MASK (0x3 << MSTATUS_PRV_SHIFT) -#define MSTATUS_PRV_U 0 /* user */ -#define MSTATUS_PRV_S 1 /* supervisor */ -#define MSTATUS_PRV_H 2 /* hypervisor */ -#define MSTATUS_PRV_M 3 /* machine */ +#define MSTATUS_UIE (1 << 0) +#define MSTATUS_SIE (1 << 1) +#define MSTATUS_HIE (1 << 2) +#define MSTATUS_MIE (1 << 3) +#define MSTATUS_UPIE (1 << 4) +#define MSTATUS_SPIE (1 << 5) +#define MSTATUS_SPIE_SHIFT 5 +#define MSTATUS_HPIE (1 << 6) +#define MSTATUS_MPIE (1 << 7) +#define MSTATUS_MPIE_SHIFT 7 +#define MSTATUS_SPP (1 << 8) +#define MSTATUS_SPP_SHIFT 8 +#define MSTATUS_HPP_MASK 0x3 +#define MSTATUS_HPP_SHIFT 9 +#define MSTATUS_MPP_MASK 0x3 +#define MSTATUS_MPP_SHIFT 11 +#define MSTATUS_FS_MASK 0x3 +#define MSTATUS_FS_SHIFT 13 +#define MSTATUS_XS_MASK 0x3 +#define MSTATUS_XS_SHIFT 15 +#define MSTATUS_MPRV (1 << 17) +#define MSTATUS_PUM (1 << 18) +#define MSTATUS_VM_MASK 0x1f +#define MSTATUS_VM_SHIFT 24 +#define MSTATUS_VM_MBARE 0 +#define MSTATUS_VM_MBB 1 +#define MSTATUS_VM_MBBID 2 +#define MSTATUS_VM_SV32 8 +#define MSTATUS_VM_SV39 9 +#define MSTATUS_VM_SV48 10 +#define MSTATUS_VM_SV57 11 +#define MSTATUS_VM_SV64 12 +#define MSTATUS32_SD (1 << 63) +#define 
MSTATUS64_SD (1 << 31) -#define MSTATUS_VM_SHIFT 17 -#define MSTATUS_VM_MASK 0x1f -#define MSTATUS_VM_MBARE 0 -#define MSTATUS_VM_MBB 1 -#define MSTATUS_VM_MBBID 2 -#define MSTATUS_VM_SV32 8 -#define MSTATUS_VM_SV39 9 -#define MSTATUS_VM_SV48 10 +#define MSTATUS_PRV_U 0 /* user */ +#define MSTATUS_PRV_S 1 /* supervisor */ +#define MSTATUS_PRV_H 2 /* hypervisor */ +#define MSTATUS_PRV_M 3 /* machine */ +#define MIE_USIE (1 << 0) #define MIE_SSIE (1 << 1) #define MIE_HSIE (1 << 2) #define MIE_MSIE (1 << 3) +#define MIE_UTIE (1 << 4) #define MIE_STIE (1 << 5) #define MIE_HTIE (1 << 6) #define MIE_MTIE (1 << 7) +#define MIP_USIP (1 << 0) #define MIP_SSIP (1 << 1) #define MIP_HSIP (1 << 2) #define MIP_MSIP (1 << 3) +#define MIP_UTIP (1 << 4) #define MIP_STIP (1 << 5) #define MIP_HTIP (1 << 6) #define MIP_MTIP (1 << 7) -#define SR_IE (1 << 0) -#define SR_IE1 (1 << 3) -#define SR_IE2 (1 << 6) -#define SR_IE3 (1 << 9) - +#define SIE_USIE (1 << 0) #define SIE_SSIE (1 << 1) +#define SIE_UTIE (1 << 4) #define SIE_STIE (1 << 5) +#define MIP_SEIP (1 << 9) + /* Note: sip register has no SIP_STIP bit in Spike simulator */ #define SIP_SSIP (1 << 1) #define SIP_STIP (1 << 5) +#if 0 +/* lowRISC TODO */ #define NCSRS 4096 #define CSR_IPI 0x783 #define CSR_IO_IRQ 0x7c0 /* lowRISC only? */ +#endif + #define XLEN 8 #define INSN_SIZE 4 #define RISCV_INSN_NOP 0x00000013 #define RISCV_INSN_BREAK 0x00100073 #define RISCV_INSN_RET 0x00008067 #define CSR_ZIMM(val) \ (__builtin_constant_p(val) && ((u_long)(val) < 32)) #define csr_swap(csr, val) \ ({ if (CSR_ZIMM(val)) \ __asm __volatile("csrrwi %0, " #csr ", %1" \ : "=r" (val) : "i" (val)); \ else \ __asm __volatile("csrrw %0, " #csr ", %1" \ : "=r" (val) : "r" (val)); \ val; \ }) #define csr_write(csr, val) \ ({ if (CSR_ZIMM(val)) \ __asm __volatile("csrwi " #csr ", %0" :: "i" (val)); \ else \ __asm __volatile("csrw " #csr ", %0" :: "r" (val)); \ }) #define csr_set(csr, val) \ ({ if (CSR_ZIMM(val)) \ __asm __volatile("csrsi " #csr ", %0" :: "i" (val)); \ else \ __asm __volatile("csrs " #csr ", %0" :: "r" (val)); \ }) #define csr_clear(csr, val) \ ({ if (CSR_ZIMM(val)) \ __asm __volatile("csrci " #csr ", %0" :: "i" (val)); \ else \ __asm __volatile("csrc " #csr ", %0" :: "r" (val)); \ }) #define csr_read(csr) \ ({ u_long val; \ __asm __volatile("csrr %0, " #csr : "=r" (val)); \ val; \ }) #endif /* !_MACHINE_RISCVREG_H_ */ Index: user/alc/PQ_LAUNDRY/sys/riscv/include/vmparam.h =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/include/vmparam.h (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/include/vmparam.h (revision 303667) @@ -1,244 +1,244 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vmparam.h 5.9 (Berkeley) 5/12/91 * from: FreeBSD: src/sys/i386/include/vmparam.h,v 1.33 2000/03/30 * $FreeBSD$ */ #ifndef _MACHINE_VMPARAM_H_ #define _MACHINE_VMPARAM_H_ /* * Virtual memory related constants, all in bytes */ #ifndef MAXTSIZ #define MAXTSIZ (1*1024*1024*1024) /* max text size */ #endif #ifndef DFLDSIZ #define DFLDSIZ (128*1024*1024) /* initial data size limit */ #endif #ifndef MAXDSIZ #define MAXDSIZ (1*1024*1024*1024) /* max data size */ #endif #ifndef DFLSSIZ #define DFLSSIZ (128*1024*1024) /* initial stack size limit */ #endif #ifndef MAXSSIZ #define MAXSSIZ (1*1024*1024*1024) /* max stack size */ #endif #ifndef SGROWSIZ #define SGROWSIZ (128*1024) /* amount to grow stack */ #endif /* * The physical address space is sparsely populated. */ #define VM_PHYSSEG_SPARSE /* * The number of PHYSSEG entries must be one greater than the number * of phys_avail entries because the phys_avail entry that spans the * largest physical address that is accessible by ISA DMA is split * into two PHYSSEG entries. */ #define VM_PHYSSEG_MAX 64 /* * Create two free page pools: VM_FREEPOOL_DEFAULT is the default pool * from which physical pages are allocated and VM_FREEPOOL_DIRECT is * the pool from which physical pages for small UMA objects are * allocated. */ #define VM_NFREEPOOL 2 #define VM_FREEPOOL_DEFAULT 0 #define VM_FREEPOOL_DIRECT 1 /* * Create two free page lists: VM_FREELIST_DEFAULT is for physical * pages that are above the largest physical address that is * accessible by ISA DMA and VM_FREELIST_ISADMA is for physical pages * that are below that address. */ #define VM_NFREELIST 2 #define VM_FREELIST_DEFAULT 0 #define VM_FREELIST_ISADMA 1 /* * An allocation size of 16MB is supported in order to optimize the * use of the direct map by UMA. Specifically, a cache line contains * at most four TTEs, collectively mapping 16MB of physical memory. * By reducing the number of distinct 16MB "pages" that are used by UMA, * the physical memory allocator reduces the likelihood of both 4MB * page TLB misses and cache misses caused by 4MB page TLB misses. */ #define VM_NFREEORDER 12 /* * Enable superpage reservations: 1 level. */ #ifndef VM_NRESERVLEVEL #define VM_NRESERVLEVEL 1 #endif /* * Level 0 reservations consist of 512 pages. */ #ifndef VM_LEVEL_0_ORDER #define VM_LEVEL_0_ORDER 9 #endif /** * Address space layout. * * RISC-V implements up to a 48 bit virtual address space. The address space is * split into 2 regions at each end of the 64 bit address space, with an * out of range "hole" in the middle. 
* * We limit the size of the two spaces to 39 bits each. * * Upper region: 0xffffffffffffffff * 0xffffff8000000000 * * Hole: 0xffffff7fffffffff * 0x0000008000000000 * * Lower region: 0x0000007fffffffff * 0x0000000000000000 * * We use the upper region for the kernel, and the lower region for userland. * * We define some interesting address constants: * * VM_MIN_ADDRESS and VM_MAX_ADDRESS define the start and end of the entire * 64 bit address space, mostly just for convenience. * * VM_MIN_KERNEL_ADDRESS and VM_MAX_KERNEL_ADDRESS define the start and end of * mappable kernel virtual address space. * * VM_MIN_USER_ADDRESS and VM_MAX_USER_ADDRESS define the start and end of the * user address space. */ #define VM_MIN_ADDRESS (0x0000000000000000UL) #define VM_MAX_ADDRESS (0xffffffffffffffffUL) /* 32 GiB of kernel addresses */ #define VM_MIN_KERNEL_ADDRESS (0xffffffc000000000UL) #define VM_MAX_KERNEL_ADDRESS (0xffffffc800000000UL) /* Direct Map for 128 GiB of PA: 0x0 - 0x1fffffffff */ #define DMAP_MIN_ADDRESS (0xffffffd000000000UL) #define DMAP_MAX_ADDRESS (0xffffffefffffffffUL) #define DMAP_MIN_PHYSADDR (0x0000000000000000UL) #define DMAP_MAX_PHYSADDR (DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) /* True if pa is in the dmap range */ #define PHYS_IN_DMAP(pa) ((pa) >= DMAP_MIN_PHYSADDR && \ (pa) <= DMAP_MAX_PHYSADDR) /* True if va is in the dmap range */ #define VIRT_IN_DMAP(va) ((va) >= DMAP_MIN_ADDRESS && \ (va) <= DMAP_MAX_ADDRESS) #define PHYS_TO_DMAP(pa) \ ({ \ KASSERT(PHYS_IN_DMAP(pa), \ ("%s: PA out of range, PA: 0x%lx", __func__, \ (vm_paddr_t)(pa))); \ (pa) | DMAP_MIN_ADDRESS; \ }) #define DMAP_TO_PHYS(va) \ ({ \ KASSERT(VIRT_IN_DMAP(va), \ ("%s: VA out of range, VA: 0x%lx", __func__, \ (vm_offset_t)(va))); \ (va) & ~DMAP_MIN_ADDRESS; \ }) #define VM_MIN_USER_ADDRESS (0x0000000000000000UL) #define VM_MAX_USER_ADDRESS (0x0000004000000000UL) #define VM_MINUSER_ADDRESS (VM_MIN_USER_ADDRESS) #define VM_MAXUSER_ADDRESS (VM_MAX_USER_ADDRESS) #define KERNBASE (VM_MIN_KERNEL_ADDRESS) #define SHAREDPAGE (VM_MAXUSER_ADDRESS - PAGE_SIZE) #define USRSTACK SHAREDPAGE -#define KERNENTRY (0x200) +#define KERNENTRY (0x80000000) /* * How many physical pages per kmem arena virtual page. */ #ifndef VM_KMEM_SIZE_SCALE #define VM_KMEM_SIZE_SCALE (3) #endif /* * Optional floor (in bytes) on the size of the kmem arena. */ #ifndef VM_KMEM_SIZE_MIN #define VM_KMEM_SIZE_MIN (16 * 1024 * 1024) #endif /* * Optional ceiling (in bytes) on the size of the kmem arena: 60% of the * kernel map. */ #ifndef VM_KMEM_SIZE_MAX #define VM_KMEM_SIZE_MAX ((VM_MAX_KERNEL_ADDRESS - \ VM_MIN_KERNEL_ADDRESS + 1) * 3 / 5) #endif /* * Initial pagein size of beginning of executable file. */ #ifndef VM_INITIAL_PAGEIN #define VM_INITIAL_PAGEIN 16 #endif /* * RISCVTODO * #define UMA_MD_SMALL_ALLOC */ extern u_int tsb_kernel_ldd_phys; extern vm_offset_t vm_max_kernel_address; extern vm_offset_t init_pt_va; #define ZERO_REGION_SIZE (64 * 1024) /* 64KB */ #define DEVMAP_MAX_VADDR VM_MAX_KERNEL_ADDRESS #endif /* !_MACHINE_VMPARAM_H_ */ Index: user/alc/PQ_LAUNDRY/sys/riscv/riscv/exception.S =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/riscv/exception.S (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/riscv/exception.S (revision 303667) @@ -1,611 +1,622 @@ /*- * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. 
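Because DMAP_MIN_ADDRESS is aligned far above DMAP_MAX_PHYSADDR, the direct-map translation degenerates to bit masking: PHYS_TO_DMAP() is an OR and DMAP_TO_PHYS() an AND, as the macro bodies show. A round-trip sketch:

    /* Usage sketch: DMAP translation is pure masking, no arithmetic. */
    static vm_paddr_t
    dmap_roundtrip(vm_paddr_t pa)
    {
    	vm_offset_t va;

    	va = PHYS_TO_DMAP(pa);		/* pa | DMAP_MIN_ADDRESS */
    	return (DMAP_TO_PHYS(va));	/* va & ~DMAP_MIN_ADDRESS */
    }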
* * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "assym.s" #include #include .macro save_registers el addi sp, sp, -(TF_SIZE) sd ra, (TF_RA)(sp) sd tp, (TF_TP)(sp) .if \el == 0 /* We came from userspace. Load our pcpu */ sd gp, (TF_GP)(sp) ld gp, (TF_SIZE)(sp) .endif sd t0, (TF_T + 0 * 8)(sp) sd t1, (TF_T + 1 * 8)(sp) sd t2, (TF_T + 2 * 8)(sp) sd t3, (TF_T + 3 * 8)(sp) sd t4, (TF_T + 4 * 8)(sp) sd t5, (TF_T + 5 * 8)(sp) sd t6, (TF_T + 6 * 8)(sp) sd s0, (TF_S + 0 * 8)(sp) sd s1, (TF_S + 1 * 8)(sp) sd s2, (TF_S + 2 * 8)(sp) sd s3, (TF_S + 3 * 8)(sp) sd s4, (TF_S + 4 * 8)(sp) sd s5, (TF_S + 5 * 8)(sp) sd s6, (TF_S + 6 * 8)(sp) sd s7, (TF_S + 7 * 8)(sp) sd s8, (TF_S + 8 * 8)(sp) sd s9, (TF_S + 9 * 8)(sp) sd s10, (TF_S + 10 * 8)(sp) sd s11, (TF_S + 11 * 8)(sp) sd a0, (TF_A + 0 * 8)(sp) sd a1, (TF_A + 1 * 8)(sp) sd a2, (TF_A + 2 * 8)(sp) sd a3, (TF_A + 3 * 8)(sp) sd a4, (TF_A + 4 * 8)(sp) sd a5, (TF_A + 5 * 8)(sp) sd a6, (TF_A + 6 * 8)(sp) sd a7, (TF_A + 7 * 8)(sp) #if 0 /* XXX: temporary test: spin if stack is not kernel one */ .if \el == 1 /* kernel */ mv t0, sp srli t0, t0, 63 1: beqz t0, 1b .endif #endif .if \el == 1 /* Store kernel sp */ li t1, TF_SIZE add t0, sp, t1 sd t0, (TF_SP)(sp) .else /* Store user sp */ csrr t0, sscratch sd t0, (TF_SP)(sp) .endif li t0, 0 csrw sscratch, t0 csrr t0, sepc sd t0, (TF_SEPC)(sp) csrr t0, sstatus sd t0, (TF_SSTATUS)(sp) csrr t0, sbadaddr sd t0, (TF_SBADADDR)(sp) csrr t0, scause sd t0, (TF_SCAUSE)(sp) .endm .macro load_registers el ld t0, (TF_SSTATUS)(sp) .if \el == 0 /* Ensure user interrupts will be enabled on eret. */ - ori t0, t0, SSTATUS_PIE + li t1, SSTATUS_SPIE + or t0, t0, t1 .else /* * Disable interrupts for supervisor mode exceptions. * For user mode exceptions we have already done this * in do_ast. 
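The convention these macros establish: sscratch holds the kernel exception stack pointer while the CPU runs in user mode and is zeroed once save_registers has switched stacks, which is how the new cpu_exception_handler trampoline below tells the two cases apart (a zero sscratch means the trap came from the kernel). The TF_* offsets come from assym.s; a hypothetical C view of the frame, with the field order assumed rather than taken from frame.h:

    /* Sketch (assumed layout) of the frame save_registers fills in. */
    struct trapframe_sketch {
    	uint64_t tf_ra;
    	uint64_t tf_sp;
    	uint64_t tf_gp;
    	uint64_t tf_tp;
    	uint64_t tf_t[7];	/* t0-t6 */
    	uint64_t tf_s[12];	/* s0-s11 */
    	uint64_t tf_a[8];	/* a0-a7 */
    	uint64_t tf_sepc;
    	uint64_t tf_sstatus;
    	uint64_t tf_sbadaddr;
    	uint64_t tf_scause;
    };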
*/ - li t1, ~SSTATUS_IE + li t1, ~SSTATUS_SIE and t0, t0, t1 .endif csrw sstatus, t0 ld t0, (TF_SEPC)(sp) csrw sepc, t0 .if \el == 0 /* We go to userspace. Load user sp */ ld t0, (TF_SP)(sp) csrw sscratch, t0 /* And store our pcpu */ sd gp, (TF_SIZE)(sp) ld gp, (TF_GP)(sp) .endif ld ra, (TF_RA)(sp) ld tp, (TF_TP)(sp) ld t0, (TF_T + 0 * 8)(sp) ld t1, (TF_T + 1 * 8)(sp) ld t2, (TF_T + 2 * 8)(sp) ld t3, (TF_T + 3 * 8)(sp) ld t4, (TF_T + 4 * 8)(sp) ld t5, (TF_T + 5 * 8)(sp) ld t6, (TF_T + 6 * 8)(sp) ld s0, (TF_S + 0 * 8)(sp) ld s1, (TF_S + 1 * 8)(sp) ld s2, (TF_S + 2 * 8)(sp) ld s3, (TF_S + 3 * 8)(sp) ld s4, (TF_S + 4 * 8)(sp) ld s5, (TF_S + 5 * 8)(sp) ld s6, (TF_S + 6 * 8)(sp) ld s7, (TF_S + 7 * 8)(sp) ld s8, (TF_S + 8 * 8)(sp) ld s9, (TF_S + 9 * 8)(sp) ld s10, (TF_S + 10 * 8)(sp) ld s11, (TF_S + 11 * 8)(sp) ld a0, (TF_A + 0 * 8)(sp) ld a1, (TF_A + 1 * 8)(sp) ld a2, (TF_A + 2 * 8)(sp) ld a3, (TF_A + 3 * 8)(sp) ld a4, (TF_A + 4 * 8)(sp) ld a5, (TF_A + 5 * 8)(sp) ld a6, (TF_A + 6 * 8)(sp) ld a7, (TF_A + 7 * 8)(sp) addi sp, sp, (TF_SIZE) .endm .macro do_ast /* Disable interrupts */ csrr a4, sstatus 1: - csrci sstatus, SSTATUS_IE + csrci sstatus, (SSTATUS_SIE) ld a1, PC_CURTHREAD(gp) lw a2, TD_FLAGS(a1) li a3, (TDF_ASTPENDING|TDF_NEEDRESCHED) and a2, a2, a3 beqz a2, 2f /* Restore interrupts */ - andi a4, a4, SSTATUS_IE + andi a4, a4, (SSTATUS_SIE) csrs sstatus, a4 /* Handle the ast */ mv a0, sp call _C_LABEL(ast) /* Re-check for new ast scheduled */ j 1b 2: .endm +ENTRY(cpu_exception_handler) + csrrw sp, sscratch, sp + beqz sp, 1f + /* User mode detected */ + csrrw sp, sscratch, sp + j cpu_exception_handler_user +1: + /* Supervisor mode detected */ + csrrw sp, sscratch, sp + j cpu_exception_handler_supervisor +END(cpu_exception_handler) + ENTRY(cpu_exception_handler_supervisor) save_registers 1 mv a0, sp call _C_LABEL(do_trap_supervisor) load_registers 1 - eret + sret END(cpu_exception_handler_supervisor) ENTRY(cpu_exception_handler_user) csrrw sp, sscratch, sp save_registers 0 mv a0, sp call _C_LABEL(do_trap_user) do_ast load_registers 0 csrrw sp, sscratch, sp - eret + sret END(cpu_exception_handler_user) /* * Trap handlers */ .text bad_trap: j bad_trap -user_trap: +machine_trap: /* Save state */ csrrw sp, mscratch, sp addi sp, sp, -64 sd t0, (8 * 0)(sp) sd t1, (8 * 1)(sp) sd t2, (8 * 2)(sp) sd t3, (8 * 3)(sp) sd t4, (8 * 4)(sp) sd t5, (8 * 5)(sp) sd a0, (8 * 7)(sp) - la t2, _C_LABEL(cpu_exception_handler_user) - - csrr t0, mcause - bltz t0, machine_interrupt - j exit_mrts - -supervisor_trap: - /* Save state */ - csrrw sp, mscratch, sp - addi sp, sp, -64 - sd t0, (8 * 0)(sp) - sd t1, (8 * 1)(sp) - sd t2, (8 * 2)(sp) - sd t3, (8 * 3)(sp) - sd t4, (8 * 4)(sp) - sd t5, (8 * 5)(sp) - sd a0, (8 * 7)(sp) - - la t2, _C_LABEL(cpu_exception_handler_supervisor) - + csrr t3, mstatus /* Required for debug */ csrr t0, mcause bltz t0, machine_interrupt - li t1, EXCP_SMODE_ENV_CALL + li t1, EXCP_SUPERVISOR_ECALL beq t0, t1, supervisor_call - j exit_mrts +4: + /* NOT REACHED */ + j 4b machine_interrupt: /* Type of interrupt ? 
*/ csrr t0, mcause andi t0, t0, EXCP_MASK - li t1, 0 - beq t1, t0, software_interrupt - li t1, 1 - beq t1, t0, timer_interrupt - li t1, 2 - beq t1, t0, htif_interrupt +#if 0 + /* lowRISC TODO */ li t1, 4 beq t1, t0, io_interrupt /* lowRISC only */ +#endif + li t1, 1 + beq t1, t0, supervisor_software_interrupt + li t1, 3 + beq t1, t0, machine_software_interrupt + li t1, 5 + beq t1, t0, supervisor_timer_interrupt + li t1, 7 + beq t1, t0, machine_timer_interrupt - /* not reached */ + /* NOT REACHED */ 1: j 1b +#if 0 + /* lowRISC TODO */ io_interrupt: /* Disable IO interrupts so we can go to supervisor mode */ csrwi CSR_IO_IRQ, 0 /* Handle the trap in supervisor mode */ j exit_mrts +#endif -software_interrupt: +supervisor_software_interrupt: +1: + /* Nothing here as we are using mideleg feature */ + j 1b + +machine_software_interrupt: + /* Clear IPI */ + li t0, 0x40001000 + csrr t2, mhartid + li t3, 0x1000 + mul t2, t2, t3 + add t0, t0, t2 + li t2, 0 + sd t2, 0(t0) + + /* Clear machine software pending bit */ li t0, MIP_MSIP csrc mip, t0 + + /* Post supervisor software interrupt */ li t0, MIP_SSIP csrs mip, t0 - /* If PRV1 is PRV_U (user) then serve the trap */ - csrr t0, mstatus - li t1, (MSTATUS_PRV_M << MSTATUS_PRV1_SHIFT) - and t0, t0, t1 - beqz t0, 1f - - /* - * If PRV1 is supervisor and interrupts were enabled, - * then serve the trap. - */ - csrr t0, mstatus - li t1, (SR_IE1 | (MSTATUS_PRV_M << MSTATUS_PRV1_SHIFT)) - and t0, t0, t1 - li t1, (SR_IE1 | (MSTATUS_PRV_S << MSTATUS_PRV1_SHIFT)) - beq t0, t1, 1f - j exit +supervisor_timer_interrupt: 1: - /* Handle the trap in supervisor mode */ - j exit_mrts + /* Nothing here as we are using mideleg feature */ + j 1b -timer_interrupt: +machine_timer_interrupt: /* Disable machine timer interrupts */ li t0, MIE_MTIE csrc mie, t0 - /* Clear machine pending */ + /* Clear machine timer interrupt pending */ li t0, MIP_MTIP csrc mip, t0 /* Post supervisor timer interrupt */ li t0, MIP_STIP csrs mip, t0 - /* If PRV1 is PRV_U (user) then serve the trap */ - csrr t0, mstatus - li t1, (MSTATUS_PRV_M << MSTATUS_PRV1_SHIFT) - and t0, t0, t1 - beqz t0, 1f - /* - * If PRV1 is supervisor and interrupts were enabled, - * then serve the trap. + * Check for HTIF interrupts. + * The only interrupt expected here is key press. */ - csrr t0, mstatus - li t1, (SR_IE1 | (MSTATUS_PRV_M << MSTATUS_PRV1_SHIFT)) - and t0, t0, t1 - li t1, (SR_IE1 | (MSTATUS_PRV_S << MSTATUS_PRV1_SHIFT)) - beq t0, t1, 1f + la t0, htif_lock + li t2, 1 + amoswap.d t3, t2, 0(t0) + bnez t3, 5f /* Another operation in progress, give up */ - j exit + /* We have lock */ + la t1, fromhost + ld t5, 0(t1) + beqz t5, 4f -1: - /* Serve a trap in supervisor mode */ - j exit_mrts - -htif_interrupt: -1: - li t5, 0 - csrrw t5, mfromhost, t5 - beqz t5, 3f - - /* Console PUT intr ? */ + /* Console GET intr ? 
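   The top 16 bits of a fromhost doubleword carry (device << 8) | cmd,
   and 0x100 is taken here to be the console-input pair (device 1,
   cmd 0, an assumption consistent with the comparison below). Sketch:

	static int is_console_get(uint64_t fromhost_val) {
		return ((fromhost_val >> 48) == 0x100);
	}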
*/ mv t1, t5 - li t0, 0x101 + li t0, 0x100 srli t1, t1, 48 - bne t1, t0, 2f - /* Yes */ - la t0, console_intr - li t1, 1 - sd t1, 0(t0) - - /* Check if there is any other pending event */ + beq t1, t0, 2f +1: + /* There is no interrupts except keypress */ j 1b 2: /* Save entry */ la t0, htif_ring - csrr t1, mhartid - li t4, (HTIF_RING_SIZE + 16) - mulw t4, t4, t1 - add t0, t0, t4 li t4, (HTIF_RING_SIZE) add t0, t0, t4 /* t0 == htif_ring_cursor */ ld t1, 0(t0) /* load ptr to cursor */ sd t5, 0(t1) /* put entry */ li t4, 1 sd t4, 8(t1) /* mark used */ ld t4, 16(t1) /* take next */ /* Update cursor */ sd t4, 0(t0) /* Post supervisor software interrupt */ li t0, MIP_SSIP csrs mip, t0 - /* Check if there is any other pending event */ - j 1b - 3: + la t1, fromhost + li t5, 0 + sd t5, 0(t1) + +4: + /* Release lock */ + la t0, htif_lock + li t2, 0 + amoswap.d t3, t2, 0(t0) + +5: j exit supervisor_call: csrr t1, mepc addi t1, t1, 4 /* Next instruction in t1 */ li t4, ECALL_HTIF_CMD beq t5, t4, htif_cmd + li t4, ECALL_HTIF_CMD_REQ + beq t5, t4, htif_cmd_req + li t4, ECALL_HTIF_CMD_RESP + beq t5, t4, htif_cmd_resp li t4, ECALL_HTIF_GET_ENTRY beq t5, t4, htif_get_entry li t4, ECALL_MTIMECMP beq t5, t4, set_mtimecmp - li t4, ECALL_CLEAR_PENDING - beq t5, t4, clear_pending li t4, ECALL_MCPUID_GET beq t5, t4, mcpuid_get li t4, ECALL_MIMPID_GET beq t5, t4, mimpid_get li t4, ECALL_SEND_IPI beq t5, t4, send_ipi li t4, ECALL_CLEAR_IPI beq t5, t4, clear_ipi - li t4, ECALL_HTIF_LOWPUTC - beq t5, t4, htif_lowputc li t4, ECALL_MIE_SET beq t5, t4, mie_set +#if 0 + /* lowRISC TODO */ li t4, ECALL_IO_IRQ_MASK beq t5, t4, io_irq_mask +#endif j exit_next_instr +#if 0 + /* lowRISC TODO */ io_irq_mask: csrw CSR_IO_IRQ, t6 j exit_next_instr +#endif mie_set: csrs mie, t6 j exit_next_instr mcpuid_get: - csrr t6, mcpuid + csrr t6, misa j exit_next_instr mimpid_get: csrr t6, mimpid j exit_next_instr send_ipi: - /* CPU mmio base in t6 */ + /* CPU ipi MMIO register in t6 */ mv t0, t6 - li t2, (CSR_IPI * XLEN) - add t0, t0, t2 /* t0 = CSR_IPI */ li t2, 1 sd t2, 0(t0) j exit_next_instr clear_ipi: /* Do only clear if there are no new entries in HTIF ring */ la t0, htif_ring - csrr t2, mhartid - li t4, (HTIF_RING_SIZE + 16) - mulw t4, t4, t2 - add t0, t0, t4 li t4, (HTIF_RING_SIZE) add t0, t0, t4 /* t0 == ptr to htif_ring_cursor */ ld t2, 8(t0) /* load htif_ring_last */ ld t2, 8(t2) /* load used */ bnez t2, 1f /* Clear supervisor software interrupt pending bit */ li t0, MIP_SSIP csrc mip, t0 1: j exit_next_instr htif_get_entry: /* Get a htif_ring for current core */ la t0, htif_ring - csrr t2, mhartid - li t4, (HTIF_RING_SIZE + 16) - mulw t4, t4, t2 - add t0, t0, t4 li t4, (HTIF_RING_SIZE + 8) add t0, t0, t4 /* t0 == htif_ring_last */ /* Check for new entries */ li t6, 0 /* preset return value */ ld t2, 0(t0) /* load ptr to last */ ld t4, 8(t2) /* get used */ beqz t4, 1f /* No new entries. Exit */ /* Get one */ ld t6, 0(t2) /* get entry */ li t4, 0 sd t4, 8(t2) /* mark free */ sd t4, 0(t2) /* free entry, just in case */ ld t4, 16(t2) /* take next */ sd t4, 0(t0) /* update ptr to last */ 1: /* Exit. Result is stored in t6 */ j exit_next_instr -htif_cmd: +htif_cmd_resp: + la t0, htif_lock + li t2, 1 1: - mv t0, t6 - csrrw t0, mtohost, t0 - bnez t0, 1b + amoswap.d t3, t2, 0(t0) + bnez t3, 1b + + /* We have lock. 
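   (Taken with the amoswap.d spin loop above; the same acquire/release
   protocol as a self-contained C sketch, assuming GCC __atomic
   builtins and a stand-in for htif_lock:

	static volatile long lock;
	while (__atomic_exchange_n(&lock, 1, __ATOMIC_ACQUIRE))
		;			(spin until the old value is 0)
	(critical section: touch tohost/fromhost)
	__atomic_store_n(&lock, 0, __ATOMIC_RELEASE);

   the release side is the amoswap.d of 0 further down.)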
Read for data */ + la t4, fromhost + ld t6, 0(t4) + beqz t6, 2f + + /* Clear event */ + li t5, 0 + sd t5, 0(t4) + +2: + /* Release lock */ + la t0, htif_lock + li t2, 0 + amoswap.d t3, t2, 0(t0) + j exit_next_instr -htif_lowputc: +htif_cmd_req: + la t0, htif_lock + li t2, 1 1: - mv t0, t6 - csrrw t0, mtohost, t0 - bnez t0, 1b + amoswap.d t3, t2, 0(t0) + bnez t3, 1b + /* We have lock. Store new request */ + la t4, tohost + sd t6, 0(t4) + + /* Release lock */ + la t0, htif_lock + li t2, 0 + amoswap.d t3, t2, 0(t0) + + j exit_next_instr + +htif_cmd: + la t0, htif_lock + li t2, 1 +1: + amoswap.d t3, t2, 0(t0) + bnez t3, 1b + + mv t3, t6 + + /* We have lock. Store new request */ + la t4, tohost + sd t6, 0(t4) 2: - li t4, 0 - csrrw t5, mfromhost, t4 - beqz t5, 2b + /* Poll for result */ + la t4, fromhost + ld t6, 0(t4) + beqz t6, 2b - /* Console PUT intr ? */ - mv t2, t5 - srli t2, t2, 48 - li t3, 0x0101 - beq t2, t3, 3f + /* Check for unexpected event */ + srli t0, t6, 48 + srli t2, t3, 48 + beq t2, t0, 3f - /* Not a console PUT, so save entry */ + /* + * We have something unexpected (e.g. keyboard keypress) + * Save entry. + */ la t0, htif_ring - csrr t2, mhartid - li t4, (HTIF_RING_SIZE + 16) - mulw t4, t4, t2 - add t0, t0, t4 li t4, (HTIF_RING_SIZE) add t0, t0, t4 /* t0 == htif_ring_cursor */ ld t2, 0(t0) /* load ptr to cursor */ - sd t5, 0(t2) /* put entry */ + sd t6, 0(t2) /* put entry */ li t4, 1 sd t4, 8(t2) /* mark used */ ld t4, 16(t2) /* take next */ /* Update cursor */ sd t4, 0(t0) /* Post supervisor software interrupt */ li t0, MIP_SSIP csrs mip, t0 - /* Wait for console intr again */ + /* Clear and look for response again */ + la t2, fromhost + li t5, 0 + sd t5, 0(t2) j 2b 3: + la t2, fromhost + li t5, 0 + sd t5, 0(t2) + + /* Release lock */ + la t0, htif_lock + li t2, 0 + amoswap.d t3, t2, 0(t0) + j exit_next_instr set_mtimecmp: - csrr t2, stime - add t6, t6, t2 - csrw mtimecmp, t6 - /* Enable interrupts */ li t0, (MIE_MTIE | MIE_STIE) csrs mie, t0 j exit_next_instr -clear_pending: - li t0, MIP_STIP - csrc mip, t0 - j exit_next_instr - /* * Trap exit functions */ exit_next_instr: /* Next instruction is in t1 */ csrw mepc, t1 exit: /* Restore state */ ld t0, (8 * 0)(sp) ld t1, (8 * 1)(sp) ld t2, (8 * 2)(sp) ld t3, (8 * 3)(sp) ld t4, (8 * 4)(sp) ld t5, (8 * 5)(sp) ld a0, (8 * 7)(sp) addi sp, sp, 64 csrrw sp, mscratch, sp - eret + mret -/* - * Redirect to supervisor - */ exit_mrts: - /* Setup exception handler */ - li t1, KERNBASE - add t2, t2, t1 - csrw stvec, t2 - - /* Restore state */ - ld t0, (8 * 0)(sp) - ld t1, (8 * 1)(sp) - ld t2, (8 * 2)(sp) - ld t3, (8 * 3)(sp) - ld t4, (8 * 4)(sp) - ld t5, (8 * 5)(sp) - ld a0, (8 * 7)(sp) - addi sp, sp, 64 - csrrw sp, mscratch, sp - - /* Redirect to supervisor */ - mrts + j exit_mrts Index: user/alc/PQ_LAUNDRY/sys/riscv/riscv/genassym.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/riscv/genassym.c (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/riscv/genassym.c (revision 303667) @@ -1,99 +1,100 @@ /*- * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include ASSYM(KERNBASE, KERNBASE); +ASSYM(KERNENTRY, KERNENTRY); ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS); ASSYM(VM_MAX_KERNEL_ADDRESS, VM_MAX_KERNEL_ADDRESS); ASSYM(TDF_ASTPENDING, TDF_ASTPENDING); ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); ASSYM(PCB_L1ADDR, offsetof(struct pcb, pcb_l1addr)); ASSYM(PCB_SIZE, sizeof(struct pcb)); ASSYM(PCB_RA, offsetof(struct pcb, pcb_ra)); ASSYM(PCB_SP, offsetof(struct pcb, pcb_sp)); ASSYM(PCB_GP, offsetof(struct pcb, pcb_gp)); ASSYM(PCB_TP, offsetof(struct pcb, pcb_tp)); ASSYM(PCB_T, offsetof(struct pcb, pcb_t)); ASSYM(PCB_S, offsetof(struct pcb, pcb_s)); ASSYM(PCB_A, offsetof(struct pcb, pcb_a)); ASSYM(SF_UC, offsetof(struct sigframe, sf_uc)); ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb)); ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_PROC, offsetof(struct thread, td_proc)); ASSYM(TD_FRAME, offsetof(struct thread, td_frame)); ASSYM(TD_MD, offsetof(struct thread, td_md)); ASSYM(TD_LOCK, offsetof(struct thread, td_lock)); ASSYM(TF_SIZE, sizeof(struct trapframe)); ASSYM(TF_RA, offsetof(struct trapframe, tf_ra)); ASSYM(TF_SP, offsetof(struct trapframe, tf_sp)); ASSYM(TF_GP, offsetof(struct trapframe, tf_gp)); ASSYM(TF_TP, offsetof(struct trapframe, tf_tp)); ASSYM(TF_T, offsetof(struct trapframe, tf_t)); ASSYM(TF_S, offsetof(struct trapframe, tf_s)); ASSYM(TF_A, offsetof(struct trapframe, tf_a)); ASSYM(TF_SEPC, offsetof(struct trapframe, tf_sepc)); ASSYM(TF_SBADADDR, offsetof(struct trapframe, tf_sbadaddr)); ASSYM(TF_SCAUSE, offsetof(struct trapframe, tf_scause)); ASSYM(TF_SSTATUS, offsetof(struct trapframe, tf_sstatus)); Index: user/alc/PQ_LAUNDRY/sys/riscv/riscv/identcpu.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/riscv/identcpu.c (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/riscv/identcpu.c (revision 303667) @@ -1,149 +1,136 @@ /*- - * Copyright (c) 2015 Ruslan Bukin + * Copyright 
(c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include char machine[] = "riscv"; SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, "Machine class"); struct cpu_desc { u_int cpu_impl; u_int cpu_part_num; const char *cpu_impl_name; const char *cpu_part_name; }; struct cpu_desc cpu_desc[MAXCPU]; struct cpu_parts { u_int part_id; const char *part_name; }; #define CPU_PART_NONE { -1, "Unknown Processor" } struct cpu_implementers { u_int impl_id; const char *impl_name; - /* - * Part number is implementation defined - * so each vendor will have its own set of values and names. - */ - const struct cpu_parts *cpu_parts; }; -#define CPU_IMPLEMENTER_NONE { 0, "Unknown Implementer", cpu_parts_none } +#define CPU_IMPLEMENTER_NONE { 0, "Unknown Implementer" } /* - * Per-implementer table of (PartNum, CPU Name) pairs. + * CPU base */ -/* UC Berkeley */ -static const struct cpu_parts cpu_parts_ucb[] = { - { CPU_PART_RV32I, "RV32I" }, - { CPU_PART_RV32E, "RV32E" }, - { CPU_PART_RV64I, "RV64I" }, - { CPU_PART_RV128I, "RV128I" }, +static const struct cpu_parts cpu_parts_std[] = { + { CPU_PART_RV32, "RV32" }, + { CPU_PART_RV64, "RV64" }, + { CPU_PART_RV128, "RV128" }, CPU_PART_NONE, }; -/* Unknown */ -static const struct cpu_parts cpu_parts_none[] = { - CPU_PART_NONE, -}; - /* * Implementers table. 
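 *
 * identify_cpu() below scans this table until the implementer id
 * taken from mimpid matches, or the terminating impl_id == 0 entry is
 * reached; roughly:
 *
 *	for (i = 0; i < nitems(cpu_implementers); i++)
 *		if (impl_id == cpu_implementers[i].impl_id ||
 *		    cpu_implementers[i].impl_id == 0)
 *			break;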
*/ const struct cpu_implementers cpu_implementers[] = { - { CPU_IMPL_UCB_ROCKET, "UC Berkeley Rocket", cpu_parts_ucb }, + { CPU_IMPL_UCB_ROCKET, "UC Berkeley Rocket" }, CPU_IMPLEMENTER_NONE, }; void identify_cpu(void) { const struct cpu_parts *cpu_partsp; uint32_t part_id; uint32_t impl_id; uint64_t mimpid; - uint64_t mcpuid; + uint64_t misa; u_int cpu; size_t i; cpu_partsp = NULL; mimpid = machine_command(ECALL_MIMPID_GET, 0); - mcpuid = machine_command(ECALL_MCPUID_GET, 0); + misa = machine_command(ECALL_MCPUID_GET, 0); - /* SMPTODO: use mhartid ? */ cpu = PCPU_GET(cpuid); impl_id = CPU_IMPL(mimpid); for (i = 0; i < nitems(cpu_implementers); i++) { if (impl_id == cpu_implementers[i].impl_id || cpu_implementers[i].impl_id == 0) { cpu_desc[cpu].cpu_impl = impl_id; cpu_desc[cpu].cpu_impl_name = cpu_implementers[i].impl_name; - cpu_partsp = cpu_implementers[i].cpu_parts; + cpu_partsp = cpu_parts_std; break; } } - part_id = CPU_PART(mcpuid); + part_id = CPU_PART(misa); for (i = 0; &cpu_partsp[i] != NULL; i++) { if (part_id == cpu_partsp[i].part_id || cpu_partsp[i].part_id == -1) { cpu_desc[cpu].cpu_part_num = part_id; cpu_desc[cpu].cpu_part_name = cpu_partsp[i].part_name; break; } } /* Print details for boot CPU or if we want verbose output */ if (cpu == 0 || bootverbose) { printf("CPU(%d): %s %s\n", cpu, cpu_desc[cpu].cpu_impl_name, cpu_desc[cpu].cpu_part_name); } } Index: user/alc/PQ_LAUNDRY/sys/riscv/riscv/intr_machdep.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/riscv/intr_machdep.c (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/riscv/intr_machdep.c (revision 303667) @@ -1,304 +1,315 @@ /*- * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif u_long intrcnt[NIRQS]; size_t sintrcnt = sizeof(intrcnt); char intrnames[NIRQS * (MAXCOMLEN + 1) * 2]; size_t sintrnames = sizeof(intrnames); static struct intr_event *intr_events[NIRQS]; static riscv_intrcnt_t riscv_intr_counters[NIRQS]; static int intrcnt_index; riscv_intrcnt_t riscv_intrcnt_create(const char* name) { riscv_intrcnt_t counter; counter = &intrcnt[intrcnt_index++]; riscv_intrcnt_setname(counter, name); return (counter); } void riscv_intrcnt_setname(riscv_intrcnt_t counter, const char *name) { int i; i = (counter - intrcnt); KASSERT(counter != NULL, ("riscv_intrcnt_setname: NULL counter")); snprintf(intrnames + (MAXCOMLEN + 1) * i, MAXCOMLEN + 1, "%-*s", MAXCOMLEN, name); } static void riscv_mask_irq(void *source) { uintptr_t irq; irq = (uintptr_t)source; switch (irq) { - case IRQ_TIMER: + case IRQ_TIMER_SUPERVISOR: csr_clear(sie, SIE_STIE); break; - case IRQ_SOFTWARE: + case IRQ_SOFTWARE_USER: + csr_clear(sie, SIE_USIE); + case IRQ_SOFTWARE_SUPERVISOR: csr_clear(sie, SIE_SSIE); break; +#if 0 + /* lowRISC TODO */ case IRQ_UART: machine_command(ECALL_IO_IRQ_MASK, 0); break; +#endif default: panic("Unknown irq %d\n", irq); } } static void riscv_unmask_irq(void *source) { uintptr_t irq; irq = (uintptr_t)source; switch (irq) { - case IRQ_TIMER: + case IRQ_TIMER_SUPERVISOR: csr_set(sie, SIE_STIE); break; - case IRQ_SOFTWARE: + case IRQ_SOFTWARE_USER: + csr_set(sie, SIE_USIE); + break; + case IRQ_SOFTWARE_SUPERVISOR: csr_set(sie, SIE_SSIE); break; +#if 0 + /* lowRISC TODO */ case IRQ_UART: machine_command(ECALL_IO_IRQ_MASK, 1); break; +#endif default: panic("Unknown irq %d\n", irq); } } void riscv_init_interrupts(void) { char name[MAXCOMLEN + 1]; int i; for (i = 0; i < NIRQS; i++) { snprintf(name, MAXCOMLEN + 1, "int%d:", i); riscv_intr_counters[i] = riscv_intrcnt_create(name); } } int riscv_setup_intr(const char *name, driver_filter_t *filt, void (*handler)(void*), void *arg, int irq, int flags, void **cookiep) { struct intr_event *event; int error; if (irq < 0 || irq >= NIRQS) panic("%s: unknown intr %d", __func__, irq); event = intr_events[irq]; if (event == NULL) { error = intr_event_create(&event, (void *)(uintptr_t)irq, 0, irq, riscv_mask_irq, riscv_unmask_irq, NULL, NULL, "int%d", irq); if (error) return (error); intr_events[irq] = event; riscv_unmask_irq((void*)(uintptr_t)irq); } error = intr_event_add_handler(event, name, filt, handler, arg, intr_priority(flags), flags, cookiep); if (error) { printf("Failed to setup intr: %d\n", irq); return (error); } riscv_intrcnt_setname(riscv_intr_counters[irq], event->ie_fullname); return (0); } int riscv_teardown_intr(void *ih) { /* TODO */ return (0); } int riscv_config_intr(u_int irq, enum intr_trigger trig, enum intr_polarity pol) { /* There is no configuration for interrupts */ return (0); } void riscv_cpu_intr(struct trapframe *frame) { struct intr_event *event; int active_irq; critical_enter(); KASSERT(frame->tf_scause & EXCP_INTR, ("riscv_cpu_intr: wrong frame passed")); active_irq = (frame->tf_scause & EXCP_MASK); switch (active_irq) { +#if 0 + /* lowRISC TODO */ case IRQ_UART: - case IRQ_SOFTWARE: - case IRQ_TIMER: +#endif + case IRQ_SOFTWARE_USER: + case IRQ_SOFTWARE_SUPERVISOR: + case IRQ_TIMER_SUPERVISOR: event = intr_events[active_irq]; /* Update counters */ atomic_add_long(riscv_intr_counters[active_irq], 1); PCPU_INC(cnt.v_intr); break; - case IRQ_HTIF: - /* 
HTIF interrupts are only handled in machine mode */ - panic("%s: HTIF interrupt", __func__); - break; default: event = NULL; } if (!event || TAILQ_EMPTY(&event->ie_handlers) || (intr_event_handle(event, frame) != 0)) printf("stray interrupt %d\n", active_irq); critical_exit(); } #ifdef SMP void riscv_setup_ipihandler(driver_filter_t *filt) { - riscv_setup_intr("ipi", filt, NULL, NULL, IRQ_SOFTWARE, + riscv_setup_intr("ipi", filt, NULL, NULL, IRQ_SOFTWARE_SUPERVISOR, INTR_TYPE_MISC, NULL); } void riscv_unmask_ipi(void) { csr_set(sie, SIE_SSIE); } /* Sending IPI */ static void ipi_send(struct pcpu *pc, int ipi) { CTR3(KTR_SMP, "%s: cpu=%d, ipi=%x", __func__, pc->pc_cpuid, ipi); atomic_set_32(&pc->pc_pending_ipis, ipi); machine_command(ECALL_SEND_IPI, pc->pc_reg); CTR1(KTR_SMP, "%s: sent", __func__); } void ipi_all_but_self(u_int ipi) { cpuset_t other_cpus; other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); ipi_selected(other_cpus, ipi); } void ipi_cpu(int cpu, u_int ipi) { cpuset_t cpus; CPU_ZERO(&cpus); CPU_SET(cpu, &cpus); CTR3(KTR_SMP, "%s: cpu: %d, ipi: %x\n", __func__, cpu, ipi); ipi_send(cpuid_to_pcpu[cpu], ipi); } void ipi_selected(cpuset_t cpus, u_int ipi) { struct pcpu *pc; CTR1(KTR_SMP, "ipi_selected: ipi: %x", ipi); STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (CPU_ISSET(pc->pc_cpuid, &cpus)) { CTR3(KTR_SMP, "%s: pc: %p, ipi: %x\n", __func__, pc, ipi); ipi_send(pc, ipi); } } } #endif Index: user/alc/PQ_LAUNDRY/sys/riscv/riscv/locore.S =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/riscv/locore.S (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/riscv/locore.S (revision 303667) @@ -1,382 +1,392 @@ /*- * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include "assym.s" #include #include #include #include #include #include #define HTIF_RING_NENTRIES (512) #define HTIF_RING_ENTRY_SZ (24) #define HTIF_RING_SIZE (HTIF_RING_ENTRY_SZ * HTIF_RING_NENTRIES) #define HW_STACK_SIZE (96) /* - * Event queue for each CPU core: + * Event queue: * * struct htif_ring { * uint64_t data; * uint64_t used; * uint64_t next; * } htif_ring[HTIF_RING_NENTRIES]; * uint64_t htif_ring_cursor; * uint64_t htif_ring_last; */ .macro build_ring la t0, htif_ring -#ifdef SMP - csrr a0, mhartid - li s0, (HTIF_RING_SIZE + 16) - mulw s0, a0, s0 - add t0, t0, s0 -#endif li t1, 0 sd t1, 0(t0) /* zero data */ sd t1, 8(t0) /* zero used */ mv t2, t0 mv t3, t0 li t5, (HTIF_RING_SIZE) li t6, 0 add t4, t0, t5 1: addi t3, t3, HTIF_RING_ENTRY_SZ /* pointer to next */ beq t3, t4, 2f /* finish */ sd t3, 16(t2) /* store pointer */ addi t2, t2, HTIF_RING_ENTRY_SZ /* next entry */ addi t6, t6, 1 /* counter */ j 1b 2: addi t3, t3, -HTIF_RING_ENTRY_SZ sd t0, 16(t3) /* last -> first */ li t2, (HTIF_RING_SIZE) add s0, t0, t2 sd t0, 0(s0) /* cursor */ sd t0, 8(s0) /* last */ /* finish building ring */ .endm .globl kernbase .set kernbase, KERNBASE /* Trap entries */ .text mentry: - /* User mode entry point (mtvec + 0x000) */ - .align 6 - j user_trap + /* Vectors */ + j _start /* reset */ + j bad_trap /* NMI (non-maskable interrupt) */ + j machine_trap - /* Supervisor mode entry point (mtvec + 0x040) */ - .align 6 - j supervisor_trap - - /* Hypervisor mode entry point (mtvec + 0x080) */ - .align 6 - j bad_trap - - /* Machine mode entry point (mtvec + 0x0C0) */ - .align 6 - j bad_trap - /* Reset vector */ .text - .align 8 .globl _start _start: + /* Setup machine trap vector */ + la t0, machine_trap + csrw mtvec, t0 + + /* Delegate interrupts to supervisor mode */ + li t0, (MIP_SSIP | MIP_STIP | MIP_SEIP) + csrw mideleg, t0 + + /* Delegate exceptions to supervisor mode */ + li t0, (1 << EXCP_MISALIGNED_FETCH) | \ + (1 << EXCP_FAULT_FETCH) | \ + (1 << EXCP_ILLEGAL_INSTRUCTION) | \ + (1 << EXCP_FAULT_LOAD) | \ + (1 << EXCP_FAULT_STORE) | \ + (1 << EXCP_BREAKPOINT) | \ + (1 << EXCP_USER_ECALL) + csrw medeleg, t0 + + la t0, cpu_exception_handler + li t1, KERNBASE + add t0, t0, t1 + csrw stvec, t0 + /* Direct secondary cores to mpentry */ csrr a0, mhartid bnez a0, mpentry + li t1, 0 + la t0, tohost + sd t1, 0(t0) + la t0, fromhost + sd t1, 0(t0) + /* Build event queue for current core */ build_ring /* Setup machine-mode stack for CPU 0 */ la t0, hardstack_end csrw mscratch, t0 li t0, 0 csrw sscratch, t0 li s10, PAGE_SIZE li s9, (PAGE_SIZE * KSTACK_PAGES) /* Page tables */ /* Create an L1 page for early devmap */ la s1, pagetable_l1 la s2, pagetable_l2_devmap /* Link to next level PN */ srli s2, s2, PAGE_SHIFT li a5, (VM_MAX_KERNEL_ADDRESS - L2_SIZE) srli a5, a5, L1_SHIFT /* >> L1_SHIFT */ andi a5, a5, 0x1ff /* & 0x1ff */ - li t4, (PTE_VALID | (PTE_TYPE_PTR << PTE_TYPE_S)) + li t4, PTE_V slli t5, s2, PTE_PPN0_S /* (s2 << PTE_PPN0_S) */ or t6, t4, t5 /* Store single level1 PTE entry to position */ li a6, PTE_SIZE mulw a5, a5, a6 add t0, s1, a5 sd t6, (t0) /* Add single Level 1 entry for kernel */ la s1, pagetable_l1 la s2, pagetable_l2 /* Link to next level PN */ srli s2, s2, PAGE_SHIFT - li a5, KERNBASE + li a5, (KERNBASE + KERNENTRY) srli a5, a5, L1_SHIFT /* >> L1_SHIFT */ andi a5, a5, 0x1ff /* & 0x1ff */ - li t4, (PTE_VALID | (PTE_TYPE_PTR << PTE_TYPE_S)) + li t4, PTE_V slli t5, s2, PTE_PPN0_S /* (s2 << PTE_PPN0_S) */ or t6, t4, t5 /* Store single level1 PTE entry to position */ li 
a6, PTE_SIZE mulw a5, a5, a6 add t0, s1, a5 sd t6, (t0) /* Level 2 superpages (512 x 2MiB) */ la s1, pagetable_l2 - li t3, 512 /* Build 512 entries */ - li t4, 0 /* Counter */ + li t4, KERNENTRY + srli t4, t4, 21 /* Div by 2 MiB */ + li t2, 512 /* Build 512 entries */ + add t3, t4, t2 li t5, 0 2: - li t0, (PTE_VALID | (PTE_TYPE_SRWX << PTE_TYPE_S)) + li t0, (PTE_V | PTE_RWX) slli t2, t4, PTE_PPN1_S /* << PTE_PPN1_S */ or t5, t0, t2 sd t5, (s1) /* Store PTE entry to position */ addi s1, s1, PTE_SIZE addi t4, t4, 1 bltu t4, t3, 2b /* Set page tables base register */ la s1, pagetable_l1 + srli s1, s1, PAGE_SHIFT csrw sptbr, s1 /* Page tables END */ /* Enter supervisor mode */ li s0, ((MSTATUS_VM_SV39 << MSTATUS_VM_SHIFT) | \ - (MSTATUS_PRV_M << MSTATUS_PRV_SHIFT) | \ - (MSTATUS_PRV_S << MSTATUS_PRV1_SHIFT) | \ - (MSTATUS_PRV_U << MSTATUS_PRV2_SHIFT)); + (MSTATUS_PRV_S << MSTATUS_MPP_SHIFT)); csrw mstatus, s0 /* * Enable machine-mode software interrupts * so we can deliver IPI to this core. */ li t0, MIE_MSIE csrs mie, t0 /* Exit from machine mode */ la t0, .Lmmu_on li s11, KERNBASE add t0, t0, s11 csrw mepc, t0 - eret + mret .Lmmu_on: /* Initialize stack pointer */ la s3, initstack_end mv sp, s3 addi sp, sp, -PCB_SIZE /* Clear BSS */ la a0, _C_LABEL(__bss_start) la s1, _C_LABEL(_end) 1: sd zero, 0(a0) addi a0, a0, 8 bltu a0, s1, 1b /* Fill riscv_bootparams */ addi sp, sp, -16 la t0, pagetable_l1 sd t0, 0(sp) /* kern_l1pt */ la t0, initstack_end sd t0, 8(sp) /* kern_stack */ mv a0, sp call _C_LABEL(initriscv) /* Off we go */ call _C_LABEL(mi_startup) .align 4 initstack: .space (PAGE_SIZE * KSTACK_PAGES) initstack_end: hardstack: .space (HW_STACK_SIZE * MAXCPU) hardstack_end: .globl htif_ring htif_ring: - .space ((HTIF_RING_SIZE + 16) * MAXCPU) - - .globl console_intr -console_intr: + .space (HTIF_RING_SIZE + 16) +htif_lock: .space (8) +tohost: + .space (8) +fromhost: + .space (8) ENTRY(sigcode) mv a0, sp addi a0, a0, SF_UC 1: li t0, SYS_sigreturn ecall /* sigreturn failed, exit */ li t0, SYS_exit ecall j 1b END(sigcode) /* This may be copied to the stack, keep it 16-byte aligned */ .align 3 esigcode: .data .align 3 .global szsigcode szsigcode: .quad esigcode - sigcode .align 12 pagetable_l1: .space PAGE_SIZE pagetable_l2: .space PAGE_SIZE pagetable_l2_devmap: .space PAGE_SIZE .globl init_pt_va init_pt_va: .quad pagetable_l2 /* XXX: Keep page tables VA */ #ifndef SMP ENTRY(mpentry) 1: wfi j 1b END(mpentry) #else /* * mpentry(unsigned long) * * Called by a core when it is being brought online. * The data in x0 is passed straight to init_secondary. */ ENTRY(mpentry) /* * Calculate the offset to __riscv_boot_ap * for current core, cpuid in a0. */ li t1, 4 mulw t1, t1, a0 /* Get pointer */ la t0, __riscv_boot_ap add t0, t0, t1 1: /* Wait the kernel to be ready */ lw t1, 0(t0) beqz t1, 1b - /* Build event queue ring for this core */ - build_ring - /* Set page tables base register */ la t0, pagetable_l1 + srli t0, t0, PAGE_SHIFT csrw sptbr, t0 /* Configure mstatus */ li s0, ((MSTATUS_VM_SV39 << MSTATUS_VM_SHIFT) | \ - (MSTATUS_PRV_M << MSTATUS_PRV_SHIFT) | \ - (MSTATUS_PRV_S << MSTATUS_PRV1_SHIFT) | \ - (MSTATUS_PRV_U << MSTATUS_PRV2_SHIFT)); + (MSTATUS_PRV_S << MSTATUS_MPP_SHIFT)); csrw mstatus, s0 /* Setup stack for machine mode exceptions */ la t0, hardstack_end li t1, HW_STACK_SIZE mulw t1, t1, a0 sub t0, t0, t1 csrw mscratch, t0 li t0, 0 csrw sscratch, t0 /* * Enable machine-mode software interrupts * so we can deliver IPI to this core. 
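 *
 * Delivery is a store of 1 to the target core's IPI register, as
 * send_ipi in exception.S does with the address passed via
 * machine_command(ECALL_SEND_IPI, pc->pc_reg). A hypothetical C view,
 * assuming pc_reg follows the per-hart layout cleared in
 * machine_software_interrupt (base 0x40001000, stride 0x1000):
 *
 *	volatile uint64_t *ipi =
 *	    (volatile uint64_t *)(0x40001000UL + hartid * 0x1000UL);
 *	*ipi = 1;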
*/ li t0, MIE_MSIE csrs mie, t0 /* * Exit from machine mode and go to * the virtual address space. */ la t0, mp_virtdone li s11, KERNBASE add t0, t0, s11 csrw mepc, t0 - eret + mret mp_virtdone: /* We are now in virtual address space */ /* Setup stack pointer */ la t0, secondary_stacks li t1, (PAGE_SIZE * KSTACK_PAGES) mulw t1, t1, a0 add sp, t0, t1 call init_secondary END(mpentry) #endif #include "exception.S" Index: user/alc/PQ_LAUNDRY/sys/riscv/riscv/machdep.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/riscv/machdep.c (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/riscv/machdep.c (revision 303667) @@ -1,797 +1,798 @@ /*- * Copyright (c) 2014 Andrew Turner * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "opt_platform.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef VFP #include #endif #ifdef FDT #include #include #endif struct pcpu __pcpu[MAXCPU]; static struct trapframe proc0_tf; vm_paddr_t phys_avail[PHYS_AVAIL_SIZE + 2]; vm_paddr_t dump_avail[PHYS_AVAIL_SIZE + 2]; int early_boot = 1; int cold = 1; long realmem = 0; long Maxmem = 0; #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) vm_paddr_t physmap[PHYSMAP_SIZE]; u_int physmap_idx; struct kva_md_info kmi; int64_t dcache_line_size; /* The minimum D cache line size */ int64_t icache_line_size; /* The minimum I cache line size */ int64_t idcache_line_size; /* The minimum cache line size */ extern int *end; extern int *initstack_end; struct pcpu *pcpup; uintptr_t mcall_trap(uintptr_t mcause, uintptr_t* regs); uintptr_t mcall_trap(uintptr_t mcause, uintptr_t* regs) { return (0); } static void cpu_startup(void *dummy) { identify_cpu(); vm_ksubmap_init(&kmi); bufinit(); vm_pager_bufferinit(); } SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); int cpu_idle_wakeup(int cpu) { return (0); } void bzero(void *buf, size_t len) { uint8_t *p; p = buf; while(len-- > 0) *p++ = 0; } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *frame; frame = td->td_frame; regs->sepc = frame->tf_sepc; regs->sstatus = frame->tf_sstatus; regs->ra = frame->tf_ra; regs->sp = frame->tf_sp; regs->gp = frame->tf_gp; regs->tp = frame->tf_tp; memcpy(regs->t, frame->tf_t, sizeof(regs->t)); memcpy(regs->s, frame->tf_s, sizeof(regs->s)); memcpy(regs->a, frame->tf_a, sizeof(regs->a)); return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *frame; frame = td->td_frame; frame->tf_sepc = regs->sepc; frame->tf_sstatus = regs->sstatus; frame->tf_ra = regs->ra; frame->tf_sp = regs->sp; frame->tf_gp = regs->gp; frame->tf_tp = regs->tp; memcpy(frame->tf_t, regs->t, sizeof(frame->tf_t)); memcpy(frame->tf_s, regs->s, sizeof(frame->tf_s)); memcpy(frame->tf_a, regs->a, sizeof(frame->tf_a)); return (0); } int fill_fpregs(struct thread *td, struct fpreg *regs) { /* TODO */ bzero(regs, sizeof(*regs)); return (0); } int set_fpregs(struct thread *td, struct fpreg *regs) { /* TODO */ return (0); } int fill_dbregs(struct thread *td, struct dbreg *regs) { panic("fill_dbregs"); } int set_dbregs(struct thread *td, struct dbreg *regs) { panic("set_dbregs"); } int ptrace_set_pc(struct thread *td, u_long addr) { panic("ptrace_set_pc"); return (0); } int ptrace_single_step(struct thread *td) { /* TODO; */ return (0); } int ptrace_clear_single_step(struct thread *td) { /* TODO; */ return (0); } void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *tf; tf = td->td_frame; memset(tf, 0, sizeof(struct trapframe)); /* * We need to set a0 for init as it doesn't call * cpu_set_syscall_retval to copy the value. We also * need to set td_retval for the cases where we do. 
*/ tf->tf_a[0] = td->td_retval[0] = stack; tf->tf_sp = STACKALIGN(stack); tf->tf_ra = imgp->entry_addr; tf->tf_sepc = imgp->entry_addr; } /* Sanity check these are the same size, they will be memcpy'd to and fro */ CTASSERT(sizeof(((struct trapframe *)0)->tf_a) == sizeof((struct gpregs *)0)->gp_a); CTASSERT(sizeof(((struct trapframe *)0)->tf_s) == sizeof((struct gpregs *)0)->gp_s); CTASSERT(sizeof(((struct trapframe *)0)->tf_t) == sizeof((struct gpregs *)0)->gp_t); CTASSERT(sizeof(((struct trapframe *)0)->tf_a) == sizeof((struct reg *)0)->a); CTASSERT(sizeof(((struct trapframe *)0)->tf_s) == sizeof((struct reg *)0)->s); CTASSERT(sizeof(((struct trapframe *)0)->tf_t) == sizeof((struct reg *)0)->t); int get_mcontext(struct thread *td, mcontext_t *mcp, int clear_ret) { struct trapframe *tf = td->td_frame; memcpy(mcp->mc_gpregs.gp_t, tf->tf_t, sizeof(mcp->mc_gpregs.gp_t)); memcpy(mcp->mc_gpregs.gp_s, tf->tf_s, sizeof(mcp->mc_gpregs.gp_s)); memcpy(mcp->mc_gpregs.gp_a, tf->tf_a, sizeof(mcp->mc_gpregs.gp_a)); if (clear_ret & GET_MC_CLEAR_RET) { mcp->mc_gpregs.gp_a[0] = 0; mcp->mc_gpregs.gp_t[0] = 0; /* clear syscall error */ } mcp->mc_gpregs.gp_ra = tf->tf_ra; mcp->mc_gpregs.gp_sp = tf->tf_sp; mcp->mc_gpregs.gp_gp = tf->tf_gp; mcp->mc_gpregs.gp_tp = tf->tf_tp; mcp->mc_gpregs.gp_sepc = tf->tf_sepc; mcp->mc_gpregs.gp_sstatus = tf->tf_sstatus; return (0); } int set_mcontext(struct thread *td, mcontext_t *mcp) { struct trapframe *tf; tf = td->td_frame; memcpy(tf->tf_t, mcp->mc_gpregs.gp_t, sizeof(tf->tf_t)); memcpy(tf->tf_s, mcp->mc_gpregs.gp_s, sizeof(tf->tf_s)); memcpy(tf->tf_a, mcp->mc_gpregs.gp_a, sizeof(tf->tf_a)); tf->tf_ra = mcp->mc_gpregs.gp_ra; tf->tf_sp = mcp->mc_gpregs.gp_sp; tf->tf_gp = mcp->mc_gpregs.gp_gp; tf->tf_tp = mcp->mc_gpregs.gp_tp; tf->tf_sepc = mcp->mc_gpregs.gp_sepc; tf->tf_sstatus = mcp->mc_gpregs.gp_sstatus; return (0); } static void get_fpcontext(struct thread *td, mcontext_t *mcp) { /* TODO */ } static void set_fpcontext(struct thread *td, mcontext_t *mcp) { /* TODO */ } void cpu_idle(int busy) { spinlock_enter(); if (!busy) cpu_idleclock(); if (!sched_runnable()) __asm __volatile( "fence \n" "wfi \n"); if (!busy) cpu_activeclock(); spinlock_exit(); } void cpu_halt(void) { panic("cpu_halt"); } /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* TBD */ } /* Get current clock frequency for the given CPU ID. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { panic("cpu_est_clockrate"); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { } void spinlock_enter(void) { struct thread *td; td = curthread; if (td->td_md.md_spinlock_count == 0) { td->td_md.md_spinlock_count = 1; td->td_md.md_saved_sstatus_ie = intr_disable(); } else td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; register_t sstatus_ie; td = curthread; critical_exit(); sstatus_ie = td->td_md.md_saved_sstatus_ie; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(sstatus_ie); } #ifndef _SYS_SYSPROTO_H_ struct sigreturn_args { ucontext_t *ucp; }; #endif int sys_sigreturn(struct thread *td, struct sigreturn_args *uap) { uint64_t sstatus; ucontext_t uc; int error; if (uap == NULL) return (EFAULT); if (copyin(uap->sigcntxp, &uc, sizeof(uc))) return (EFAULT); /* * Make sure the processor mode has not been tampered with and * interrupts have not been disabled. + * Supervisor interrupts in user mode are always enabled. 
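 *
 * SSTATUS_SPP is the previous-privilege bit: zero means the saved
 * context was taken in user mode, while a nonzero value would let
 * sret resume in supervisor mode, which is why such contexts are
 * rejected below.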
*/ sstatus = uc.uc_mcontext.mc_gpregs.gp_sstatus; - if ((sstatus & SSTATUS_PS) != 0 || - (sstatus & SSTATUS_PIE) == 0) + if ((sstatus & SSTATUS_SPP) != 0) return (EINVAL); error = set_mcontext(td, &uc.uc_mcontext); if (error != 0) return (error); set_fpcontext(td, &uc.uc_mcontext); /* Restore signal mask. */ kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0); return (EJUSTRETURN); } /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { memcpy(pcb->pcb_t, tf->tf_t, sizeof(tf->tf_t)); memcpy(pcb->pcb_s, tf->tf_s, sizeof(tf->tf_s)); memcpy(pcb->pcb_a, tf->tf_a, sizeof(tf->tf_a)); pcb->pcb_ra = tf->tf_ra; pcb->pcb_sp = tf->tf_sp; pcb->pcb_gp = tf->tf_gp; pcb->pcb_tp = tf->tf_tp; pcb->pcb_sepc = tf->tf_sepc; } void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe *fp, frame; struct sysentvec *sysent; struct trapframe *tf; struct sigacts *psp; struct thread *td; struct proc *p; int onstack; int code; int sig; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; code = ksi->ksi_code; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; onstack = sigonstack(tf->tf_sp); CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* Allocate and validate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !onstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct sigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size); } else { fp = (struct sigframe *)td->td_frame->tf_sp; } /* Make room, keeping the stack aligned */ fp--; fp = (struct sigframe *)STACKALIGN(fp); /* Fill in the frame to copy out */ get_mcontext(td, &frame.sf_uc.uc_mcontext, 0); get_fpcontext(td, &frame.sf_uc.uc_mcontext); frame.sf_si = ksi->ksi_info; frame.sf_uc.uc_sigmask = *mask; frame.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((onstack) ? SS_ONSTACK : 0) : SS_DISABLE; frame.sf_uc.uc_stack = td->td_sigstk; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(td->td_proc); /* Copy the sigframe out to the user's stack. */ if (copyout(&frame, fp, sizeof(*fp)) != 0) { /* Process has trashed its stack. Kill it. */ CTR2(KTR_SIG, "sendsig: sigexit td=%p fp=%p", td, fp); PROC_LOCK(p); sigexit(td, SIGILL); } tf->tf_a[0] = sig; tf->tf_a[1] = (register_t)&fp->sf_si; tf->tf_a[2] = (register_t)&fp->sf_uc; tf->tf_sepc = (register_t)catcher; tf->tf_sp = (register_t)fp; sysent = p->p_sysent; if (sysent->sv_sigcode_base != 0) tf->tf_ra = (register_t)sysent->sv_sigcode_base; else tf->tf_ra = (register_t)(sysent->sv_psstrings - *(sysent->sv_szsigcode)); CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->tf_sepc, tf->tf_sp); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } static void init_proc0(vm_offset_t kstack) { pcpup = &__pcpu[0]; proc_linkup0(&proc0, &thread0); thread0.td_kstack = kstack; thread0.td_pcb = (struct pcb *)(thread0.td_kstack) - 1; thread0.td_frame = &proc0_tf; pcpup->pc_curpcb = thread0.td_pcb; } static int add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, u_int *physmap_idxp) { u_int i, insert_idx, _physmap_idx; _physmap_idx = *physmap_idxp; if (length == 0) return (1); /* * Find insertion point while checking for overlap. 
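 * (physmap[] holds ordered pairs: physmap[i] is a region base and
 * physmap[i + 1] its end, so a single 128MB bank at 0x80000000 ends
 * up as physmap[0] = 0x80000000, physmap[1] = 0x88000000.)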
Start off by * assuming the new entry will be added to the end. */ insert_idx = _physmap_idx; for (i = 0; i <= _physmap_idx; i += 2) { if (base < physmap[i + 1]) { if (base + length <= physmap[i]) { insert_idx = i; break; } if (boothowto & RB_VERBOSE) printf( "Overlapping memory regions, ignoring second region\n"); return (1); } } /* See if we can prepend to the next entry. */ if (insert_idx <= _physmap_idx && base + length == physmap[insert_idx]) { physmap[insert_idx] = base; return (1); } /* See if we can append to the previous entry. */ if (insert_idx > 0 && base == physmap[insert_idx - 1]) { physmap[insert_idx - 1] += length; return (1); } _physmap_idx += 2; *physmap_idxp = _physmap_idx; if (_physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); return (0); } /* * Move the last 'N' entries down to make room for the new * entry if needed. */ for (i = _physmap_idx; i > insert_idx; i -= 2) { physmap[i] = physmap[i - 2]; physmap[i + 1] = physmap[i - 1]; } /* Insert the new entry. */ physmap[insert_idx] = base; physmap[insert_idx + 1] = base + length; printf("physmap[%d] = 0x%016lx\n", insert_idx, base); printf("physmap[%d] = 0x%016lx\n", insert_idx + 1, base + length); return (1); } #ifdef FDT static void try_load_dtb(caddr_t kmdp) { vm_offset_t dtbp; dtbp = (vm_offset_t)&fdt_static_dtb; if (dtbp == (vm_offset_t)NULL) { printf("ERROR loading DTB\n"); return; } if (OF_install(OFW_FDT, 0) == FALSE) panic("Cannot install FDT"); if (OF_init((void *)dtbp) != 0) panic("OF_init failed with the found device tree"); } #endif static void cache_setup(void) { /* TODO */ } /* * Fake up a boot descriptor table. * RISCVTODO: This needs to be done via loader (when it's available). */ vm_offset_t fake_preload_metadata(struct riscv_bootparams *rvbp __unused) { #ifdef DDB vm_offset_t zstart = 0, zend = 0; #endif vm_offset_t lastaddr; int i = 0; static uint32_t fake_preload[35]; fake_preload[i++] = MODINFO_NAME; fake_preload[i++] = strlen("kernel") + 1; strcpy((char*)&fake_preload[i++], "kernel"); i += 1; fake_preload[i++] = MODINFO_TYPE; fake_preload[i++] = strlen("elf64 kernel") + 1; strcpy((char*)&fake_preload[i++], "elf64 kernel"); i += 3; fake_preload[i++] = MODINFO_ADDR; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = (uint64_t)(KERNBASE + KERNENTRY); i += 1; fake_preload[i++] = MODINFO_SIZE; fake_preload[i++] = sizeof(uint64_t); printf("end is 0x%016lx\n", (uint64_t)&end); fake_preload[i++] = (uint64_t)&end - (uint64_t)(KERNBASE + KERNENTRY); i += 1; #ifdef DDB #if 0 /* RISCVTODO */ if (*(uint32_t *)KERNVIRTADDR == MAGIC_TRAMP_NUMBER) { fake_preload[i++] = MODINFO_METADATA|MODINFOMD_SSYM; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 4); fake_preload[i++] = MODINFO_METADATA|MODINFOMD_ESYM; fake_preload[i++] = sizeof(vm_offset_t); fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 8); lastaddr = *(uint32_t *)(KERNVIRTADDR + 8); zend = lastaddr; zstart = *(uint32_t *)(KERNVIRTADDR + 4); db_fetch_ksymtab(zstart, zend); } else #endif #endif lastaddr = (vm_offset_t)&end; fake_preload[i++] = 0; fake_preload[i] = 0; preload_metadata = (void *)fake_preload; return (lastaddr); } void initriscv(struct riscv_bootparams *rvbp) { struct mem_region mem_regions[FDT_MEM_REGIONS]; vm_offset_t lastaddr; int mem_regions_sz; vm_size_t kernlen; caddr_t kmdp; int i; /* Set the module data location */ lastaddr = fake_preload_metadata(rvbp); /* Find the kernel address */ kmdp = preload_search_by_type("elf kernel"); if (kmdp 
== NULL) kmdp = preload_search_by_type("elf64 kernel"); - boothowto = 0; + boothowto = RB_VERBOSE | RB_SINGLE; + boothowto = RB_VERBOSE; kern_envp = NULL; #ifdef FDT try_load_dtb(kmdp); #endif /* Load the physical memory ranges */ physmap_idx = 0; /* Grab physical memory regions information from device tree. */ if (fdt_get_mem_regions(mem_regions, &mem_regions_sz, NULL) != 0) panic("Cannot get physical memory regions"); for (i = 0; i < mem_regions_sz; i++) add_physmap_entry(mem_regions[i].mr_start, mem_regions[i].mr_size, physmap, &physmap_idx); /* Set the pcpu data, this is needed by pmap_bootstrap */ pcpup = &__pcpu[0]; pcpu_init(pcpup, 0, sizeof(struct pcpu)); /* Set the pcpu pointer */ __asm __volatile("mv gp, %0" :: "r"(pcpup)); PCPU_SET(curthread, &thread0); /* Do basic tuning, hz etc */ init_param1(); cache_setup(); - /* Bootstrap enough of pmap to enter the kernel proper */ + /* Bootstrap enough of pmap to enter the kernel proper */ kernlen = (lastaddr - KERNBASE); pmap_bootstrap(rvbp->kern_l1pt, KERNENTRY, kernlen); cninit(); init_proc0(rvbp->kern_stack); /* set page table base register for thread0 */ thread0.td_pcb->pcb_l1addr = (rvbp->kern_l1pt - KERNBASE); msgbufinit(msgbufp, msgbufsize); mutex_init(); init_param2(physmem); kdb_init(); riscv_init_interrupts(); early_boot = 0; } Index: user/alc/PQ_LAUNDRY/sys/riscv/riscv/pmap.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/riscv/pmap.c (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/riscv/pmap.c (revision 303667) @@ -1,3278 +1,3282 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * Copyright (c) 2014 Andrew Turner * All rights reserved. * Copyright (c) 2014 The FreeBSD Foundation * All rights reserved. * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Portions of this software were developed by Andrew Turner under * sponsorship from The FreeBSD Foundation. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. 
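 *
 * A note on the PTE format used after this change: a valid entry with
 * neither of the R/X permission bits set is a pointer to the next
 * level table, and a leaf otherwise. The table walkers below test
 * exactly that, e.g.:
 *
 *	if ((pmap_load(l1) & PTE_V) == 0)
 *		return (NULL);		(invalid)
 *	if ((pmap_load(l1) & PTE_RX) != 0)
 *		return (NULL);		(leaf, not a pointer)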
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NPDEPG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NUPDE (NPDEPG * NPDEPG) #define NUSERPGTBLS (NUPDE + NPDEPG) #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) /* The list of all the user pmaps */ LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static MALLOC_DEFINE(M_VMPMAP, "pmap", "PMAP L1"); struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ vm_offset_t kernel_vm_end = 0; struct msgbuf *msgbufp = NULL; static struct rwlock_padalign pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_l3(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); /* * These load the old table data and store the new value. * They need to be atomic as the System MMU may write to the table at * the same time as the CPU. 
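 * The RISC-V hardware table walker may set PTE_A or PTE_D in a live entry, so a plain read-modify-write could silently lose those updates. A minimal usage sketch (hypothetical l3, new_l3 and m, in the style of pmap_enter() below); the swap returns the old entry, so a concurrently-set dirty bit is still observed:
 *
 *	old_l3 = pmap_load_store(l3, new_l3);
 *	if ((old_l3 & PTE_D) != 0)
 *		vm_page_dirty(m);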
*/ #define pmap_load_store(table, entry) atomic_swap_64(table, entry) #define pmap_set(table, mask) atomic_set_64(table, mask) #define pmap_load_clear(table) atomic_swap_64(table, 0) #define pmap_load(table) (*table) /********************/ /* Inline functions */ /********************/ static __inline void pagecopy(void *s, void *d) { memcpy(d, s, PAGE_SIZE); } static __inline void pagezero(void *p) { bzero(p, PAGE_SIZE); } #define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK) #define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK) #define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK) #define PTE_TO_PHYS(pte) ((pte >> PTE_PPN0_S) * PAGE_SIZE) static __inline pd_entry_t * pmap_l1(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_l1[pmap_l1_index(va)]); } static __inline pd_entry_t * pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) { vm_paddr_t phys; pd_entry_t *l2; phys = PTE_TO_PHYS(pmap_load(l1)); l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); return (&l2[pmap_l2_index(va)]); } static __inline pd_entry_t * pmap_l2(pmap_t pmap, vm_offset_t va) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); if (l1 == NULL) return (NULL); - if ((pmap_load(l1) & PTE_VALID) == 0) + if ((pmap_load(l1) & PTE_V) == 0) return (NULL); - if ((pmap_load(l1) & PTE_TYPE_M) != (PTE_TYPE_PTR << PTE_TYPE_S)) + if ((pmap_load(l1) & PTE_RX) != 0) return (NULL); return (pmap_l1_to_l2(l1, va)); } static __inline pt_entry_t * pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) { vm_paddr_t phys; pt_entry_t *l3; phys = PTE_TO_PHYS(pmap_load(l2)); l3 = (pd_entry_t *)PHYS_TO_DMAP(phys); return (&l3[pmap_l3_index(va)]); } static __inline pt_entry_t * pmap_l3(pmap_t pmap, vm_offset_t va) { pd_entry_t *l2; l2 = pmap_l2(pmap, va); if (l2 == NULL) return (NULL); - if ((pmap_load(l2) & PTE_VALID) == 0) + if ((pmap_load(l2) & PTE_V) == 0) return (NULL); - if ((pmap_load(l2) & PTE_TYPE_M) != (PTE_TYPE_PTR << PTE_TYPE_S)) + if ((pmap_load(l2) & PTE_RX) != 0) return (NULL); return (pmap_l2_to_l3(l2, va)); } static __inline int pmap_is_write(pt_entry_t entry) { - if (entry & (1 << PTE_TYPE_S)) - return (1); - - return (0); + return (entry & PTE_W); } static __inline int pmap_is_current(pmap_t pmap) { return ((pmap == pmap_kernel()) || (pmap == curthread->td_proc->p_vmspace->vm_map.pmap)); } static __inline int pmap_l3_valid(pt_entry_t l3) { - return (l3 & PTE_VALID); + return (l3 & PTE_V); } static __inline int pmap_l3_valid_cacheable(pt_entry_t l3) { /* TODO */ return (0); } #define PTE_SYNC(pte) cpu_dcache_wb_range((vm_offset_t)pte, sizeof(*pte)) /* Checks if the page is dirty. 
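 * On RISC-V the translation hardware sets PTE_D on the first write through a mapping, so a set bit means the backing page has been modified. A sketch of a typical caller (hypothetical l3 and m):
 *
 *	if (pmap_page_dirty(pmap_load(l3)))
 *		vm_page_dirty(m);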
*/ static inline int pmap_page_dirty(pt_entry_t pte) { - return (pte & PTE_DIRTY); + return (pte & PTE_D); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } static void pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index, pt_entry_t entry) { struct pmap *user_pmap; pd_entry_t *l1; /* Distribute new kernel L1 entry to all the user pmaps */ if (pmap != kernel_pmap) return; LIST_FOREACH(user_pmap, &allpmaps, pm_list) { l1 = &user_pmap->pm_l1[l1index]; if (entry) pmap_load_store(l1, entry); else pmap_load_clear(l1); } } static pt_entry_t * pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, u_int *l2_slot) { pt_entry_t *l2; pd_entry_t *l1; l1 = (pd_entry_t *)l1pt; *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; /* Check that locore used an L1 table mapping */ - KASSERT((l1[*l1_slot] & PTE_TYPE_M) == (PTE_TYPE_PTR << PTE_TYPE_S), + KASSERT((l1[*l1_slot] & PTE_RX) == 0, ("Invalid bootstrap L1 table")); /* Find the address of the L2 table */ l2 = (pt_entry_t *)init_pt_va; *l2_slot = pmap_l2_index(va); return (l2); } static vm_paddr_t pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) { u_int l1_slot, l2_slot; pt_entry_t *l2; vm_paddr_t ret; l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); /* L2 is superpages */ ret = (l2[l2_slot] >> PTE_PPN1_S) << L2_SHIFT; ret += (va & L2_OFFSET); return (ret); } static void pmap_bootstrap_dmap(vm_offset_t l1pt, vm_paddr_t kernstart) { vm_offset_t va; vm_paddr_t pa; pd_entry_t *l1; u_int l1_slot; pt_entry_t entry; pn_t pn; + /* + * Initialize DMAP starting from zero physical address. + * TODO: remove this once the machine-mode code is split out.
+ */ + kernstart = 0; + printf("%s: l1pt 0x%016lx kernstart 0x%016lx\n", __func__, l1pt, kernstart); + pa = kernstart & ~L1_OFFSET; va = DMAP_MIN_ADDRESS; l1 = (pd_entry_t *)l1pt; l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS); for (; va < DMAP_MAX_ADDRESS; pa += L1_SIZE, va += L1_SIZE, l1_slot++) { KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); /* superpages */ pn = (pa / PAGE_SIZE); - entry = (PTE_VALID | (PTE_TYPE_SRWX << PTE_TYPE_S)); + entry = (PTE_V | PTE_RWX); entry |= (pn << PTE_PPN0_S); pmap_load_store(&l1[l1_slot], entry); } cpu_dcache_wb_range((vm_offset_t)l1, PAGE_SIZE); cpu_tlb_flushID(); } static vm_offset_t pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) { vm_offset_t l2pt, l3pt; pt_entry_t entry; pd_entry_t *l2; vm_paddr_t pa; u_int l2_slot; pn_t pn; KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); l2 = pmap_l2(kernel_pmap, va); l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1)); l2pt = (vm_offset_t)l2; l2_slot = pmap_l2_index(va); l3pt = l3_start; for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); pa = pmap_early_vtophys(l1pt, l3pt); pn = (pa / PAGE_SIZE); - entry = (PTE_VALID | (PTE_TYPE_PTR << PTE_TYPE_S)); + entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_load_store(&l2[l2_slot], entry); l3pt += PAGE_SIZE; } + /* Zero the newly allocated L3 page tables */ memset((void *)l3_start, 0, l3pt - l3_start); cpu_dcache_wb_range(l3_start, l3pt - l3_start); cpu_dcache_wb_range((vm_offset_t)l2, PAGE_SIZE); - return l3pt; + return (l3pt); } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) { u_int l1_slot, l2_slot, avail_slot, map_slot, used_map_slot; uint64_t kern_delta; pt_entry_t *l2; vm_offset_t va, freemempos; vm_offset_t dpcpu, msgbufpv; vm_paddr_t pa, min_pa; int i; kern_delta = KERNBASE - kernstart; physmem = 0; printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); printf("%lx\n", l1pt); printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK); /* Set this early so we can use the pagetable walking functions */ kernel_pmap_store.pm_l1 = (pd_entry_t *)l1pt; PMAP_LOCK_INIT(kernel_pmap); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); LIST_INIT(&allpmaps); /* Assume the address we were loaded to is a valid physical address */ min_pa = KERNBASE - kern_delta; /* * Find the minimum physical address. physmap is sorted, * but may contain empty ranges. */ for (i = 0; i < (physmap_idx * 2); i += 2) { if (physmap[i] == physmap[i + 1]) continue; if (physmap[i] <= min_pa) min_pa = physmap[i]; break; } /* Create a direct map region early so we can use it for pa -> va */ pmap_bootstrap_dmap(l1pt, min_pa); va = KERNBASE; pa = KERNBASE - kern_delta; /* * Start to initialize phys_avail by copying from physmap * up to the physical address KERNBASE points at.
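 * For example (illustrative values only), with physmap = { 0x40000000, 0x48000000, 0x80000000, 0x88000000 } and the kernel loaded at pa 0x80200000, the first range is copied into phys_avail verbatim and the loop below stops at the second range, which contains the kernel.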
*/ map_slot = avail_slot = 0; for (; map_slot < (physmap_idx * 2); map_slot += 2) { if (physmap[map_slot] == physmap[map_slot + 1]) continue; + if (physmap[map_slot] <= pa && + physmap[map_slot + 1] > pa) + break; + phys_avail[avail_slot] = physmap[map_slot]; phys_avail[avail_slot + 1] = physmap[map_slot + 1]; physmem += (phys_avail[avail_slot + 1] - phys_avail[avail_slot]) >> PAGE_SHIFT; avail_slot += 2; } /* Add the memory before the kernel */ if (physmap[avail_slot] < pa) { phys_avail[avail_slot] = physmap[map_slot]; phys_avail[avail_slot + 1] = pa; physmem += (phys_avail[avail_slot + 1] - phys_avail[avail_slot]) >> PAGE_SHIFT; avail_slot += 2; } used_map_slot = map_slot; /* * Read the page table to find out what is already mapped. * This assumes we have mapped a block of memory from KERNBASE * using a single L1 entry. */ l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); /* Sanity check the index, KERNBASE should be the first VA */ KASSERT(l2_slot == 0, ("The L2 index is non-zero")); /* Find how many pages we have mapped */ for (; l2_slot < Ln_ENTRIES; l2_slot++) { - if ((l2[l2_slot] & PTE_VALID) == 0) + if ((l2[l2_slot] & PTE_V) == 0) break; /* Check locore used L2 superpages */ - KASSERT((l2[l2_slot] & PTE_TYPE_M) != (PTE_TYPE_PTR << PTE_TYPE_S), + KASSERT((l2[l2_slot] & PTE_RX) != 0, ("Invalid bootstrap L2 table")); va += L2_SIZE; pa += L2_SIZE; } va = roundup2(va, L2_SIZE); freemempos = KERNBASE + kernlen; freemempos = roundup2(freemempos, PAGE_SIZE); /* Create the l3 tables for the early devmap */ freemempos = pmap_bootstrap_l3(l1pt, VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos); cpu_tlb_flushID(); #define alloc_pages(var, np) \ (var) = freemempos; \ freemempos += (np * PAGE_SIZE); \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); /* Allocate dynamic per-cpu area. */ alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); dpcpu_init((void *)dpcpu, 0); /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); msgbufp = (void *)msgbufpv; virtual_avail = roundup2(freemempos, L2_SIZE); virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE; kernel_vm_end = virtual_avail; pa = pmap_early_vtophys(l1pt, freemempos); /* Finish initialising physmap */ map_slot = used_map_slot; for (; avail_slot < (PHYS_AVAIL_SIZE - 2) && map_slot < (physmap_idx * 2); map_slot += 2) { - if (physmap[map_slot] == physmap[map_slot + 1]) + if (physmap[map_slot] == physmap[map_slot + 1]) { continue; + } /* Have we used the current range? */ - if (physmap[map_slot + 1] <= pa) + if (physmap[map_slot + 1] <= pa) { continue; + } /* Do we need to split the entry? */ if (physmap[map_slot] < pa) { phys_avail[avail_slot] = pa; phys_avail[avail_slot + 1] = physmap[map_slot + 1]; } else { phys_avail[avail_slot] = physmap[map_slot]; phys_avail[avail_slot + 1] = physmap[map_slot + 1]; } physmem += (phys_avail[avail_slot + 1] - phys_avail[avail_slot]) >> PAGE_SHIFT; avail_slot += 2; } phys_avail[avail_slot] = 0; phys_avail[avail_slot + 1] = 0; /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". */ Maxmem = atop(phys_avail[avail_slot - 1]); cpu_tlb_flushID(); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. 
*/ void pmap_init(void) { int i; /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); } /* * Normal, non-SMP, invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { /* TODO */ sched_pin(); __asm __volatile("sfence.vm"); sched_unpin(); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { /* TODO */ sched_pin(); __asm __volatile("sfence.vm"); sched_unpin(); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { /* TODO */ sched_pin(); __asm __volatile("sfence.vm"); sched_unpin(); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pd_entry_t *l2p, l2; pt_entry_t *l3p, l3; vm_paddr_t pa; pa = 0; PMAP_LOCK(pmap); /* * Start with the l2 table. We are unable to allocate * pages in the l1 table. */ l2p = pmap_l2(pmap, va); if (l2p != NULL) { l2 = pmap_load(l2p); - if ((l2 & PTE_TYPE_M) == (PTE_TYPE_PTR << PTE_TYPE_S)) { + if ((l2 & PTE_RX) == 0) { l3p = pmap_l2_to_l3(l2p, va); if (l3p != NULL) { l3 = pmap_load(l3p); pa = PTE_TO_PHYS(l3); pa |= (va & L3_OFFSET); } } else { /* L2 is superpages */ pa = (l2 >> PTE_PPN1_S) << L2_SHIFT; pa |= (va & L2_OFFSET); } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t *l3p, l3; vm_paddr_t phys; vm_paddr_t pa; vm_page_t m; pa = 0; m = NULL; PMAP_LOCK(pmap); retry: l3p = pmap_l3(pmap, va); if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) { if ((pmap_is_write(l3)) || ((prot & VM_PROT_WRITE) == 0)) { phys = PTE_TO_PHYS(l3); if (vm_page_pa_tryrelock(pmap, phys, &pa)) goto retry; m = PHYS_TO_VM_PAGE(phys); vm_page_hold(m); } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t *l2; pt_entry_t *l3; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { l2 = pmap_l2(kernel_pmap, va); if (l2 == NULL) panic("pmap_kextract: No l2"); - if ((pmap_load(l2) & PTE_TYPE_M) != (PTE_TYPE_PTR << PTE_TYPE_S)) { + if ((pmap_load(l2) & PTE_RX) != 0) { /* superpages */ pa = (pmap_load(l2) >> PTE_PPN1_S) << L2_SHIFT; pa |= (va & L2_OFFSET); return (pa); } l3 = pmap_l2_to_l3(l2, va); if (l3 == NULL) panic("pmap_kextract: No l3..."); pa = PTE_TO_PHYS(pmap_load(l3)); pa |= (va & PAGE_MASK); } return (pa); } /*************************************************** * Low level mapping routines.....
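 *
 * A typical caller maps a device register window at a page-aligned kernel va and later tears it down; the values here are hypothetical:
 *
 *	pmap_kenter_device(va, 0x1000, 0x40001000);
 *	...
 *	pmap_kremove_device(va, 0x1000);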
***************************************************/ void pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) { pt_entry_t entry; pt_entry_t *l3; vm_offset_t va; pn_t pn; KASSERT((pa & L3_OFFSET) == 0, ("pmap_kenter_device: Invalid physical address")); KASSERT((sva & L3_OFFSET) == 0, ("pmap_kenter_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kenter_device: Mapping is not page-sized")); va = sva; while (size != 0) { l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); pn = (pa / PAGE_SIZE); - entry = (PTE_VALID | (PTE_TYPE_SRWX << PTE_TYPE_S)); + entry = (PTE_V | PTE_RWX); entry |= (pn << PTE_PPN0_S); pmap_load_store(l3, entry); PTE_SYNC(l3); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *l3; l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); if (pmap_l3_valid_cacheable(pmap_load(l3))) cpu_dcache_wb_range(va, L3_SIZE); pmap_load_clear(l3); PTE_SYNC(l3); pmap_invalidate_page(kernel_pmap, va); } void pmap_kremove_device(vm_offset_t sva, vm_size_t size) { pt_entry_t *l3; vm_offset_t va; KASSERT((sva & L3_OFFSET) == 0, ("pmap_kremove_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kremove_device: Mapping is not page-sized")); va = sva; while (size != 0) { l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); pmap_load_clear(l3); PTE_SYNC(l3); va += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *l3, pa; vm_offset_t va; vm_page_t m; pt_entry_t entry; pn_t pn; int i; va = sva; for (i = 0; i < count; i++) { m = ma[i]; pa = VM_PAGE_TO_PHYS(m); pn = (pa / PAGE_SIZE); l3 = pmap_l3(kernel_pmap, va); - entry = (PTE_VALID | (PTE_TYPE_SRWX << PTE_TYPE_S)); + entry = (PTE_V | PTE_RWX); entry |= (pn << PTE_PPN0_S); pmap_load_store(l3, entry); PTE_SYNC(l3); va += L3_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. 
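 * It is the inverse of pmap_qenter() above and is called with the same base address and page count (hypothetical va, ma and npages):
 *
 *	pmap_qenter(va, ma, npages);
 *	... use the temporary mapping ...
 *	pmap_qremove(va, npages);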
*/ void pmap_qremove(vm_offset_t sva, int count) { pt_entry_t *l3; vm_offset_t va; KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); va = sva; while (count-- > 0) { l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("pmap_qremove: Invalid address")); if (pmap_l3_valid_cacheable(pmap_load(l3))) cpu_dcache_wb_range(va, L3_SIZE); pmap_load_clear(l3); PTE_SYNC(l3); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ static __inline void pmap_free_zero_pages(struct spglist *free) { vm_page_t m; while ((m = SLIST_FIRST(free)) != NULL) { SLIST_REMOVE_HEAD(free, plinks.s.ss); /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } } /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_l3(pmap, va, m, free); return (TRUE); } else { return (FALSE); } } static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { vm_paddr_t phys; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= NUPDE) { /* PD page */ pd_entry_t *l1; l1 = pmap_l1(pmap, va); pmap_load_clear(l1); pmap_distribute_l1(pmap, pmap_l1_index(va), 0); PTE_SYNC(l1); } else { /* PTE page */ pd_entry_t *l2; l2 = pmap_l2(pmap, va); pmap_load_clear(l2); PTE_SYNC(l2); } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUPDE) { pd_entry_t *l1; /* We just released a PT, unhold the matching PD */ vm_page_t pdpg; l1 = pmap_l1(pmap, va); phys = PTE_TO_PHYS(pmap_load(l1)); pdpg = PHYS_TO_VM_PAGE(phys); pmap_unwire_l3(pmap, va, pdpg, free); } pmap_invalidate_page(pmap, va); /* * This is a release store so that the ordinary store unmapping * the page table page is globally performed before TLB shootdown * is begun. */ atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing an l3 entry, this routine is used to * conditionally free the page, and manage the hold/wire counts.
*/ static int pmap_unuse_l3(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_paddr_t phys; vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); phys = PTE_TO_PHYS(ptepde); mpte = PHYS_TO_VM_PAGE(phys); return (pmap_unwire_l3(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_l1 = kernel_pmap->pm_l1; } int pmap_pinit(pmap_t pmap) { vm_paddr_t l1phys; vm_page_t l1pt; /* * allocate the l1 page */ while ((l1pt = vm_page_alloc(NULL, 0xdeadbeef, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) VM_WAIT; l1phys = VM_PAGE_TO_PHYS(l1pt); pmap->pm_l1 = (pd_entry_t *)PHYS_TO_DMAP(l1phys); if ((l1pt->flags & PG_ZERO) == 0) pagezero(pmap->pm_l1); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); /* Install kernel pagetables */ memcpy(pmap->pm_l1, kernel_pmap->pm_l1, PAGE_SIZE); /* Add to the list of all user pmaps */ LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); return (1); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, /*pdppg, */pdpg; pt_entry_t entry; vm_paddr_t phys; pn_t pn; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); rw_runlock(&pvh_global_lock); VM_WAIT; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. 
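 * The pindex space encodes the level: assuming 4KB pages and 512-entry tables, indices below NUPDE (512 * 512) name L3 pages hooked into an L2 entry, while NUPDE and above name L2 pages hooked directly into the L1 table; e.g. ptepindex == NUPDE + 1 installs an L2 page in L1 slot 1.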
*/ if (ptepindex >= NUPDE) { pd_entry_t *l1; vm_pindex_t l1index; l1index = ptepindex - NUPDE; l1 = &pmap->pm_l1[l1index]; pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE); - entry = (PTE_VALID | (PTE_TYPE_PTR << PTE_TYPE_S)); + entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_load_store(l1, entry); pmap_distribute_l1(pmap, l1index, entry); PTE_SYNC(l1); } else { vm_pindex_t l1index; pd_entry_t *l1, *l2; l1index = ptepindex >> (L1_SHIFT - L2_SHIFT); l1 = &pmap->pm_l1[l1index]; if (pmap_load(l1) == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUPDE + l1index, lockp) == NULL) { --m->wire_count; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); return (NULL); } } else { phys = PTE_TO_PHYS(pmap_load(l1)); pdpg = PHYS_TO_VM_PAGE(phys); pdpg->wire_count++; } phys = PTE_TO_PHYS(pmap_load(l1)); l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); l2 = &l2[ptepindex & Ln_ADDR_MASK]; pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE); - entry = (PTE_VALID | (PTE_TYPE_PTR << PTE_TYPE_S)); + entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_load_store(l2, entry); PTE_SYNC(l2); } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *l2; vm_paddr_t phys; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = pmap_l2_pindex(va); retry: /* * Get the page directory entry */ l2 = pmap_l2(pmap, va); /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (l2 != NULL && pmap_load(l2) != 0) { phys = PTE_TO_PHYS(pmap_load(l2)); m = PHYS_TO_VM_PAGE(phys); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_alloc_l3(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. 
*/ void pmap_release(pmap_t pmap) { vm_page_t m; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l1)); m->wire_count--; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); /* Remove pmap from the allpmaps list */ LIST_REMOVE(pmap, pm_list); /* Remove kernel pagetables */ bzero(pmap->pm_l1, PAGE_SIZE); } #if 0 static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); #endif /* 0 */ /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *l1, *l2; pt_entry_t entry; pn_t pn; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, L2_SIZE); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { l1 = pmap_l1(kernel_pmap, kernel_vm_end); if (pmap_load(l1) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); pn = (paddr / PAGE_SIZE); - entry = (PTE_VALID | (PTE_TYPE_PTR << PTE_TYPE_S)); + entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_load_store(l1, entry); pmap_distribute_l1(kernel_pmap, pmap_l1_index(kernel_vm_end), entry); PTE_SYNC(l1); continue; /* try again */ } l2 = pmap_l1_to_l2(l1, kernel_vm_end); - if ((pmap_load(l2) & PTE_REF) != 0) { + if ((pmap_load(l2) & PTE_A) != 0) { kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); - if ((nkpg->flags & PG_ZERO) == 0) + if ((nkpg->flags & PG_ZERO) == 0) { pmap_zero_page(nkpg); + } paddr = VM_PAGE_TO_PHYS(nkpg); pn = (paddr / PAGE_SIZE); - entry = (PTE_VALID | (PTE_TYPE_PTR << PTE_TYPE_S)); + entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_load_store(l2, entry); PTE_SYNC(l2); pmap_invalidate_page(kernel_pmap, kernel_vm_end); kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. 
***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #if 0 #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif #endif /* 0 */ /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { panic("RISCVTODO: reclaim_pv_chunk"); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. 
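 * The entry being freed was most likely allocated from that chunk, so the TAILQ move below is rarely taken. The index arithmetic above packs _NPCPV (168) entries into three 64-bit maps; e.g. idx 100 lands in field 1, bit 36.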
*/ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); #if 0 /* TODO: For minidump */ dump_drop_page(m->phys_addr); #endif vm_page_unwire(m, PQ_INACTIVE); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); #if 0 /* TODO: This is for minidump */ dump_add_page(m->phys_addr); #endif pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_LOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. 
*/ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * pmap_remove_l3: do the things to unmap a page in a process */ static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) { pt_entry_t old_l3; vm_paddr_t phys; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap_is_current(pmap) && pmap_l3_valid_cacheable(pmap_load(l3))) cpu_dcache_wb_range(va, L3_SIZE); old_l3 = pmap_load_clear(l3); PTE_SYNC(l3); pmap_invalidate_page(pmap, va); if (old_l3 & PTE_SW_WIRED) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (old_l3 & PTE_SW_MANAGED) { phys = PTE_TO_PHYS(old_l3); m = PHYS_TO_VM_PAGE(phys); if (pmap_page_dirty(old_l3)) vm_page_dirty(m); - if (old_l3 & PTE_REF) + if (old_l3 & PTE_A) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); } return (pmap_unuse_l3(pmap, va, l2e, free)); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va, va_next; pd_entry_t *l1, *l2; pt_entry_t l3_pte, *l3; struct spglist free; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; l1 = pmap_l1(pmap, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL) continue; l3_pte = pmap_load(l2); /* * Weed out invalid mappings. */ if (l3_pte == 0) continue; - if ((pmap_load(l2) & PTE_TYPE_M) != (PTE_TYPE_PTR << PTE_TYPE_S)) + if ((pmap_load(l2) & PTE_RX) != 0) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. 
*/ if (va_next > eva) va_next = eva; va = va_next; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if (l3 == NULL) panic("l3 == NULL"); if (pmap_load(l3) == 0) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } if (va == va_next) va = sva; if (pmap_remove_l3(pmap, l3, sva, l3_pte, &free, &lock)) { sva += L3_SIZE; break; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } if (lock != NULL) rw_wunlock(lock); if (anyvalid) pmap_invalidate_all(pmap); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t *l3, tl3; pd_entry_t *l2, tl2; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); rw_wlock(&pvh_global_lock); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap_resident_count_dec(pmap, 1); l2 = pmap_l2(pmap, pv->pv_va); KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); tl2 = pmap_load(l2); - KASSERT((tl2 & PTE_TYPE_M) == (PTE_TYPE_PTR << PTE_TYPE_S), + KASSERT((tl2 & PTE_RX) == 0, ("pmap_remove_all: found a table when expecting " "a block in %p's pv list", m)); l3 = pmap_l2_to_l3(l2, pv->pv_va); if (pmap_is_current(pmap) && pmap_l3_valid_cacheable(pmap_load(l3))) cpu_dcache_wb_range(pv->pv_va, L3_SIZE); tl3 = pmap_load_clear(l3); PTE_SYNC(l3); pmap_invalidate_page(pmap, pv->pv_va); if (tl3 & PTE_SW_WIRED) pmap->pm_stats.wired_count--; - if ((tl3 & PTE_REF) != 0) + if ((tl3 & PTE_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (pmap_page_dirty(tl3)) vm_page_dirty(m); pmap_unuse_l3(pmap, pv->pv_va, pmap_load(l2), &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); pmap_free_zero_pages(&free); } /* * Set the physical protection on the * specified range of this map as requested. 
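 * Under the new PTE encoding, revoking write access amounts to clearing PTE_W in each valid L3 entry; e.g. mprotect(addr, len, PROT_READ) on a writable region ends up here and the loop below strips PTE_W from every mapping in the range.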
*/ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va, va_next; pd_entry_t *l1, *l2; pt_entry_t *l3p, l3; pt_entry_t entry; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & VM_PROT_WRITE) == VM_PROT_WRITE) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l1 = pmap_l1(pmap, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL) continue; - if ((pmap_load(l2) & PTE_TYPE_M) != (PTE_TYPE_PTR << PTE_TYPE_S)) + if ((pmap_load(l2) & PTE_RX) != 0) continue; if (va_next > eva) va_next = eva; va = va_next; for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, sva += L3_SIZE) { l3 = pmap_load(l3p); if (pmap_l3_valid(l3)) { entry = pmap_load(l3p); - entry &= ~(1 << PTE_TYPE_S); + entry &= ~(PTE_W); pmap_load_store(l3p, entry); PTE_SYNC(l3p); /* XXX: Use pmap_invalidate_range */ pmap_invalidate_page(pmap, va); } } } PMAP_UNLOCK(pmap); /* TODO: Only invalidate entries we are touching */ pmap_invalidate_all(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind __unused) { struct rwlock *lock; pd_entry_t *l1, *l2; pt_entry_t new_l3, orig_l3; pt_entry_t *l3; pv_entry_t pv; vm_paddr_t opa, pa, l2_pa, l3_pa; vm_page_t mpte, om, l2_m, l3_m; boolean_t nosleep; pt_entry_t entry; pn_t l2_pn; pn_t l3_pn; pn_t pn; va = trunc_page(va); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); pa = VM_PAGE_TO_PHYS(m); pn = (pa / PAGE_SIZE); - new_l3 = PTE_VALID; + new_l3 = PTE_V | PTE_R | PTE_X; + if (prot & VM_PROT_WRITE) + new_l3 |= PTE_W; + if ((va >> 63) == 0) + new_l3 |= PTE_U; - if ((prot & VM_PROT_WRITE) == 0) { /* Read-only */ - if ((va >> 63) == 0) /* USER */ - new_l3 |= (PTE_TYPE_SURX << PTE_TYPE_S); - else /* KERNEL */ - new_l3 |= (PTE_TYPE_SRX << PTE_TYPE_S); - } else { - if ((va >> 63) == 0) /* USER */ - new_l3 |= (PTE_TYPE_SURWX << PTE_TYPE_S); - else /* KERNEL */ - new_l3 |= (PTE_TYPE_SRWX << PTE_TYPE_S); - } - new_l3 |= (pn << PTE_PPN0_S); if ((flags & PMAP_ENTER_WIRED) != 0) new_l3 |= PTE_SW_WIRED; CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); mpte = NULL; lock = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); if (va < VM_MAXUSER_ADDRESS) { nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = pmap_alloc_l3(pmap, va, nosleep ? 
NULL : &lock); if (mpte == NULL && nosleep) { CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } l3 = pmap_l3(pmap, va); } else { l3 = pmap_l3(pmap, va); /* TODO: This is not optimal, but should mostly work */ if (l3 == NULL) { l2 = pmap_l2(pmap, va); if (l2 == NULL) { l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (l2_m == NULL) panic("pmap_enter: l2 pte_m == NULL"); if ((l2_m->flags & PG_ZERO) == 0) pmap_zero_page(l2_m); l2_pa = VM_PAGE_TO_PHYS(l2_m); l2_pn = (l2_pa / PAGE_SIZE); l1 = pmap_l1(pmap, va); - entry = (PTE_VALID | (PTE_TYPE_PTR << PTE_TYPE_S)); + entry = (PTE_V); entry |= (l2_pn << PTE_PPN0_S); pmap_load_store(l1, entry); pmap_distribute_l1(pmap, pmap_l1_index(va), entry); PTE_SYNC(l1); l2 = pmap_l1_to_l2(l1, va); } KASSERT(l2 != NULL, ("No l2 table after allocating one")); l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (l3_m == NULL) panic("pmap_enter: l3 pte_m == NULL"); if ((l3_m->flags & PG_ZERO) == 0) pmap_zero_page(l3_m); l3_pa = VM_PAGE_TO_PHYS(l3_m); l3_pn = (l3_pa / PAGE_SIZE); - entry = (PTE_VALID | (PTE_TYPE_PTR << PTE_TYPE_S)); + entry = (PTE_V); entry |= (l3_pn << PTE_PPN0_S); pmap_load_store(l2, entry); PTE_SYNC(l2); l3 = pmap_l2_to_l3(l2, va); } pmap_invalidate_page(pmap, va); } om = NULL; orig_l3 = pmap_load(l3); opa = PTE_TO_PHYS(orig_l3); /* * Is the specified virtual address already mapped? */ if (pmap_l3_valid(orig_l3)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((flags & PMAP_ENTER_WIRED) != 0 && (orig_l3 & PTE_SW_WIRED) == 0) pmap->pm_stats.wired_count++; else if ((flags & PMAP_ENTER_WIRED) == 0 && (orig_l3 & PTE_SW_WIRED) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((orig_l3 & PTE_SW_MANAGED) != 0) { new_l3 |= PTE_SW_MANAGED; if (pmap_is_write(new_l3)) vm_page_aflag_set(m, PGA_WRITEABLE); } goto validate; } /* Flush the cache, there might be uncommitted data in it */ if (pmap_is_current(pmap) && pmap_l3_valid_cacheable(orig_l3)) cpu_dcache_wb_range(va, L3_SIZE); } else { /* * Increment the counters. */ if ((new_l3 & PTE_SW_WIRED) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { new_l3 |= PTE_SW_MANAGED; pv = get_pv_entry(pmap, &lock); pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (pmap_is_write(new_l3)) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Update the L3 entry. 
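 * The swap below returns the previous entry, so reference and dirty bits that the hardware set while the old mapping was live are folded into the vm_page before the entry is replaced.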
*/ if (orig_l3 != 0) { validate: orig_l3 = pmap_load_store(l3, new_l3); PTE_SYNC(l3); opa = PTE_TO_PHYS(orig_l3); if (opa != pa) { if ((orig_l3 & PTE_SW_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); if (pmap_page_dirty(orig_l3)) vm_page_dirty(om); - if ((orig_l3 & PTE_REF) != 0) + if ((orig_l3 & PTE_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pmap_pvh_free(&om->md, pmap, va); } } else if (pmap_page_dirty(orig_l3)) { if ((orig_l3 & PTE_SW_MANAGED) != 0) vm_page_dirty(m); } } else { pmap_load_store(l3, new_l3); PTE_SYNC(l3); } pmap_invalidate_page(pmap, va); if ((pmap != pmap_kernel()) && (pmap == &curproc->p_vmspace->vm_pmap)) cpu_icache_sync_range(va, PAGE_SIZE); if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; vm_paddr_t phys; pd_entry_t *l2; pt_entry_t *l3; vm_paddr_t pa; pt_entry_t entry; pn_t pn; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); /* * In the case that a page table page is not * resident, we are creating it here. 
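 * The mpte argument caches the page table page across consecutive calls from pmap_enter_object(), so a run of mappings within the same 2MB region avoids re-walking the page table for each page.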
*/ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t l2pindex; /* * Calculate pagetable page index */ l2pindex = pmap_l2_pindex(va); if (mpte && (mpte->pindex == l2pindex)) { mpte->wire_count++; } else { /* * Get the l2 entry */ l2 = pmap_l2(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (l2 != NULL && pmap_load(l2) != 0) { phys = PTE_TO_PHYS(pmap_load(l2)); mpte = PHYS_TO_VM_PAGE(phys); mpte->wire_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); if (mpte == NULL) return (mpte); } } l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); l3 = &l3[pmap_l3_index(va)]; } else { mpte = NULL; l3 = pmap_l3(kernel_pmap, va); } if (l3 == NULL) panic("pmap_enter_quick_locked: No l3"); if (pmap_load(l3) != 0) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_l3(pmap, va, mpte, &free)) { pmap_invalidate_page(pmap, va); pmap_free_zero_pages(&free); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); pa = VM_PAGE_TO_PHYS(m); pn = (pa / PAGE_SIZE); /* RISCVTODO: check permissions */ - entry = (PTE_VALID | (PTE_TYPE_SRWX << PTE_TYPE_S)); + entry = (PTE_V | PTE_RWX); entry |= (pn << PTE_PPN0_S); /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) == 0) entry |= PTE_SW_MANAGED; pmap_load_store(l3, entry); PTE_SYNC(l3); pmap_invalidate_page(pmap, va); return (mpte); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pd_entry_t *l1, *l2; pt_entry_t *l3; boolean_t pv_lists_locked; pv_lists_locked = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l1 = pmap_l1(pmap, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if (pmap_load(l3) == 0) continue; if ((pmap_load(l3) & PTE_SW_WIRED) == 0) panic("pmap_unwire: l3 %#jx is missing " "PTE_SW_WIRED", (uintmax_t)*l3); /* * PG_W must be cleared atomically. 
Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. */ atomic_clear_long(l3, PTE_SW_WIRED); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t m_a, m_b; vm_paddr_t p_a, p_b; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; m_a = ma[a_offset >> PAGE_SHIFT]; p_a = m_a->phys_addr; b_pg_offset = b_offset & PAGE_MASK; m_b = mb[b_offset >> PAGE_SHIFT]; p_b = m_b->phys_addr; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); if (__predict_false(!PHYS_IN_DMAP(p_a))) { panic("!DMAP a %lx", p_a); } else { a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; } if (__predict_false(!PHYS_IN_DMAP(p_b))) { panic("!DMAP b %lx", p_b); } else { b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; } bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); } void pmap_quick_remove_page(vm_offset_t addr) { } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. 
*/ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } rw_runlock(lock); rw_runlock(&pvh_global_lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; pmap_t pmap; pt_entry_t *l3; pv_entry_t pv; int count, md_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l3 = pmap_l3(pmap, pv->pv_va); if (l3 != NULL && (pmap_load(l3) & PTE_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } rw_runlock(lock); rw_runlock(&pvh_global_lock); return (count); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t ptepde, *l2; pt_entry_t *l3, tl3; struct spglist free; vm_page_t m; pv_entry_t pv; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx; vm_paddr_t pa; lock = NULL; SLIST_INIT(&free); rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; l2 = pmap_l2(pmap, pv->pv_va); ptepde = pmap_load(l2); l3 = pmap_l2_to_l3(l2, pv->pv_va); tl3 = pmap_load(l3); /* * We cannot remove wired pages from a process' mapping at this time */ if (tl3 & PTE_SW_WIRED) { allfree = 0; continue; } pa = PTE_TO_PHYS(tl3); m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tl3)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad l3 %#jx", (uintmax_t)tl3)); if (pmap_is_current(pmap) && pmap_l3_valid_cacheable(pmap_load(l3))) cpu_dcache_wb_range(pv->pv_va, L3_SIZE); pmap_load_clear(l3); PTE_SYNC(l3); pmap_invalidate_page(pmap, pv->pv_va); /* * Update the vm_page_t clean/reference bits. 
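pmap_remove_pages() above iterates the pmap's PV chunks instead of the page tables: pc_map has a bit set for every free slot, so ~pc_map[field] & pc_freemask[field] enumerates the occupied slots and ffsl() peels them off lowest bit first. A self-contained sketch of that bitmap idiom; the names mirror the kernel's, but the values are made up.

#include <stdio.h>
#include <stdint.h>

#define _NPCM 3
/* Bits that exist in each map word; the last word is partial. */
static const uint64_t freemask[_NPCM] = { ~0UL, ~0UL, 0x000000fffffffffful };

int
main(void)
{
	/* A set bit in map[] means "slot free"; clear means "in use". */
	uint64_t map[_NPCM] = { ~0UL ^ 0x5UL, ~0UL,
	    0x000000fffffffffful ^ (1UL << 9) };
	uint64_t inuse, bitmask;
	int bit, field, idx;

	for (field = 0; field < _NPCM; field++) {
		inuse = ~map[field] & freemask[field];
		while (inuse != 0) {
			bit = __builtin_ffsll(inuse) - 1; /* lowest set bit */
			bitmask = 1UL << bit;
			idx = field * 64 + bit;           /* slot index in chunk */
			printf("slot %d is in use\n", idx);
			inuse &= ~bitmask;                /* consume it */
		}
	}
	return (0);
}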
*/ if (pmap_page_dirty(tl3)) vm_page_dirty(m); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; pmap_unuse_l3(pmap, pv->pv_va, ptepde, &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } pmap_invalidate_all(pmap); if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * This is used to check if a page has been accessed or modified. As we * don't have a bit to see if it has been modified we have to assume it * has been if the page is read/write. */ static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; pt_entry_t *l3, mask, value; pmap_t pmap; int md_gen; boolean_t rv; rv = FALSE; rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l3 = pmap_l3(pmap, pv->pv_va); mask = 0; value = 0; if (modified) { - mask |= PTE_DIRTY; - value |= PTE_DIRTY; + mask |= PTE_D; + value |= PTE_D; } if (accessed) { - mask |= PTE_REF; - value |= PTE_REF; + mask |= PTE_A; + value |= PTE_A; } #if 0 if (modified) { mask |= ATTR_AP_RW_BIT; value |= ATTR_AP(ATTR_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L3_PAGE; } #endif rv = (pmap_load(l3) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } out: rw_runlock(lock); rw_runlock(&pvh_global_lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *l3; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); l3 = pmap_l3(pmap, addr); if (l3 != NULL && pmap_load(l3) != 0) { rv = TRUE; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. 
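pmap_page_test_mappings() above (like the PTE_V | PTE_RWX construction in pmap_enter_quick_locked() earlier) now uses the bit names from the RISC-V privileged specification, and a single load of the PTE answers "accessed?", "modified?", or both via a mask/value pair. A hedged sketch of that test: the bit positions (A=6, D=7 in the Sv39 PTE) follow the spec but are written as local defines here, not taken from the kernel headers.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Sv39 PTE bits (local, illustrative defines). */
#define PTE_V (1UL << 0)	/* valid */
#define PTE_A (1UL << 6)	/* accessed */
#define PTE_D (1UL << 7)	/* dirty */

static bool
pte_test(uint64_t pte, bool accessed, bool modified)
{
	uint64_t mask = 0, value = 0;

	if (modified) { mask |= PTE_D; value |= PTE_D; }
	if (accessed) { mask |= PTE_A; value |= PTE_A; }
	/* Every requested bit must be set for the test to pass. */
	return ((pte & mask) == value);
}

int
main(void)
{
	uint64_t pte = PTE_V | PTE_A;	/* referenced but clean */

	printf("accessed: %d\n", pte_test(pte, true, false));	/* 1 */
	printf("modified: %d\n", pte_test(pte, false, true));	/* 0 */
	return (0);
}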
*/ void pmap_remove_write(vm_page_t m) { pmap_t pmap; struct rwlock *lock; pv_entry_t pv; pt_entry_t *l3, oldl3; pt_entry_t newl3; int md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } l3 = pmap_l3(pmap, pv->pv_va); retry: oldl3 = pmap_load(l3); if (pmap_is_write(oldl3)) { - newl3 = oldl3 & ~(1 << PTE_TYPE_S); + newl3 = oldl3 & ~(PTE_W); if (!atomic_cmpset_long(l3, oldl3, newl3)) goto retry; /* TODO: use pmap_page_dirty(oldl3) ? */ - if ((oldl3 & PTE_REF) != 0) + if ((oldl3 & PTE_A) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); rw_runlock(&pvh_global_lock); } static __inline boolean_t safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) { return (FALSE); } #define PMAP_TS_REFERENCED_MAX 5 /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t *l2; pt_entry_t *l3; vm_paddr_t pa; int cleared, md_gen, not_cleared; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); rw_rlock(&pvh_global_lock); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } l2 = pmap_l2(pmap, pv->pv_va); - KASSERT((pmap_load(l2) & PTE_TYPE_M) == (PTE_TYPE_PTR << PTE_TYPE_S), + KASSERT((pmap_load(l2) & PTE_RX) == 0, ("pmap_ts_referenced: found an invalid l2 table")); l3 = pmap_l2_to_l3(l2, pv->pv_va); - if ((pmap_load(l3) & PTE_REF) != 0) { + if ((pmap_load(l3) & PTE_A) != 0) { if (safe_to_clear_referenced(pmap, pmap_load(l3))) { /* * TODO: We don't handle the access flag * at all. We need to be able to set it in * the exception handler. */ panic("RISCVTODO: safe_to_clear_referenced\n"); } else if ((pmap_load(l3) & PTE_SW_WIRED) == 0) { /* * Wired pages cannot be paged out so * doing accessed bit emulation for * them is wasted effort. We do the * hard work for unwired pages only. 
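pmap_ts_referenced() caps its work at PMAP_TS_REFERENCED_MAX cleared bits and, after visiting a mapping, rotates the PV list so the next call starts somewhere else (the rotation appears just below). A small userspace sketch of that rotation idiom using <sys/queue.h>; the entry type and values are invented for illustration.

#include <stdio.h>
#include <sys/queue.h>

struct ent {
	int id;
	TAILQ_ENTRY(ent) link;
};
TAILQ_HEAD(entlist, ent);

int
main(void)
{
	struct entlist list = TAILQ_HEAD_INITIALIZER(list);
	struct ent e[3] = { { .id = 1 }, { .id = 2 }, { .id = 3 } }, *p;
	int i;

	for (i = 0; i < 3; i++)
		TAILQ_INSERT_TAIL(&list, &e[i], link);

	/* Rotate: move the head entry to the tail, as the PV scan does
	 * after visiting a mapping, so later scans resume elsewhere. */
	p = TAILQ_FIRST(&list);
	TAILQ_REMOVE(&list, p, link);
	TAILQ_INSERT_TAIL(&list, p, link);

	TAILQ_FOREACH(p, &list, link)
		printf("%d ", p->id);	/* prints: 2 3 1 */
	printf("\n");
	return (0);
}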
*/ pmap_remove_l3(pmap, l3, pv->pv_va, pmap_load(l2), &free, &lock); pmap_invalidate_page(pmap, pv->pv_va); cleared++; if (pvf == pv) pvf = NULL; pv = NULL; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); rw_runlock(&pvh_global_lock); pmap_free_zero_pages(&free); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; /* RISCVTODO: We lack support for tracking if a page is modified */ } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return ((void *)PHYS_TO_DMAP(pa)); } void pmap_unmapbios(vm_paddr_t pa, vm_size_t size) { } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pv_memattr = ma; /* * RISCVTODO: Implement the below (from the amd64 pmap) * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && PHYS_IN_DMAP(VM_PAGE_TO_PHYS(m))) panic("RISCVTODO: pmap_page_set_memattr"); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { panic("RISCVTODO: pmap_mincore"); } void pmap_activate(struct thread *td) { pmap_t pmap; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); td->td_pcb->pcb_l1addr = vtophys(pmap->pm_l1); - __asm __volatile("csrw sptbr, %0" :: "r"(td->td_pcb->pcb_l1addr)); + __asm __volatile("csrw sptbr, %0" :: "r"(td->td_pcb->pcb_l1addr >> PAGE_SHIFT)); pmap_invalidate_all(pmap); critical_exit(); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { panic("RISCVTODO: pmap_sync_icache"); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. 
* \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; int error, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= DMAP_MAX_PHYSADDR) { panic( "pmap_map_io_transient: TODO: Map out of DMAP data"); } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= DMAP_MAX_PHYSADDR) { panic("RISCVTODO: pmap_unmap_io_transient: Unmap data"); } } } Index: user/alc/PQ_LAUNDRY/sys/riscv/riscv/swtch.S =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/riscv/swtch.S (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/riscv/swtch.S (revision 303667) @@ -1,274 +1,277 @@ /*- * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "assym.s" #include "opt_sched.h" #include #include #include #include __FBSDID("$FreeBSD$"); /* * void cpu_throw(struct thread *old, struct thread *new) */ ENTRY(cpu_throw) /* Store the new curthread */ sd a1, PC_CURTHREAD(gp) /* And the new pcb */ ld x13, TD_PCB(a1) sd x13, PC_CURPCB(gp) sfence.vm /* Switch to the new pmap */ ld t0, PCB_L1ADDR(x13) + srli t0, t0, PAGE_SHIFT csrw sptbr, t0 /* TODO: Invalidate the TLB */ sfence.vm /* Load registers */ ld ra, (PCB_RA)(x13) ld sp, (PCB_SP)(x13) /* s[0-11] */ ld s0, (PCB_S + 0 * 8)(x13) ld s1, (PCB_S + 1 * 8)(x13) ld s2, (PCB_S + 2 * 8)(x13) ld s3, (PCB_S + 3 * 8)(x13) ld s4, (PCB_S + 4 * 8)(x13) ld s5, (PCB_S + 5 * 8)(x13) ld s6, (PCB_S + 6 * 8)(x13) ld s7, (PCB_S + 7 * 8)(x13) ld s8, (PCB_S + 8 * 8)(x13) ld s9, (PCB_S + 9 * 8)(x13) ld s10, (PCB_S + 10 * 8)(x13) ld s11, (PCB_S + 11 * 8)(x13) + ret .Lcpu_throw_panic_str: .asciz "cpu_throw: %p\0" END(cpu_throw) /* * void cpu_switch(struct thread *old, struct thread *new, struct mtx *mtx) * * a0 = old * a1 = new * a2 = mtx * x3 to x7, x16 and x17 are caller saved */ ENTRY(cpu_switch) /* Store the new curthread */ sd a1, PC_CURTHREAD(gp) /* And the new pcb */ ld x13, TD_PCB(a1) sd x13, PC_CURPCB(gp) /* Save the old context. */ ld x13, TD_PCB(a0) /* Store ra, sp and the callee-saved registers */ sd ra, (PCB_RA)(x13) sd sp, (PCB_SP)(x13) /* s[0-11] */ sd s0, (PCB_S + 0 * 8)(x13) sd s1, (PCB_S + 1 * 8)(x13) sd s2, (PCB_S + 2 * 8)(x13) sd s3, (PCB_S + 3 * 8)(x13) sd s4, (PCB_S + 4 * 8)(x13) sd s5, (PCB_S + 5 * 8)(x13) sd s6, (PCB_S + 6 * 8)(x13) sd s7, (PCB_S + 7 * 8)(x13) sd s8, (PCB_S + 8 * 8)(x13) sd s9, (PCB_S + 9 * 8)(x13) sd s10, (PCB_S + 10 * 8)(x13) sd s11, (PCB_S + 11 * 8)(x13) /* * Restore the saved context. */ ld x13, TD_PCB(a1) /* * TODO: We may need to flush the cache here if switching * to a user process. 
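The srli added before each csrw above reflects that sptbr takes the physical page number of the root page table rather than its byte address, so the value loaded from PCB_L1ADDR must be shifted right by PAGE_SHIFT first. A trivial C model of that computation under assumed constants (4 KiB pages):

#include <assert.h>
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12	/* 4 KiB pages */

/* sptbr wants the PPN of the root page table, not a byte address. */
static uint64_t
sptbr_value(uint64_t l1_phys)
{
	/* The root table is page aligned, so no bits are lost. */
	assert((l1_phys & ((1UL << PAGE_SHIFT) - 1)) == 0);
	return (l1_phys >> PAGE_SHIFT);
}

int
main(void)
{
	uint64_t l1 = 0x80342000UL;	/* page-aligned root table address */

	printf("csrw sptbr, %#lx\n", (unsigned long)sptbr_value(l1));
	return (0);
}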
*/ sfence.vm /* Switch to the new pmap */ ld t0, PCB_L1ADDR(x13) + srli t0, t0, PAGE_SHIFT csrw sptbr, t0 /* TODO: Invalidate the TLB */ sfence.vm /* Release the old thread */ sd a2, TD_LOCK(a0) #if defined(SCHED_ULE) && defined(SMP) /* Spin if TD_LOCK points to a blocked_lock */ la a2, _C_LABEL(blocked_lock) 1: ld t0, TD_LOCK(a1) beq t0, a2, 1b #endif /* Restore the registers */ ld ra, (PCB_RA)(x13) ld sp, (PCB_SP)(x13) /* s[0-11] */ ld s0, (PCB_S + 0 * 8)(x13) ld s1, (PCB_S + 1 * 8)(x13) ld s2, (PCB_S + 2 * 8)(x13) ld s3, (PCB_S + 3 * 8)(x13) ld s4, (PCB_S + 4 * 8)(x13) ld s5, (PCB_S + 5 * 8)(x13) ld s6, (PCB_S + 6 * 8)(x13) ld s7, (PCB_S + 7 * 8)(x13) ld s8, (PCB_S + 8 * 8)(x13) ld s9, (PCB_S + 9 * 8)(x13) ld s10, (PCB_S + 10 * 8)(x13) ld s11, (PCB_S + 11 * 8)(x13) ret .Lcpu_switch_panic_str: .asciz "cpu_switch: %p\0" END(cpu_switch) /* * fork_exit(void (*callout)(void *, struct trapframe *), void *arg, * struct trapframe *frame) */ ENTRY(fork_trampoline) mv a0, s0 mv a1, s1 mv a2, sp call _C_LABEL(fork_exit) /* Restore sstatus */ ld t0, (TF_SSTATUS)(sp) /* Ensure interrupts disabled */ - li t1, ~SSTATUS_IE + li t1, ~SSTATUS_SIE and t0, t0, t1 csrw sstatus, t0 /* Restore exception program counter */ ld t0, (TF_SEPC)(sp) csrw sepc, t0 /* Restore the registers */ ld t0, (TF_T + 0 * 8)(sp) ld t1, (TF_T + 1 * 8)(sp) ld t2, (TF_T + 2 * 8)(sp) ld t3, (TF_T + 3 * 8)(sp) ld t4, (TF_T + 4 * 8)(sp) ld t5, (TF_T + 5 * 8)(sp) ld t6, (TF_T + 6 * 8)(sp) ld s0, (TF_S + 0 * 8)(sp) ld s1, (TF_S + 1 * 8)(sp) ld s2, (TF_S + 2 * 8)(sp) ld s3, (TF_S + 3 * 8)(sp) ld s4, (TF_S + 4 * 8)(sp) ld s5, (TF_S + 5 * 8)(sp) ld s6, (TF_S + 6 * 8)(sp) ld s7, (TF_S + 7 * 8)(sp) ld s8, (TF_S + 8 * 8)(sp) ld s9, (TF_S + 9 * 8)(sp) ld s10, (TF_S + 10 * 8)(sp) ld s11, (TF_S + 11 * 8)(sp) ld a0, (TF_A + 0 * 8)(sp) ld a1, (TF_A + 1 * 8)(sp) ld a2, (TF_A + 2 * 8)(sp) ld a3, (TF_A + 3 * 8)(sp) ld a4, (TF_A + 4 * 8)(sp) ld a5, (TF_A + 5 * 8)(sp) ld a6, (TF_A + 6 * 8)(sp) ld a7, (TF_A + 7 * 8)(sp) /* Load user ra and sp */ ld tp, (TF_TP)(sp) ld ra, (TF_RA)(sp) /* * Store our pcpup on stack, we will load it back * on kernel mode trap. */ sd gp, (TF_SIZE)(sp) ld gp, (TF_GP)(sp) /* Save kernel stack so we can use it doing a user trap */ addi sp, sp, TF_SIZE csrw sscratch, sp /* Load user stack */ ld sp, (TF_SP - TF_SIZE)(sp) - eret + sret END(fork_trampoline) ENTRY(savectx) /* Store ra, sp and the callee-saved registers */ sd ra, (PCB_RA)(a0) sd sp, (PCB_SP)(a0) /* s[0-11] */ sd s0, (PCB_S + 0 * 8)(a0) sd s1, (PCB_S + 1 * 8)(a0) sd s2, (PCB_S + 2 * 8)(a0) sd s3, (PCB_S + 3 * 8)(a0) sd s4, (PCB_S + 4 * 8)(a0) sd s5, (PCB_S + 5 * 8)(a0) sd s6, (PCB_S + 6 * 8)(a0) sd s7, (PCB_S + 7 * 8)(a0) sd s8, (PCB_S + 8 * 8)(a0) sd s9, (PCB_S + 9 * 8)(a0) sd s10, (PCB_S + 10 * 8)(a0) sd s11, (PCB_S + 11 * 8)(a0) /* Store the VFP registers */ #ifdef VFP /* TODO */ #endif ret END(savectx) Index: user/alc/PQ_LAUNDRY/sys/riscv/riscv/timer.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/riscv/timer.c (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/riscv/timer.c (revision 303667) @@ -1,299 +1,318 @@ /*- - * Copyright (c) 2015 Ruslan Bukin + * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. 
* * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * RISC-V Timer */ #include "opt_platform.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEFAULT_FREQ 1000000 +#define TIMER_COUNTS 0x00 +#define TIMER_MTIMECMP(cpu) (0x08 + (cpu * 8)) + +#define READ8(_sc, _reg) \ + bus_space_read_8(_sc->bst, _sc->bsh, _reg) +#define WRITE8(_sc, _reg, _val) \ + bus_space_write_8(_sc->bst, _sc->bsh, _reg, _val) + struct riscv_tmr_softc { - struct resource *res[1]; - void *ihl[1]; + struct resource *res[2]; + bus_space_tag_t bst; + bus_space_handle_t bsh; + void *ih; uint32_t clkfreq; struct eventtimer et; }; static struct riscv_tmr_softc *riscv_tmr_sc = NULL; static struct resource_spec timer_spec[] = { + { SYS_RES_MEMORY, 0, RF_ACTIVE }, { SYS_RES_IRQ, 0, RF_ACTIVE }, { -1, 0 } }; static timecounter_get_t riscv_tmr_get_timecount; static struct timecounter riscv_tmr_timecount = { .tc_name = "RISC-V Timecounter", .tc_get_timecount = riscv_tmr_get_timecount, .tc_poll_pps = NULL, .tc_counter_mask = ~0u, .tc_frequency = 0, .tc_quality = 1000, }; static long -get_counts(void) +get_counts(struct riscv_tmr_softc *sc) { - return (csr_read(stime)); + return (READ8(sc, TIMER_COUNTS)); } static unsigned riscv_tmr_get_timecount(struct timecounter *tc) { + struct riscv_tmr_softc *sc; - return (get_counts()); + sc = tc->tc_priv; + + return (get_counts(sc)); } static int riscv_tmr_start(struct eventtimer *et, sbintime_t first, sbintime_t period) { struct riscv_tmr_softc *sc; - int counts; + uint64_t counts; + int cpu; sc = (struct riscv_tmr_softc *)et->et_priv; if (first != 0) { counts = ((uint32_t)et->et_frequency * first) >> 32; + counts += READ8(sc, TIMER_COUNTS); + cpu = PCPU_GET(cpuid); + WRITE8(sc, TIMER_MTIMECMP(cpu), counts); + csr_set(sie, SIE_STIE); machine_command(ECALL_MTIMECMP, counts); + return (0); } return (EINVAL); } static int riscv_tmr_stop(struct eventtimer *et) { struct riscv_tmr_softc *sc; sc = (struct riscv_tmr_softc *)et->et_priv; /* TODO */ 
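riscv_tmr_start() above turns the sbintime_t deadline into timer ticks with a 32.32 fixed-point multiply: et_frequency * first is a wide product whose upper 32 bits, after the >> 32, are whole ticks, which are then added to the current count and written to the per-CPU mtimecmp slot. A standalone check of that arithmetic with illustrative values:

#include <stdio.h>
#include <stdint.h>

/* sbintime_t is a 32.32 fixed-point seconds value in FreeBSD. */
typedef int64_t sbintime_t;
#define SBT_1S	((sbintime_t)1 << 32)

int
main(void)
{
	uint64_t freq = 1000000;	/* 1 MHz timer */
	sbintime_t first = SBT_1S / 2;	/* fire in 0.5 s */
	uint64_t counts;

	/* (freq * first) >> 32 == freq * (first / 2^32) in integer math. */
	counts = (freq * (uint64_t)first) >> 32;
	printf("%llu ticks (expect 500000)\n", (unsigned long long)counts);
	return (0);
}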
return (0); } static int riscv_tmr_intr(void *arg) { struct riscv_tmr_softc *sc; sc = (struct riscv_tmr_softc *)arg; - /* - * Clear interrupt pending bit. - * Note: SIP_STIP bit is not implemented in sip register - * in Spike simulator, so use machine command to clear - * interrupt pending bit in mip. - */ - machine_command(ECALL_CLEAR_PENDING, 0); + csr_clear(sip, SIP_STIP); if (sc->et.et_active) sc->et.et_event_cb(&sc->et, sc->et.et_arg); return (FILTER_HANDLED); } static int riscv_tmr_fdt_probe(device_t dev) { if (!ofw_bus_status_okay(dev)) return (ENXIO); if (ofw_bus_is_compatible(dev, "riscv,timer")) { device_set_desc(dev, "RISC-V Timer"); return (BUS_PROBE_DEFAULT); } return (ENXIO); } static int riscv_tmr_attach(device_t dev) { struct riscv_tmr_softc *sc; phandle_t node; pcell_t clock; int error; sc = device_get_softc(dev); if (riscv_tmr_sc) return (ENXIO); /* Get the base clock frequency */ node = ofw_bus_get_node(dev); if (node > 0) { error = OF_getprop(node, "clock-frequency", &clock, sizeof(clock)); if (error > 0) { sc->clkfreq = fdt32_to_cpu(clock); } } if (sc->clkfreq == 0) sc->clkfreq = DEFAULT_FREQ; if (sc->clkfreq == 0) { device_printf(dev, "No clock frequency specified\n"); return (ENXIO); } if (bus_alloc_resources(dev, timer_spec, sc->res)) { device_printf(dev, "could not allocate resources\n"); return (ENXIO); } + /* Memory interface */ + sc->bst = rman_get_bustag(sc->res[0]); + sc->bsh = rman_get_bushandle(sc->res[0]); + riscv_tmr_sc = sc; /* Setup IRQs handler */ - error = bus_setup_intr(dev, sc->res[0], INTR_TYPE_CLK, - riscv_tmr_intr, NULL, sc, &sc->ihl[0]); + error = bus_setup_intr(dev, sc->res[1], INTR_TYPE_CLK, + riscv_tmr_intr, NULL, sc, &sc->ih); if (error) { device_printf(dev, "Unable to alloc int resource.\n"); return (ENXIO); } riscv_tmr_timecount.tc_frequency = sc->clkfreq; + riscv_tmr_timecount.tc_priv = sc; tc_init(&riscv_tmr_timecount); sc->et.et_name = "RISC-V Eventtimer"; sc->et.et_flags = ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU; sc->et.et_quality = 1000; sc->et.et_frequency = sc->clkfreq; sc->et.et_min_period = (0x00000002LLU << 32) / sc->et.et_frequency; sc->et.et_max_period = (0xfffffffeLLU << 32) / sc->et.et_frequency; sc->et.et_start = riscv_tmr_start; sc->et.et_stop = riscv_tmr_stop; sc->et.et_priv = sc; et_register(&sc->et); return (0); } static device_method_t riscv_tmr_fdt_methods[] = { DEVMETHOD(device_probe, riscv_tmr_fdt_probe), DEVMETHOD(device_attach, riscv_tmr_attach), { 0, 0 } }; static driver_t riscv_tmr_fdt_driver = { "timer", riscv_tmr_fdt_methods, sizeof(struct riscv_tmr_softc), }; static devclass_t riscv_tmr_fdt_devclass; EARLY_DRIVER_MODULE(timer, simplebus, riscv_tmr_fdt_driver, riscv_tmr_fdt_devclass, 0, 0, BUS_PASS_TIMER + BUS_PASS_ORDER_MIDDLE); EARLY_DRIVER_MODULE(timer, ofwbus, riscv_tmr_fdt_driver, riscv_tmr_fdt_devclass, 0, 0, BUS_PASS_TIMER + BUS_PASS_ORDER_MIDDLE); void DELAY(int usec) { - int32_t counts, counts_per_usec; - uint32_t first, last; + int64_t counts, counts_per_usec; + uint64_t first, last; /* * Check the timers are setup, if not just * use a for loop for the meantime */ if (riscv_tmr_sc == NULL) { for (; usec > 0; usec--) for (counts = 200; counts > 0; counts--) /* * Prevent the compiler from optimizing * out the loop */ cpufunc_nullop(); return; } /* Get the number of times to count */ counts_per_usec = ((riscv_tmr_timecount.tc_frequency / 1000000) + 1); /* * Clamp the timeout at a maximum value (about 32 seconds with * a 66MHz clock). 
*Nobody* should be delay()ing for anywhere * near that length of time and if they are, they should be hung * out to dry. */ if (usec >= (0x80000000U / counts_per_usec)) counts = (0x80000000U / counts_per_usec) - 1; else counts = usec * counts_per_usec; - first = get_counts(); + first = get_counts(riscv_tmr_sc); while (counts > 0) { - last = get_counts(); - counts -= (int32_t)(last - first); + last = get_counts(riscv_tmr_sc); + counts -= (int64_t)(last - first); first = last; } } Index: user/alc/PQ_LAUNDRY/sys/riscv/riscv/trap.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/riscv/trap.c (revision 303666) +++ user/alc/PQ_LAUNDRY/sys/riscv/riscv/trap.c (revision 303667) @@ -1,361 +1,373 @@ /*- - * Copyright (c) 2015 Ruslan Bukin + * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #ifdef KDB #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef KDTRACE_HOOKS #include #endif int (*dtrace_invop_jump_addr)(struct trapframe *); extern register_t fsu_intr_fault; /* Called from exception.S */ void do_trap_supervisor(struct trapframe *); void do_trap_user(struct trapframe *); static __inline void call_trapsignal(struct thread *td, int sig, int code, void *addr) { ksiginfo_t ksi; ksiginfo_init_trap(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = code; ksi.ksi_addr = addr; trapsignal(td, &ksi); } int cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa) { struct proc *p; register_t *ap; int nap; nap = 8; p = td->td_proc; ap = &td->td_frame->tf_a[0]; sa->code = td->td_frame->tf_t[0]; if (sa->code == SYS_syscall || sa->code == SYS___syscall) { sa->code = *ap++; nap--; } if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; memcpy(sa->args, ap, nap * sizeof(register_t)); if (sa->narg > nap) panic("TODO: Could we have more than 8 args?"); td->td_retval[0] = 0; td->td_retval[1] = 0; return (0); } #include "../../kern/subr_syscall.c" static void dump_regs(struct trapframe *frame) { int n; int i; n = (sizeof(frame->tf_t) / sizeof(frame->tf_t[0])); for (i = 0; i < n; i++) printf("t[%d] == 0x%016lx\n", i, frame->tf_t[i]); n = (sizeof(frame->tf_s) / sizeof(frame->tf_s[0])); for (i = 0; i < n; i++) printf("s[%d] == 0x%016lx\n", i, frame->tf_s[i]); n = (sizeof(frame->tf_a) / sizeof(frame->tf_a[0])); for (i = 0; i < n; i++) printf("a[%d] == 0x%016lx\n", i, frame->tf_a[i]); printf("sepc == 0x%016lx\n", frame->tf_sepc); printf("sstatus == 0x%016lx\n", frame->tf_sstatus); } static void svc_handler(struct trapframe *frame) { struct syscall_args sa; struct thread *td; int error; td = curthread; td->td_frame = frame; error = syscallenter(td, &sa); syscallret(td, error, &sa); } static void data_abort(struct trapframe *frame, int lower) { struct vm_map *map; uint64_t sbadaddr; struct thread *td; struct pcb *pcb; vm_prot_t ftype; vm_offset_t va; struct proc *p; int ucode; int error; int sig; #ifdef KDB if (kdb_active) { kdb_reenter(); return; } #endif td = curthread; pcb = td->td_pcb; /* * Special case for fuswintr and suswintr. These can't sleep so * handle them early on in the trap handler. */ if (__predict_false(pcb->pcb_onfault == (vm_offset_t)&fsu_intr_fault)) { frame->tf_sepc = pcb->pcb_onfault; return; } sbadaddr = frame->tf_sbadaddr; p = td->td_proc; if (lower) map = &td->td_proc->p_vmspace->vm_map; else { /* The top bit tells us which range to use */ if ((sbadaddr >> 63) == 1) map = kernel_map; else map = &td->td_proc->p_vmspace->vm_map; } va = trunc_page(sbadaddr); - if (frame->tf_scause == EXCP_STORE_ACCESS_FAULT) { + if (frame->tf_scause == EXCP_FAULT_STORE) { ftype = (VM_PROT_READ | VM_PROT_WRITE); } else { ftype = (VM_PROT_READ); } if (map != kernel_map) { /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* Fault in the user page: */ error = vm_fault(map, va, ftype, VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process locking or stacks in the * kernel.
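In data_abort() above, the faulting address itself picks the map: the canonical 64-bit address split puts kernel addresses in the upper half, so bit 63 of sbadaddr distinguishes kernel_map from the process vm_map. A minimal model of that test; the example addresses are made up.

#include <stdio.h>
#include <stdint.h>

static const char *
map_for(uint64_t badaddr)
{
	/* The top bit selects the address range. */
	return ((badaddr >> 63) == 1 ? "kernel_map" : "user vm_map");
}

int
main(void)
{
	printf("%#lx -> %s\n", (unsigned long)0x00000000deadbeefUL,
	    map_for(0x00000000deadbeefUL));	/* user vm_map */
	printf("%#lx -> %s\n", (unsigned long)0xffffffc000123456UL,
	    map_for(0xffffffc000123456UL));	/* kernel_map */
	return (0);
}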
*/ error = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (error != KERN_SUCCESS) { if (lower) { sig = SIGSEGV; if (error == KERN_PROTECTION_FAILURE) ucode = SEGV_ACCERR; else ucode = SEGV_MAPERR; call_trapsignal(td, sig, ucode, (void *)sbadaddr); } else { if (td->td_intr_nesting_level == 0 && pcb->pcb_onfault != 0) { frame->tf_a[0] = error; frame->tf_sepc = pcb->pcb_onfault; return; } dump_regs(frame); panic("vm_fault failed: %lx, va 0x%016lx", frame->tf_sepc, sbadaddr); } } if (lower) userret(td, frame); } void do_trap_supervisor(struct trapframe *frame) { uint64_t exception; + uint64_t sstatus; + /* Ensure we came from supervisor mode, interrupts disabled */ + __asm __volatile("csrr %0, sstatus" : "=&r" (sstatus)); + KASSERT((sstatus & (SSTATUS_SPP | SSTATUS_SIE)) == SSTATUS_SPP, + ("We must have come from S mode with interrupts disabled")); + exception = (frame->tf_scause & EXCP_MASK); if (frame->tf_scause & EXCP_INTR) { /* Interrupt */ riscv_cpu_intr(frame); return; } #ifdef KDTRACE_HOOKS if (dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, exception)) return; #endif CTR3(KTR_TRAP, "do_trap_supervisor: curthread: %p, sepc: %lx, frame: %p", curthread, frame->tf_sepc, frame); switch(exception) { - case EXCP_LOAD_ACCESS_FAULT: - case EXCP_STORE_ACCESS_FAULT: - case EXCP_INSTR_ACCESS_FAULT: + case EXCP_FAULT_LOAD: + case EXCP_FAULT_STORE: + case EXCP_FAULT_FETCH: data_abort(frame, 0); break; - case EXCP_INSTR_BREAKPOINT: + case EXCP_BREAKPOINT: #ifdef KDTRACE_HOOKS if (dtrace_invop_jump_addr != 0) { dtrace_invop_jump_addr(frame); break; } #endif #ifdef KDB kdb_trap(exception, 0, frame); #else dump_regs(frame); panic("No debugger in kernel.\n"); #endif break; - case EXCP_INSTR_ILLEGAL: + case EXCP_ILLEGAL_INSTRUCTION: dump_regs(frame); panic("Illegal instruction at 0x%016lx\n", frame->tf_sepc); break; default: dump_regs(frame); panic("Unknown kernel exception %x badaddr %lx\n", exception, frame->tf_sbadaddr); } } void do_trap_user(struct trapframe *frame) { uint64_t exception; struct thread *td; + uint64_t sstatus; td = curthread; td->td_frame = frame; + /* Ensure we came from usermode, interrupts disabled */ + __asm __volatile("csrr %0, sstatus" : "=&r" (sstatus)); + KASSERT((sstatus & (SSTATUS_SPP | SSTATUS_SIE)) == 0, + ("We must have come from U mode with interrupts disabled")); + exception = (frame->tf_scause & EXCP_MASK); if (frame->tf_scause & EXCP_INTR) { /* Interrupt */ riscv_cpu_intr(frame); return; } CTR3(KTR_TRAP, "do_trap_user: curthread: %p, sepc: %lx, frame: %p", curthread, frame->tf_sepc, frame); switch(exception) { - case EXCP_LOAD_ACCESS_FAULT: - case EXCP_STORE_ACCESS_FAULT: - case EXCP_INSTR_ACCESS_FAULT: + case EXCP_FAULT_LOAD: + case EXCP_FAULT_STORE: + case EXCP_FAULT_FETCH: data_abort(frame, 1); break; - case EXCP_UMODE_ENV_CALL: + case EXCP_USER_ECALL: frame->tf_sepc += 4; /* Next instruction */ svc_handler(frame); break; - case EXCP_INSTR_ILLEGAL: + case EXCP_ILLEGAL_INSTRUCTION: call_trapsignal(td, SIGILL, ILL_ILLTRP, (void *)frame->tf_sepc); userret(td, frame); break; - case EXCP_INSTR_BREAKPOINT: + case EXCP_BREAKPOINT: call_trapsignal(td, SIGTRAP, TRAP_BRKPT, (void *)frame->tf_sepc); userret(td, frame); break; default: dump_regs(frame); panic("Unknown userland exception %x badaddr %lx\n", exception, frame->tf_sbadaddr); } } Index: user/alc/PQ_LAUNDRY/sys/riscv/riscv/vm_machdep.c =================================================================== --- user/alc/PQ_LAUNDRY/sys/riscv/riscv/vm_machdep.c (revision 303666) +++
user/alc/PQ_LAUNDRY/sys/riscv/riscv/vm_machdep.c (revision 303667) @@ -1,252 +1,253 @@ /*- - * Copyright (c) 2015 Ruslan Bukin + * Copyright (c) 2015-2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Finish a fork operation, with process p2 nearly set up. * Copy and update the pcb, set up the stack so that the child * ready to run and return to user mode. */ void cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags) { struct pcb *pcb2; struct trapframe *tf; if ((flags & RFPROC) == 0) return; pcb2 = (struct pcb *)(td2->td_kstack + td2->td_kstack_pages * PAGE_SIZE) - 1; td2->td_pcb = pcb2; bcopy(td1->td_pcb, pcb2, sizeof(*pcb2)); td2->td_pcb->pcb_l1addr = vtophys(vmspace_pmap(td2->td_proc->p_vmspace)->pm_l1); tf = (struct trapframe *)STACKALIGN((struct trapframe *)pcb2 - 1); bcopy(td1->td_frame, tf, sizeof(*tf)); /* Clear syscall error flag */ tf->tf_t[0] = 0; /* Arguments for child */ tf->tf_a[0] = 0; tf->tf_a[1] = 0; - tf->tf_sstatus = SSTATUS_PIE; + tf->tf_sstatus = (SSTATUS_SPIE); + tf->tf_sstatus |= (MSTATUS_PRV_U << MSTATUS_SPP_SHIFT); td2->td_frame = tf; /* Set the return value registers for fork() */ td2->td_pcb->pcb_s[0] = (uintptr_t)fork_return; td2->td_pcb->pcb_s[1] = (uintptr_t)td2; td2->td_pcb->pcb_ra = (uintptr_t)fork_trampoline; td2->td_pcb->pcb_sp = (uintptr_t)td2->td_frame; /* Setup to release spin count in fork_exit(). 
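cpu_fork() above now seeds the child's sstatus explicitly: SPIE so that sret re-enables interrupts, and SPP set to the user encoding so that sret drops to U mode. A sketch of the bit assembly; the field positions below are assumptions written as local defines (a priv-1.9-era layout), not the kernel's headers.

#include <stdio.h>
#include <stdint.h>

/* Illustrative sstatus fields; positions assumed for the sketch. */
#define SSTATUS_SPIE		(1UL << 5)	/* restored to SIE by sret */
#define SSTATUS_SPP_SHIFT	8		/* previous privilege field */
#define PRV_U			0UL		/* user-mode encoding */

int
main(void)
{
	uint64_t sstatus;

	/* Child trapframe: return to U mode with interrupts enabled. */
	sstatus = SSTATUS_SPIE;
	sstatus |= (PRV_U << SSTATUS_SPP_SHIFT);
	printf("child tf_sstatus = %#lx\n", (unsigned long)sstatus);
	return (0);
}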
*/ td2->td_md.md_spinlock_count = 1; - td2->td_md.md_saved_sstatus_ie = 1; + td2->td_md.md_saved_sstatus_ie = (SSTATUS_SIE); } void cpu_reset(void) { printf("cpu_reset"); while(1) __asm volatile("wfi" ::: "memory"); } void cpu_thread_swapin(struct thread *td) { } void cpu_thread_swapout(struct thread *td) { } void cpu_set_syscall_retval(struct thread *td, int error) { struct trapframe *frame; frame = td->td_frame; switch (error) { case 0: frame->tf_a[0] = td->td_retval[0]; frame->tf_a[1] = td->td_retval[1]; frame->tf_t[0] = 0; /* syscall succeeded */ break; case ERESTART: frame->tf_sepc -= 4; /* prev instruction */ break; case EJUSTRETURN: break; default: frame->tf_a[0] = error; frame->tf_t[0] = 1; /* syscall error */ break; } } /* * Initialize machine state, mostly pcb and trap frame for a new * thread, about to return to userspace. Put enough state in the new * thread's PCB to get it to go back to the fork_return(), which * finalizes the thread state and handles peculiarities of the first * return to userspace for the new thread. */ void cpu_copy_thread(struct thread *td, struct thread *td0) { bcopy(td0->td_frame, td->td_frame, sizeof(struct trapframe)); bcopy(td0->td_pcb, td->td_pcb, sizeof(struct pcb)); td->td_pcb->pcb_s[0] = (uintptr_t)fork_return; td->td_pcb->pcb_s[1] = (uintptr_t)td; td->td_pcb->pcb_ra = (uintptr_t)fork_trampoline; td->td_pcb->pcb_sp = (uintptr_t)td->td_frame; /* Setup to release spin count in fork_exit(). */ td->td_md.md_spinlock_count = 1; - td->td_md.md_saved_sstatus_ie = 1; + td->td_md.md_saved_sstatus_ie = (SSTATUS_SIE); } /* * Set that machine state for performing an upcall that starts * the entry function with the given argument. */ void cpu_set_upcall(struct thread *td, void (*entry)(void *), void *arg, stack_t *stack) { struct trapframe *tf = td->td_frame; tf->tf_sp = STACKALIGN((uintptr_t)stack->ss_sp + stack->ss_size); tf->tf_sepc = (register_t)entry; tf->tf_a[0] = (register_t)arg; } int cpu_set_user_tls(struct thread *td, void *tls_base) { struct pcb *pcb; if ((uintptr_t)tls_base >= VM_MAXUSER_ADDRESS) return (EINVAL); pcb = td->td_pcb; pcb->pcb_tp = (register_t)tls_base; return (0); } void cpu_thread_exit(struct thread *td) { } void cpu_thread_alloc(struct thread *td) { td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_pages * PAGE_SIZE) - 1; td->td_frame = (struct trapframe *)STACKALIGN( (caddr_t)td->td_pcb - 8 - sizeof(struct trapframe)); } void cpu_thread_free(struct thread *td) { } void cpu_thread_clean(struct thread *td) { } /* * Intercept the return address from a freshly forked process that has NOT * been scheduled yet. * * This is needed to make kernel threads stay in kernel mode. */ void cpu_fork_kthread_handler(struct thread *td, void (*func)(void *), void *arg) { td->td_pcb->pcb_s[0] = (uintptr_t)func; td->td_pcb->pcb_s[1] = (uintptr_t)arg; td->td_pcb->pcb_ra = (uintptr_t)fork_trampoline; td->td_pcb->pcb_sp = (uintptr_t)td->td_frame; } void cpu_exit(struct thread *td) { } void swi_vm(void *v) { /* Nothing to do here - busdma bounce buffers are not implemented. 
*/ } Index: user/alc/PQ_LAUNDRY/usr.bin/sed/Makefile =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/sed/Makefile (revision 303666) +++ user/alc/PQ_LAUNDRY/usr.bin/sed/Makefile (revision 303667) @@ -1,15 +1,15 @@ # @(#)Makefile 8.1 (Berkeley) 6/6/93 # $FreeBSD$ .include PROG= sed SRCS= compile.c main.c misc.c process.c -WARNS?= 5 +WARNS?= 2 .if ${MK_TESTS} != "no" SUBDIR+= tests .endif .include Index: user/alc/PQ_LAUNDRY/usr.bin/sed/compile.c =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/sed/compile.c (revision 303666) +++ user/alc/PQ_LAUNDRY/usr.bin/sed/compile.c (revision 303667) @@ -1,945 +1,949 @@ /*- * Copyright (c) 1992 Diomidis Spinellis. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Diomidis Spinellis of Imperial College, University of London. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #ifndef lint static const char sccsid[] = "@(#)compile.c 8.1 (Berkeley) 6/6/93"; #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "defs.h" #include "extern.h" #define LHSZ 128 #define LHMASK (LHSZ - 1) static struct labhash { struct labhash *lh_next; u_int lh_hash; struct s_command *lh_cmd; int lh_ref; } *labels[LHSZ]; -static const char *compile_addr(const char *, struct s_addr *); -static char *compile_ccl(const char **, char *); -static const char *compile_delimited(const char *, char *, int); -static const char *compile_flags(const char *, struct s_subst *); -static const regex_t *compile_re(const char *, int); -static const char *compile_subst(const char *, struct s_subst *); -static char *compile_text(size_t *); -static const char *compile_tr(const char *, struct s_tr **); +static char *compile_addr(char *, struct s_addr *); +static char *compile_ccl(char **, char *); +static char *compile_delimited(char *, char *, int); +static char *compile_flags(char *, struct s_subst *); +static regex_t *compile_re(char *, int); +static char *compile_subst(char *, struct s_subst *); +static char *compile_text(void); +static char *compile_tr(char *, struct s_tr **); static struct s_command **compile_stream(struct s_command **); -static char *duptoeol(const char *, const char *, size_t *); +static char *duptoeol(char *, const char *); static void enterlabel(struct s_command *); static struct s_command - *findlabel(const char *); -static void fixuplabel(struct s_command *, const struct s_command *); + *findlabel(char *); +static void fixuplabel(struct s_command *, struct s_command *); static void uselabel(void); /* * Command specification. This is used to drive the command parser. */ struct s_format { char code; /* Command code */ int naddr; /* Number of address args */ enum e_args args; /* Argument type */ }; static struct s_format cmd_fmts[] = { {'{', 2, GROUP}, {'}', 0, ENDGROUP}, {'a', 1, TEXT}, {'b', 2, BRANCH}, {'c', 2, TEXT}, {'d', 2, EMPTY}, {'D', 2, EMPTY}, {'g', 2, EMPTY}, {'G', 2, EMPTY}, {'h', 2, EMPTY}, {'H', 2, EMPTY}, {'i', 1, TEXT}, {'l', 2, EMPTY}, {'n', 2, EMPTY}, {'N', 2, EMPTY}, {'p', 2, EMPTY}, {'P', 2, EMPTY}, {'q', 1, EMPTY}, {'r', 1, RFILE}, {'s', 2, SUBST}, {'t', 2, BRANCH}, {'w', 2, WFILE}, {'x', 2, EMPTY}, {'y', 2, TR}, {'!', 2, NONSEL}, {':', 0, LABEL}, {'#', 0, COMMENT}, {'=', 1, EMPTY}, {'\0', 0, COMMENT}, }; /* The compiled program. */ struct s_command *prog; /* * Compile the program into prog. * Initialise appends. 
*/ void compile(void) { *compile_stream(&prog) = NULL; fixuplabel(prog, NULL); uselabel(); if (appendnum == 0) appends = NULL; else if ((appends = malloc(sizeof(struct s_appends) * appendnum)) == NULL) err(1, "malloc"); if ((match = malloc((maxnsub + 1) * sizeof(regmatch_t))) == NULL) err(1, "malloc"); } -#define EATSPACE() do { \ - while (*p && isspace((unsigned char)*p)) \ - p++; \ +#define EATSPACE() do { \ + if (p) \ + while (*p && isspace((unsigned char)*p)) \ + p++; \ } while (0) -#define EATSPACEN() do { \ - while (*p && *p != '\n' && isspace((unsigned char)*p)) \ - p++; \ - } while (0) - static struct s_command ** compile_stream(struct s_command **link) { - const char *p; + char *p; + static char lbuf[_POSIX2_LINE_MAX + 1]; /* To save stack */ struct s_command *cmd, *cmd2, *stack; struct s_format *fp; char re[_POSIX2_LINE_MAX + 1]; int naddr; /* Number of addresses */ stack = NULL; for (;;) { - if ((p = cu_fgets(NULL)) == NULL) { + if ((p = cu_fgets(lbuf, sizeof(lbuf), NULL)) == NULL) { if (stack != NULL) errx(1, "%lu: %s: unexpected EOF (pending }'s)", linenum, fname); return (link); } -semicolon: EATSPACEN(); - switch (*p) { - case '#': case '\0': case '\n': - continue; /* to next command-unit */ - case ';': - p++; - goto semicolon; +semicolon: EATSPACE(); + if (p) { + if (*p == '#' || *p == '\0') + continue; + else if (*p == ';') { + p++; + goto semicolon; + } } - if ((*link = cmd = malloc(sizeof(struct s_command))) == NULL) err(1, "malloc"); link = &cmd->next; cmd->startline = cmd->nonsel = 0; /* First parse the addresses */ naddr = 0; /* Valid characters to start an address */ #define addrchar(c) (strchr("0123456789/\\$", (c))) if (addrchar(*p)) { naddr++; if ((cmd->a1 = malloc(sizeof(struct s_addr))) == NULL) err(1, "malloc"); p = compile_addr(p, cmd->a1); EATSPACE(); /* EXTENSION */ if (*p == ',') { p++; EATSPACE(); /* EXTENSION */ naddr++; if ((cmd->a2 = malloc(sizeof(struct s_addr))) == NULL) err(1, "malloc"); p = compile_addr(p, cmd->a2); EATSPACE(); } else cmd->a2 = NULL; } else cmd->a1 = cmd->a2 = NULL; nonsel: /* Now parse the command */ - if (*p == '\0' || *p == '\n') + if (!*p) errx(1, "%lu: %s: command expected", linenum, fname); cmd->code = *p; for (fp = cmd_fmts; fp->code; fp++) if (fp->code == *p) break; if (!fp->code) - errx(1, "%lu: %s: invalid command code %c (%s)", linenum, fname, *p, p); + errx(1, "%lu: %s: invalid command code %c", linenum, fname, *p); if (naddr > fp->naddr) errx(1, "%lu: %s: command %c expects up to %d address(es), found %d", linenum, fname, *p, fp->naddr, naddr); switch (fp->args) { case NONSEL: /* ! */ p++; EATSPACE(); cmd->nonsel = 1; goto nonsel; case GROUP: /* { */ p++; - EATSPACEN(); + EATSPACE(); cmd->next = stack; stack = cmd; link = &cmd->u.c; - if (*p != '\0' && *p != '\n') + if (*p) goto semicolon; break; case ENDGROUP: /* * Short-circuit command processing, since end of * group is really just a noop. 
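The parser above is table driven: each command letter maps to an s_format entry that records how many addresses the command accepts and which argument parser to run, and the lookup is a simple scan to the NUL sentinel. A condensed standalone version of that dispatch, with only a few commands included:

#include <stdio.h>

enum e_args { EMPTY, TEXT, BRANCH, SUBST };

struct s_format {
	char code;		/* command letter */
	int naddr;		/* max number of addresses */
	enum e_args args;	/* which argument parser to run */
};

static struct s_format cmd_fmts[] = {
	{ 'd', 2, EMPTY }, { 'a', 1, TEXT }, { 'b', 2, BRANCH },
	{ 's', 2, SUBST }, { '\0', 0, EMPTY },
};

int
main(void)
{
	struct s_format *fp;
	char cmd = 's';
	int naddr = 2;

	for (fp = cmd_fmts; fp->code; fp++)	/* sentinel-terminated scan */
		if (fp->code == cmd)
			break;
	if (!fp->code)
		printf("invalid command code %c\n", cmd);
	else if (naddr > fp->naddr)
		printf("command %c expects up to %d address(es)\n",
		    cmd, fp->naddr);
	else
		printf("dispatch argument parser %d for '%c'\n",
		    fp->args, cmd);
	return (0);
}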
*/ cmd->nonsel = 1; if (stack == NULL) errx(1, "%lu: %s: unexpected }", linenum, fname); cmd2 = stack; stack = cmd2->next; cmd2->next = cmd; /*FALLTHROUGH*/ case EMPTY: /* d D g G h H l n N p P q x = \0 */ p++; - EATSPACEN(); + EATSPACE(); if (*p == ';') { p++; link = &cmd->next; goto semicolon; } - if (*p != '\0' && *p != '\n') + if (*p) errx(1, "%lu: %s: extra characters at the end of %c command", linenum, fname, cmd->code); break; case TEXT: /* a c i */ p++; EATSPACE(); if (*p != '\\') errx(1, "%lu: %s: command %c expects \\ followed by text", linenum, fname, cmd->code); p++; - EATSPACEN(); - if (*p != '\n') + EATSPACE(); + if (*p) errx(1, - "%lu: %s: extra characters (%c) after \\ at the end of %c command", - linenum, fname, *p, cmd->code); - cmd->t = compile_text(&cmd->tlen); + "%lu: %s: extra characters after \\ at the end of %c command", + linenum, fname, cmd->code); + cmd->t = compile_text(); break; case COMMENT: /* \0 # */ break; case WFILE: /* w */ p++; EATSPACE(); if (*p == '\0') errx(1, "%lu: %s: filename expected", linenum, fname); - cmd->t = duptoeol(p, "w command", &cmd->tlen); + cmd->t = duptoeol(p, "w command"); if (aflag) cmd->u.fd = -1; - else if ((cmd->u.fd = open(cmd->t, + else if ((cmd->u.fd = open(p, O_WRONLY|O_APPEND|O_CREAT|O_TRUNC, DEFFILEMODE)) == -1) err(1, "%s", p); break; case RFILE: /* r */ p++; EATSPACE(); if (*p == '\0') errx(1, "%lu: %s: filename expected", linenum, fname); else - cmd->t = duptoeol(p, "read command", &cmd->tlen); + cmd->t = duptoeol(p, "read command"); break; case BRANCH: /* b t */ p++; - EATSPACEN(); - if (*p == '\0' || *p == '\n') + EATSPACE(); + if (*p == '\0') cmd->t = NULL; else - cmd->t = duptoeol(p, "branch", &cmd->tlen); + cmd->t = duptoeol(p, "branch"); break; case LABEL: /* : */ p++; EATSPACE(); - cmd->t = duptoeol(p, "label", &cmd->tlen); - if (cmd->t[0] == '\0') + cmd->t = duptoeol(p, "label"); + if (strlen(p) == 0) errx(1, "%lu: %s: empty label", linenum, fname); enterlabel(cmd); break; case SUBST: /* s */ p++; - if (*p == '\0' || *p == '\\' || *p == '\n') + if (*p == '\0' || *p == '\\') errx(1, "%lu: %s: substitute pattern can not be delimited by newline or backslash", linenum, fname); if ((cmd->u.s = calloc(1, sizeof(struct s_subst))) == NULL) err(1, "malloc"); p = compile_delimited(p, re, 0); if (p == NULL) errx(1, "%lu: %s: unterminated substitute pattern", linenum, fname); + /* Compile RE with no case sensitivity temporarily */ + if (*re == '\0') + cmd->u.s->re = NULL; + else + cmd->u.s->re = compile_re(re, 0); --p; p = compile_subst(p, cmd->u.s); p = compile_flags(p, cmd->u.s); - if (*re != '\0') + /* Recompile RE with case sensitivity from "I" flag if any */ + if (*re == '\0') + cmd->u.s->re = NULL; + else cmd->u.s->re = compile_re(re, cmd->u.s->icase); - EATSPACE(); - if (*p == ';') { p++; link = &cmd->next; goto semicolon; } break; case TR: /* y */ p++; p = compile_tr(p, &cmd->u.y); EATSPACE(); if (*p == ';') { p++; link = &cmd->next; goto semicolon; } if (*p) errx(1, "%lu: %s: extra text at the end of a transform command", linenum, fname); break; } } } /* * Get a delimited string. P points to the delimiter of the string; d points * to a buffer area. Newline and delimiter escapes are processed; other * escapes are ignored. * * Returns a pointer to the first character after the final delimiter or NULL * in the case of a non-terminated string. The character array d is filled * with the processed string. 
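The comment above states the contract for compile_delimited(). As a compact illustration, here is a stripped-down delimited-string scanner in the same spirit; unlike the real function it handles only the newline escape and an escaped delimiter, not bracket expressions.

#include <stdio.h>

/* Copy the string delimited by *p's first character into d; return a
 * pointer past the closing delimiter, or NULL if it never appears. */
static const char *
get_delimited(const char *p, char *d)
{
	char c;

	if ((c = *p++) == '\0')
		return (NULL);
	while (*p) {
		if (p[0] == '\\' && p[1] == c)
			p++;			/* escaped delimiter */
		else if (p[0] == '\\' && p[1] == 'n') {
			*d++ = '\n';		/* \n escape */
			p += 2;
			continue;
		} else if (*p == c) {
			*d = '\0';
			return (p + 1);		/* past final delimiter */
		}
		*d++ = *p++;
	}
	return (NULL);				/* unterminated */
}

int
main(void)
{
	char buf[64];
	const char *rest = get_delimited("/foo\\/bar\\n/g", buf);

	printf("rest=[%s]\n", rest != NULL ? rest : "(null)");
	return (0);
}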
*/ -static const char * -compile_delimited(const char *p, char *d, int is_tr) +static char * +compile_delimited(char *p, char *d, int is_tr) { char c; c = *p++; if (c == '\0') return (NULL); else if (c == '\\') errx(1, "%lu: %s: \\ can not be used as a string delimiter", linenum, fname); else if (c == '\n') errx(1, "%lu: %s: newline can not be used as a string delimiter", linenum, fname); while (*p) { if (*p == '[' && *p != c) { if ((d = compile_ccl(&p, d)) == NULL) errx(1, "%lu: %s: unbalanced brackets ([])", linenum, fname); continue; } else if (*p == '\\' && p[1] == '[') { *d++ = *p++; } else if (*p == '\\' && p[1] == c) p++; else if (*p == '\\' && p[1] == 'n') { *d++ = '\n'; p += 2; continue; } else if (*p == '\\' && p[1] == '\\') { if (is_tr) p++; else *d++ = *p++; } else if (*p == c) { *d = '\0'; return (p + 1); } *d++ = *p++; } return (NULL); } /* compile_ccl: expand a POSIX character class */ static char * -compile_ccl(const char **sp, char *t) +compile_ccl(char **sp, char *t) { int c, d; - const char *s = *sp; + char *s = *sp; *t++ = *s++; if (*s == '^') *t++ = *s++; if (*s == ']') *t++ = *s++; for (; *s && (*t = *s) != ']'; s++, t++) if (*s == '[' && ((d = *(s+1)) == '.' || d == ':' || d == '=')) { *++t = *++s, t++, s++; for (c = *s; (*t = *s) != ']' || c != d; s++, t++) if ((c = *s) == '\0') return NULL; } return (*s == ']') ? *sp = ++s, ++t : NULL; } /* * Compiles the regular expression in RE and returns a pointer to the compiled * regular expression. * Cflags are passed to regcomp. */ -static const regex_t * -compile_re(const char *re, int case_insensitive) +static regex_t * +compile_re(char *re, int case_insensitive) { regex_t *rep; int eval, flags; flags = rflags; if (case_insensitive) flags |= REG_ICASE; if ((rep = malloc(sizeof(regex_t))) == NULL) err(1, "malloc"); if ((eval = regcomp(rep, re, flags)) != 0) errx(1, "%lu: %s: RE error: %s", linenum, fname, strregerror(eval, rep)); if (maxnsub < rep->re_nsub) maxnsub = rep->re_nsub; return (rep); } /* * Compile the substitution string of a regular expression and set res to * point to a saved copy of it. Nsub is the number of parenthesized regular * expressions. */ -static const char * -compile_subst(const char *p, struct s_subst *s) +static char * +compile_subst(char *p, struct s_subst *s) { + static char lbuf[_POSIX2_LINE_MAX + 1]; int asize, size; u_char ref; char c, *text, *op, *sp; - int more = 0, sawesc = 0; + int more = 1, sawesc = 0; c = *p++; /* Terminator character */ if (c == '\0') return (NULL); s->maxbref = 0; s->linenum = linenum; asize = 2 * _POSIX2_LINE_MAX + 1; if ((text = malloc(asize)) == NULL) err(1, "malloc"); size = 0; do { op = sp = text + size; - for (; *p != '\0' && *p != '\n'; p++) { + for (; *p; p++) { if (*p == '\\' || sawesc) { /* * If this is a continuation from the last * buffer, we won't have a character to * skip over. */ if (sawesc) sawesc = 0; else p++; if (*p == '\0') { /* * This escaped character is continued * in the next part of the line. Note * this fact, then cause the loop to * exit w/ normal EOL case and reenter * above with the new buffer. 
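* For example, when an "s" command longer than the fixed-size line buffer is split directly after a backslash, the escaped character only arrives with the next buffer from cu_fgets(); sawesc carries the pending escape across that refill.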
*/ sawesc = 1; p--; - break; - } else if (*p == '\n') { - *sp++ = '\n'; - break; + continue; } else if (strchr("123456789", *p) != NULL) { *sp++ = '\\'; ref = *p - '0'; if (s->re != NULL && ref > s->re->re_nsub) errx(1, "%lu: %s: \\%c not defined in the RE", linenum, fname, *p); if (s->maxbref < ref) s->maxbref = ref; } else if (*p == '&' || *p == '\\') *sp++ = '\\'; } else if (*p == c) { if (*++p == '\0' && more) { - const char *nextp; - - nextp = cu_fgets(&more); - if (nextp != NULL) - p = nextp; + if (cu_fgets(lbuf, sizeof(lbuf), &more)) + p = lbuf; } *sp++ = '\0'; size += sp - op; if ((s->new = realloc(text, size)) == NULL) err(1, "realloc"); return (p); } else if (*p == '\n') { errx(1, "%lu: %s: unescaped newline inside substitute pattern", linenum, fname); /* NOTREACHED */ } *sp++ = *p; } size += sp - op; if (asize - size < _POSIX2_LINE_MAX + 1) { asize *= 2; if ((text = realloc(text, asize)) == NULL) err(1, "realloc"); } - } while ((p = cu_fgets(&more)) != NULL); + } while (cu_fgets(p = lbuf, sizeof(lbuf), &more) != NULL); errx(1, "%lu: %s: unterminated substitute in regular expression", linenum, fname); /* NOTREACHED */ } /* * Compile the flags of the s command */ -static const char * -compile_flags(const char *p, struct s_subst *s) +static char * +compile_flags(char *p, struct s_subst *s) { int gn; /* True if we have seen g or n */ unsigned long nval; - char *q; + char wfile[_POSIX2_LINE_MAX + 1], *q, *eq; s->n = 1; /* Default */ s->p = 0; s->wfile = NULL; s->wfd = -1; s->icase = 0; for (gn = 0;;) { - EATSPACEN(); /* EXTENSION */ + EATSPACE(); /* EXTENSION */ switch (*p) { case 'g': if (gn) errx(1, "%lu: %s: more than one number or 'g' in substitute flags", linenum, fname); gn = 1; s->n = 0; break; case '\0': case '\n': case ';': return (p); case 'p': s->p = 1; break; case 'i': case 'I': s->icase = 1; break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if (gn) errx(1, "%lu: %s: more than one number or 'g' in substitute flags", linenum, fname); gn = 1; errno = 0; - nval = strtol(p, &q, 10); + nval = strtol(p, &p, 10); if (errno == ERANGE || nval > INT_MAX) errx(1, "%lu: %s: overflow in the 'N' substitute flag", linenum, fname); s->n = nval; - p = q; - continue; + p--; + break; case 'w': p++; #ifdef HISTORIC_PRACTICE if (*p != ' ') { warnx("%lu: %s: space missing before w wfile", linenum, fname); return (p); } #endif EATSPACE(); - s->wfile = duptoeol(p, "w flag", NULL); - if (!aflag && (s->wfd = open(s->wfile, + q = wfile; + eq = wfile + sizeof(wfile) - 1; + while (*p) { + if (*p == '\n') + break; + if (q >= eq) + err(1, "wfile too long"); + *q++ = *p++; + } + *q = '\0'; + if (q == wfile) + errx(1, "%lu: %s: no wfile specified", linenum, fname); + s->wfile = strdup(wfile); + if (!aflag && (s->wfd = open(wfile, O_WRONLY|O_APPEND|O_CREAT|O_TRUNC, DEFFILEMODE)) == -1) - err(1, "%s", s->wfile); + err(1, "%s", wfile); return (p); default: - errx(1, "%lu: %s: bad flag in substitute command: '%c' (%.10s)", - linenum, fname, *p, p); + errx(1, "%lu: %s: bad flag in substitute command: '%c'", + linenum, fname, *p); break; } p++; } } /* * Compile a translation set of strings into a lookup table. 
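* For example, "y/abc/xyz/" fills bytetab so that 'a' maps to 'x', 'b' to 'y' and 'c' to 'z' while every other byte maps to itself, letting do_tr() translate the pattern space with one table lookup per byte (multibyte characters fall back to the multis vector).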
*/ -static const char * -compile_tr(const char *p, struct s_tr **py) +static char * +compile_tr(char *p, struct s_tr **py) { struct s_tr *y; int i; const char *op, *np; char old[_POSIX2_LINE_MAX + 1]; char new[_POSIX2_LINE_MAX + 1]; size_t oclen, oldlen, nclen, newlen; mbstate_t mbs1, mbs2; if ((*py = y = malloc(sizeof(*y))) == NULL) - err(1, "malloc"); + err(1, NULL); y->multis = NULL; y->nmultis = 0; if (*p == '\0' || *p == '\\') errx(1, "%lu: %s: transform pattern can not be delimited by newline or backslash", linenum, fname); p = compile_delimited(p, old, 1); if (p == NULL) errx(1, "%lu: %s: unterminated transform source string", linenum, fname); p = compile_delimited(p - 1, new, 1); if (p == NULL) errx(1, "%lu: %s: unterminated transform target string", linenum, fname); EATSPACE(); op = old; oldlen = mbsrtowcs(NULL, &op, 0, NULL); if (oldlen == (size_t)-1) - err(1, "mbsrtowcs"); + err(1, NULL); np = new; newlen = mbsrtowcs(NULL, &np, 0, NULL); if (newlen == (size_t)-1) - err(1, "mbsrtowcs"); + err(1, NULL); if (newlen != oldlen) errx(1, "%lu: %s: transform strings are not the same length", linenum, fname); if (MB_CUR_MAX == 1) { /* * The single-byte encoding case is easy: generate a * lookup table. */ for (i = 0; i <= UCHAR_MAX; i++) y->bytetab[i] = (char)i; for (; *op; op++, np++) y->bytetab[(u_char)*op] = *np; } else { /* * Multi-byte encoding case: generate a lookup table as * above, but only for single-byte characters. The first * bytes of multi-byte characters have their lookup table * entries set to 0, which causes do_tr() to search through * an auxiliary vector of multi-byte mappings. */ memset(&mbs1, 0, sizeof(mbs1)); memset(&mbs2, 0, sizeof(mbs2)); for (i = 0; i <= UCHAR_MAX; i++) y->bytetab[i] = (btowc(i) != WEOF) ? i : 0; while (*op != '\0') { oclen = mbrlen(op, MB_LEN_MAX, &mbs1); if (oclen == (size_t)-1 || oclen == (size_t)-2) errc(1, EILSEQ, NULL); nclen = mbrlen(np, MB_LEN_MAX, &mbs2); if (nclen == (size_t)-1 || nclen == (size_t)-2) errc(1, EILSEQ, NULL); if (oclen == 1 && nclen == 1) y->bytetab[(u_char)*op] = *np; else { y->bytetab[(u_char)*op] = 0; y->multis = realloc(y->multis, (y->nmultis + 1) * sizeof(*y->multis)); if (y->multis == NULL) - err(1, "realloc"); + err(1, NULL); i = y->nmultis++; y->multis[i].fromlen = oclen; memcpy(y->multis[i].from, op, oclen); y->multis[i].tolen = nclen; memcpy(y->multis[i].to, np, nclen); } op += oclen; np += nclen; } } return (p); } /* * Compile the text following an a, c, or i command. */ static char * -compile_text(size_t *ptlen) +compile_text(void) { int asize, esc_nl, size; - char *text, *s; - const char *p, *op; + char *text, *p, *op, *s; + char lbuf[_POSIX2_LINE_MAX + 1]; asize = 2 * _POSIX2_LINE_MAX + 1; if ((text = malloc(asize)) == NULL) err(1, "malloc"); size = 0; - while ((p = cu_fgets(NULL)) != NULL) { + while (cu_fgets(lbuf, sizeof(lbuf), NULL) != NULL) { op = s = text + size; + p = lbuf; for (esc_nl = 0; *p != '\0'; p++) { if (*p == '\\' && p[1] != '\0' && *++p == '\n') esc_nl = 1; *s++ = *p; - if (*p == '\n') - break; } size += s - op; if (!esc_nl) { *s = '\0'; break; } if (asize - size < _POSIX2_LINE_MAX + 1) { asize *= 2; if ((text = realloc(text, asize)) == NULL) err(1, "realloc"); } } text[size] = '\0'; - if ((text = realloc(text, size + 1)) == NULL) + if ((p = realloc(text, size + 1)) == NULL) err(1, "realloc"); - *ptlen = size; - return (text); + return (p); } /* * Get an address and return a pointer to the first character after * it. Fill the structure pointed to according to the address. 
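* For example, "/err/I" compiles to a case-insensitive AT_RE address, "$" to AT_LAST, "+2" to AT_RELLINE with u.l = 2, and a bare number such as "42" to AT_LINE.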
*/ -static const char * -compile_addr(const char *p, struct s_addr *a) +static char * +compile_addr(char *p, struct s_addr *a) { char *end, re[_POSIX2_LINE_MAX + 1]; int icase; icase = 0; a->type = 0; switch (*p) { case '\\': /* Context address */ ++p; /* FALLTHROUGH */ case '/': /* Context address */ p = compile_delimited(p, re, 0); if (p == NULL) errx(1, "%lu: %s: unterminated regular expression", linenum, fname); /* Check for case insensitive regexp flag */ if (*p == 'I') { icase = 1; p++; } if (*re == '\0') a->u.r = NULL; else a->u.r = compile_re(re, icase); a->type = AT_RE; return (p); case '$': /* Last line */ a->type = AT_LAST; return (p + 1); case '+': /* Relative line number */ a->type = AT_RELLINE; p++; /* FALLTHROUGH */ /* Line number */ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if (a->type == 0) a->type = AT_LINE; a->u.l = strtol(p, &end, 10); return (end); default: errx(1, "%lu: %s: expected context address", linenum, fname); return (NULL); } } /* * duptoeol -- * Return a copy of all the characters up to \n or \0. */ static char * -duptoeol(const char *s, const char *ctype, size_t *ptlen) +duptoeol(char *s, const char *ctype) { size_t len; int ws; - char *p; - const char *start; + char *p, *start; ws = 0; for (start = s; *s != '\0' && *s != '\n'; ++s) ws = isspace((unsigned char)*s); + *s = '\0'; if (ws) warnx("%lu: %s: whitespace after %s", linenum, fname, ctype); - len = s - start; - if ((p = malloc(len + 1)) == NULL) + len = s - start + 1; + if ((p = malloc(len)) == NULL) err(1, "malloc"); - memmove(p, start, len); - p[len] = '\0'; - if (ptlen != NULL) - *ptlen = len; - return p; + return (memmove(p, start, len)); } /* * Convert goto label names to addresses, and count a and r commands, in * the given subset of the script. Free the memory used by labels in b * and t commands (but not by :). * * TODO: Remove } nodes */ static void -fixuplabel(struct s_command *cp, const struct s_command *end) +fixuplabel(struct s_command *cp, struct s_command *end) { for (; cp != end; cp = cp->next) switch (cp->code) { case 'a': case 'r': appendnum++; break; case 'b': case 't': /* Resolve branch target. */ if (cp->t == NULL) { cp->u.c = NULL; break; } if ((cp->u.c = findlabel(cp->t)) == NULL) - errx(1, "%lu: %s: %c: undefined label '%s'", linenum, fname, cp->code, cp->t); + errx(1, "%lu: %s: undefined label '%s'", linenum, fname, cp->t); free(cp->t); break; case '{': /* Do interior commands. */ fixuplabel(cp->u.c, cp->next); break; } } /* * Associate the given command label for later lookup. */ static void enterlabel(struct s_command *cp) { struct labhash **lhp, *lh; u_char *p; u_int h, c; for (h = 0, p = (u_char *)cp->t; (c = *p) != 0; p++) h = (h << 5) + h + c; lhp = &labels[h & LHMASK]; for (lh = *lhp; lh != NULL; lh = lh->lh_next) if (lh->lh_hash == h && strcmp(cp->t, lh->lh_cmd->t) == 0) errx(1, "%lu: %s: duplicate label '%s'", linenum, fname, cp->t); if ((lh = malloc(sizeof *lh)) == NULL) err(1, "malloc"); lh->lh_next = *lhp; lh->lh_hash = h; lh->lh_cmd = cp; lh->lh_ref = 0; *lhp = lh; } /* * Find the command that the given label name refers to in the label hash * table. Return NULL if the label was never defined.
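* The lookup below uses the same multiplicative string hash as enterlabel() (h = h * 33 + c) over the LHSZ-bucket table, and marks the entry referenced so that uselabel() can warn about labels no b or t command ever reaches.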
*/ static struct s_command * -findlabel(const char *name) +findlabel(char *name) { struct labhash *lh; - const u_char *p; + u_char *p; u_int h, c; - for (h = 0, p = (const u_char *)name; (c = *p) != 0; p++) + for (h = 0, p = (u_char *)name; (c = *p) != 0; p++) h = (h << 5) + h + c; for (lh = labels[h & LHMASK]; lh != NULL; lh = lh->lh_next) { if (lh->lh_hash == h && strcmp(name, lh->lh_cmd->t) == 0) { lh->lh_ref = 1; return (lh->lh_cmd); } } return (NULL); } /* * Warn about any unused labels. As a side effect, release the label hash * table space. */ static void uselabel(void) { struct labhash *lh, *next; int i; for (i = 0; i < LHSZ; i++) { for (lh = labels[i]; lh != NULL; lh = next) { next = lh->lh_next; if (!lh->lh_ref) warnx("%lu: %s: unused label '%s'", linenum, fname, lh->lh_cmd->t); free(lh); } } } Index: user/alc/PQ_LAUNDRY/usr.bin/sed/defs.h =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/sed/defs.h (revision 303666) +++ user/alc/PQ_LAUNDRY/usr.bin/sed/defs.h (revision 303667) @@ -1,150 +1,149 @@ /*- * Copyright (c) 1992 Diomidis Spinellis. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Diomidis Spinellis of Imperial College, University of London. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)defs.h 8.1 (Berkeley) 6/6/93 * $FreeBSD$ */ /* * Types of address specifications */ enum e_atype { AT_RE = 1, /* Lines that match an RE */ AT_LINE, /* Specific line */ AT_RELLINE, /* Relative line */ AT_LAST, /* Last line */ }; /* * Format of an address */ struct s_addr { enum e_atype type; /* Address type */ union { u_long l; /* Line number */ - const regex_t *r; /* Regular expression */ + regex_t *r; /* Regular expression */ } u; }; /* * Substitution command */ struct s_subst { int n; /* Occurrence to subst.
*/ int p; /* True if p flag */ int icase; /* True if I flag */ char *wfile; /* NULL if no wfile */ int wfd; /* Cached file descriptor */ - const regex_t *re; /* Regular expression */ + regex_t *re; /* Regular expression */ unsigned int maxbref; /* Largest backreference. */ u_long linenum; /* Line number. */ char *new; /* Replacement text */ }; /* * Translate command. */ struct s_tr { unsigned char bytetab[256]; struct trmulti { size_t fromlen; char from[MB_LEN_MAX]; size_t tolen; char to[MB_LEN_MAX]; } *multis; int nmultis; }; /* * An internally compiled command. * Initially, label references are stored in t; on a second pass they * are updated to pointers. */ struct s_command { struct s_command *next; /* Pointer to next command */ struct s_addr *a1, *a2; /* Start and end address */ u_long startline; /* Start line number or zero */ char *t; /* Text for : a c i r w */ - size_t tlen; union { struct s_command *c; /* Command(s) for b t { */ struct s_subst *s; /* Substitute command */ struct s_tr *y; /* Replace command array */ int fd; /* File descriptor for w */ } u; char code; /* Command code */ u_int nonsel:1; /* True if ! */ }; /* * Types of command arguments recognised by the parser */ enum e_args { EMPTY, /* d D g G h H l n N p P q x = \0 */ TEXT, /* a c i */ NONSEL, /* ! */ GROUP, /* { */ ENDGROUP, /* } */ COMMENT, /* # */ BRANCH, /* b t */ LABEL, /* : */ RFILE, /* r */ WFILE, /* w */ SUBST, /* s */ TR /* y */ }; /* * Structure containing things to append before a line is read */ struct s_appends { enum {AP_STRING, AP_FILE} type; char *s; size_t len; }; enum e_spflag { APPEND, /* Append to the contents. */ REPLACE, /* Replace the contents. */ }; /* * Structure for a space (process, hold, otherwise). */ typedef struct { char *space; /* Current space pointer. */ size_t len; /* Current length. */ int deleted; /* If deleted. */ int append_newline; /* If originally terminated by \n. */ char *back; /* Backing memory. */ size_t blen; /* Backing memory length. */ } SPACE; Index: user/alc/PQ_LAUNDRY/usr.bin/sed/extern.h =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/sed/extern.h (revision 303666) +++ user/alc/PQ_LAUNDRY/usr.bin/sed/extern.h (revision 303667) @@ -1,56 +1,56 @@ /*- * Copyright (c) 1992 Diomidis Spinellis. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Diomidis Spinellis of Imperial College, University of London. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)extern.h 8.1 (Berkeley) 6/6/93 * $FreeBSD$ */ extern struct s_command *prog; extern struct s_appends *appends; extern regmatch_t *match; extern size_t maxnsub; extern u_long linenum; extern int appendnum; extern int aflag, eflag, nflag; extern const char *fname, *outfname; extern FILE *infile, *outfile; extern int rflags; /* regex flags to use */ -void cfclose(struct s_command *, const struct s_command *); +void cfclose(struct s_command *, struct s_command *); void compile(void); void cspace(SPACE *, const char *, size_t, enum e_spflag); -const char *cu_fgets(int *); +char *cu_fgets(char *, int, int *); int mf_fgets(SPACE *, enum e_spflag); int lastline(void); void process(void); void resetstate(void); -char *strregerror(int, const regex_t *); +char *strregerror(int, regex_t *); Index: user/alc/PQ_LAUNDRY/usr.bin/sed/main.c =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/sed/main.c (revision 303666) +++ user/alc/PQ_LAUNDRY/usr.bin/sed/main.c (revision 303667) @@ -1,532 +1,542 @@ /*- * Copyright (c) 2013 Johann 'Myrkraverk' Oskarsson. * Copyright (c) 1992 Diomidis Spinellis. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Diomidis Spinellis of Imperial College, University of London. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1992, 1993\n\ The Regents of the University of California. 
All rights reserved.\n"; #endif #ifndef lint static const char sccsid[] = "@(#)main.c 8.2 (Berkeley) 1/3/94"; #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "defs.h" #include "extern.h" /* * Linked list of units (strings and files) to be compiled */ struct s_compunit { struct s_compunit *next; enum e_cut {CU_FILE, CU_STRING} type; - const char *s; /* Pointer to string or fname */ + char *s; /* Pointer to string or fname */ }; /* * Linked list pointer to compilation units and pointer to current * next pointer. */ static struct s_compunit *script, **cu_nextp = &script; /* * Linked list of files to be processed */ struct s_flist { - const char *fname; + char *fname; struct s_flist *next; }; /* * Linked list pointer to files and pointer to current * next pointer. */ static struct s_flist *files, **fl_nextp = &files; FILE *infile; /* Current input file */ FILE *outfile; /* Current output file */ int aflag, eflag, nflag; int rflags = 0; static int rval; /* Exit status */ static int ispan; /* Whether inplace editing spans across files */ /* * Current file and line number; line numbers restart across compilation * units, but span across input files. The latter is optional if editing * in place. */ const char *fname; /* File name. */ const char *outfname; /* Output file name */ static char oldfname[PATH_MAX]; /* Old file name (for in-place editing) */ static char tmpfname[PATH_MAX]; /* Temporary file name (for in-place editing) */ static const char *inplace; /* Inplace edit file extension. */ u_long linenum; -static void add_compunit(enum e_cut, const char *); -static void add_file(const char *); +static void add_compunit(enum e_cut, char *); +static void add_file(char *); static void usage(void); int main(int argc, char *argv[]) { - char *temp_arg; int c, fflag; + char *temp_arg; (void) setlocale(LC_ALL, ""); fflag = 0; inplace = NULL; while ((c = getopt(argc, argv, "EI:ae:f:i:lnru")) != -1) switch (c) { case 'r': /* Gnu sed compat */ case 'E': rflags = REG_EXTENDED; break; case 'I': inplace = optarg; ispan = 1; /* span across input files */ break; case 'a': aflag = 1; break; case 'e': eflag = 1; - asprintf(&temp_arg, "%s\n", optarg); - if (temp_arg == NULL) - errx(1, "Couldn't allocate temporary buffer"); + if ((temp_arg = malloc(strlen(optarg) + 2)) == NULL) + err(1, "malloc"); + strcpy(temp_arg, optarg); + strcat(temp_arg, "\n"); add_compunit(CU_STRING, temp_arg); break; case 'f': fflag = 1; add_compunit(CU_FILE, optarg); break; case 'i': inplace = optarg; ispan = 0; /* don't span across input files */ break; case 'l': if(setvbuf(stdout, NULL, _IOLBF, 0) != 0) warnx("setting line buffered output failed"); break; case 'n': nflag = 1; break; case 'u': if(setvbuf(stdout, NULL, _IONBF, 0) != 0) warnx("setting unbuffered output failed"); break; default: case '?': usage(); } argc -= optind; argv += optind; /* First usage case; script is the first arg */ if (!eflag && !fflag && *argv) { add_compunit(CU_STRING, *argv); argv++; } compile(); /* Continue with first and start second usage */ if (*argv) for (; *argv; argv++) add_file(*argv); else add_file(NULL); process(); cfclose(prog, NULL); if (fclose(stdout)) err(1, "stdout"); exit(rval); } static void usage(void) { (void)fprintf(stderr, "usage: %s script [-Ealnru] [-i extension] [file ...]\n" "\t%s [-Ealnu] [-i extension] [-e script] ... [-f script_file]" " ... 
[file ...]\n", getprogname(), getprogname()); exit(1); } /* * Like fgets, but go through the chain of compilation units chaining them * together. Empty strings and files are ignored. */ -const char * -cu_fgets(int *more) +char * +cu_fgets(char *buf, int n, int *more) { static enum {ST_EOF, ST_FILE, ST_STRING} state = ST_EOF; static FILE *f; /* Current open file */ - static const char *s; /* Current pointer inside string */ - static char string_ident[30], *lastresult; - static size_t lastsize; + static char *s; /* Current pointer inside string */ + static char string_ident[30]; char *p; - const char *start; again: switch (state) { case ST_EOF: if (script == NULL) { if (more != NULL) *more = 0; return (NULL); } linenum = 0; switch (script->type) { case CU_FILE: if ((f = fopen(script->s, "r")) == NULL) err(1, "%s", script->s); fname = script->s; state = ST_FILE; goto again; case CU_STRING: if (((size_t)snprintf(string_ident, sizeof(string_ident), "\"%s\"", script->s)) >= sizeof(string_ident) - 1) (void)strcpy(string_ident + sizeof(string_ident) - 6, " ...\""); fname = string_ident; s = script->s; state = ST_STRING; goto again; } case ST_FILE: - p = lastresult; - if (getline(&p, &lastsize, f) != -1) { + if ((p = fgets(buf, n, f)) != NULL) { linenum++; - if (linenum == 1 && p[0] == '#' && p[1] == 'n') + if (linenum == 1 && buf[0] == '#' && buf[1] == 'n') nflag = 1; if (more != NULL) *more = !feof(f); - return (lastresult = p); - } else if (ferror(f)) - err(1, "%s", script->s); + return (p); + } script = script->next; (void)fclose(f); state = ST_EOF; goto again; case ST_STRING: if (linenum == 0 && s[0] == '#' && s[1] == 'n') nflag = 1; - else if (s[0] == '\0') { - state = ST_EOF; - script = script->next; - goto again; - } - start = s; + p = buf; for (;;) { + if (n-- <= 1) { + *p = '\0'; + linenum++; + if (more != NULL) + *more = 1; + return (buf); + } switch (*s) { case '\0': state = ST_EOF; - script = script->next; - /* FALLTHROUGH */ + if (s == script->s) { + script = script->next; + goto again; + } else { + script = script->next; + *p = '\0'; + linenum++; + if (more != NULL) + *more = 0; + return (buf); + } case '\n': + *p++ = '\n'; + *p = '\0'; s++; linenum++; if (more != NULL) *more = 0; - return (start); + return (buf); default: - s++; + *p++ = *s++; } } } /* NOTREACHED */ return (NULL); } /* * Like fgets, but go through the list of files chaining them together. * Set len to the length of the line. */ int mf_fgets(SPACE *sp, enum e_spflag spflag) { struct stat sb; ssize_t len; char *dirbuf, *basebuf; static char *p = NULL; static size_t plen = 0; int c; static int firstfile; if (infile == NULL) { /* stdin? */ if (files->fname == NULL) { if (inplace != NULL) errx(1, "-I or -i may not be used with stdin"); infile = stdin; fname = "stdin"; outfile = stdout; outfname = "stdout"; } firstfile = 1; } for (;;) { if (infile != NULL && (c = getc(infile)) != EOF) { (void)ungetc(c, infile); break; } /* If we are here then either eof or no files are open yet */ if (infile == stdin) { sp->len = 0; return (0); } if (infile != NULL) { fclose(infile); if (*oldfname != '\0') { /* if there was a backup file, remove it */ unlink(oldfname); /* * Backup the original. Note that hard links * are not supported on all filesystems. 
*/ if ((link(fname, oldfname) != 0) && (rename(fname, oldfname) != 0)) { warn("rename()"); if (*tmpfname) unlink(tmpfname); exit(1); } *oldfname = '\0'; } if (*tmpfname != '\0') { if (outfile != NULL && outfile != stdout) if (fclose(outfile) != 0) { warn("fclose()"); unlink(tmpfname); exit(1); } outfile = NULL; if (rename(tmpfname, fname) != 0) { /* this really should not happen! */ warn("rename()"); unlink(tmpfname); exit(1); } *tmpfname = '\0'; } outfname = NULL; } if (firstfile == 0) files = files->next; else firstfile = 0; if (files == NULL) { sp->len = 0; return (0); } fname = files->fname; if (inplace != NULL) { if (lstat(fname, &sb) != 0) err(1, "%s", fname); if (!(sb.st_mode & S_IFREG)) errx(1, "%s: %s %s", fname, "in-place editing only", "works for regular files"); if (*inplace != '\0') { strlcpy(oldfname, fname, sizeof(oldfname)); len = strlcat(oldfname, inplace, sizeof(oldfname)); - if ((size_t)len > sizeof(oldfname)) + if (len > (ssize_t)sizeof(oldfname)) errx(1, "%s: name too long", fname); } if ((dirbuf = strdup(fname)) == NULL || (basebuf = strdup(fname)) == NULL) err(1, "strdup"); len = snprintf(tmpfname, sizeof(tmpfname), "%s/.!%ld!%s", dirname(dirbuf), (long)getpid(), basename(basebuf)); free(dirbuf); free(basebuf); - if ((size_t)len >= sizeof(tmpfname)) + if (len >= (ssize_t)sizeof(tmpfname)) errx(1, "%s: name too long", fname); unlink(tmpfname); if (outfile != NULL && outfile != stdout) fclose(outfile); if ((outfile = fopen(tmpfname, "w")) == NULL) err(1, "%s", fname); fchown(fileno(outfile), sb.st_uid, sb.st_gid); fchmod(fileno(outfile), sb.st_mode & ALLPERMS); outfname = tmpfname; if (!ispan) { linenum = 0; resetstate(); } } else { outfile = stdout; outfname = "stdout"; } if ((infile = fopen(fname, "r")) == NULL) { warn("%s", fname); rval = 1; continue; } } /* * We are here only when infile is open and we still have something * to read from it. * * Use getline() so that we can handle essentially infinite input * data. The p and plen are static so each invocation gives * getline() the same buffer which is expanded as needed. */ len = getline(&p, &plen, infile); if (len == -1) err(1, "%s", fname); if (len != 0 && p[len - 1] == '\n') { sp->append_newline = 1; len--; } else if (!lastline()) { sp->append_newline = 1; } else { sp->append_newline = 0; } cspace(sp, p, len, spflag); linenum++; return (1); } /* * Add a compilation unit to the linked list */ static void -add_compunit(enum e_cut type, const char *s) +add_compunit(enum e_cut type, char *s) { struct s_compunit *cu; if ((cu = malloc(sizeof(struct s_compunit))) == NULL) err(1, "malloc"); cu->type = type; cu->s = s; cu->next = NULL; *cu_nextp = cu; cu_nextp = &cu->next; } /* * Add a file to the linked list */ static void -add_file(const char *s) +add_file(char *s) { struct s_flist *fp; if ((fp = malloc(sizeof(struct s_flist))) == NULL) err(1, "malloc"); fp->next = NULL; *fl_nextp = fp; fp->fname = s; fl_nextp = &fp->next; } static int next_files_have_lines(void) { struct s_flist *file; FILE *file_fd; int ch; file = files; while ((file = file->next) != NULL) { if ((file_fd = fopen(file->fname, "r")) == NULL) continue; if ((ch = getc(file_fd)) != EOF) { /* * This next file has content, therefore the current * file doesn't contain the last line.
*/ ungetc(ch, file_fd); fclose(file_fd); return (1); } fclose(file_fd); } return (0); } int lastline(void) { int ch; if (feof(infile)) { return !( (inplace == NULL || ispan) && next_files_have_lines()); } if ((ch = getc(infile)) == EOF) { return !( (inplace == NULL || ispan) && next_files_have_lines()); } ungetc(ch, infile); return (0); } Index: user/alc/PQ_LAUNDRY/usr.bin/sed/misc.c =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/sed/misc.c (revision 303666) +++ user/alc/PQ_LAUNDRY/usr.bin/sed/misc.c (revision 303667) @@ -1,69 +1,71 @@ /*- * Copyright (c) 1992 Diomidis Spinellis. * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Diomidis Spinellis of Imperial College, University of London. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifndef lint static const char sccsid[] = "@(#)misc.c 8.1 (Berkeley) 6/6/93"; #endif #include #include #include #include #include #include #include #include "defs.h" #include "extern.h" /* * Return a string for a regular expression error passed. This is overkill, * because of the silly semantics of regerror (we can never know the size of * the buffer). */ char * -strregerror(int errcode, const regex_t *preg) +strregerror(int errcode, regex_t *preg) { static char *oe; size_t s; + if (oe != NULL) + free(oe); s = regerror(errcode, preg, NULL, 0); - if ((oe = realloc(oe, s)) == NULL) - err(1, "realloc"); + if ((oe = malloc(s)) == NULL) + err(1, "malloc"); (void)regerror(errcode, preg, oe, s); return (oe); } Index: user/alc/PQ_LAUNDRY/usr.bin/sed/process.c =================================================================== --- user/alc/PQ_LAUNDRY/usr.bin/sed/process.c (revision 303666) +++ user/alc/PQ_LAUNDRY/usr.bin/sed/process.c (revision 303667) @@ -1,785 +1,785 @@ /*- * Copyright (c) 1992 Diomidis Spinellis. * Copyright (c) 1992, 1993, 1994 * The Regents of the University of California. All rights reserved. 
* * This code is derived from software contributed to Berkeley by * Diomidis Spinellis of Imperial College, University of London. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifndef lint static const char sccsid[] = "@(#)process.c 8.6 (Berkeley) 4/20/94"; #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "defs.h" #include "extern.h" static SPACE HS, PS, SS, YS; #define pd PS.deleted #define ps PS.space #define psl PS.len #define psanl PS.append_newline #define hs HS.space #define hsl HS.len -static inline int applies(struct s_command *); -static void do_tr(const struct s_tr *); -static void flush_appends(void); -static void lputs(const char *, size_t); -static int regexec_e(const regex_t *, const char *, int, int, - size_t, size_t); -static void regsub(SPACE *, const char *, const char *); -static int substitute(const struct s_command *); +static inline int applies(struct s_command *); +static void do_tr(struct s_tr *); +static void flush_appends(void); +static void lputs(char *, size_t); +static int regexec_e(regex_t *, const char *, int, int, size_t, + size_t); +static void regsub(SPACE *, char *, char *); +static int substitute(struct s_command *); struct s_appends *appends; /* Array of pointers to strings to append. */ static int appendx; /* Index into appends array. */ int appendnum; /* Size of appends array. */ static int lastaddr; /* Set by applies if last address of a range. */ static int sdone; /* If any substitutes since last line input. */ /* Iov structure for 'w' commands. 
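* defpreg below caches the most recently used RE so that an empty pattern, as in "s//replacement/", reuses the last one supplied; regexec_e() fails if no RE has been seen yet.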
*/ -static const regex_t *defpreg; +static regex_t *defpreg; size_t maxnsub; regmatch_t *match; #define OUT() do { \ fwrite(ps, 1, psl, outfile); \ if (psanl) fputc('\n', outfile); \ } while (0) void process(void) { struct s_command *cp; SPACE tspace; size_t oldpsl; char *p; int oldpsanl; p = NULL; oldpsanl = oldpsl = 0; for (linenum = 0; mf_fgets(&PS, REPLACE);) { pd = 0; top: cp = prog; redirect: while (cp != NULL) { if (!applies(cp)) { cp = cp->next; continue; } switch (cp->code) { case '{': cp = cp->u.c; goto redirect; case 'a': if (appendx >= appendnum) if ((appends = realloc(appends, sizeof(struct s_appends) * (appendnum *= 2))) == NULL) err(1, "realloc"); appends[appendx].type = AP_STRING; appends[appendx].s = cp->t; appends[appendx].len = strlen(cp->t); appendx++; break; case 'b': cp = cp->u.c; goto redirect; case 'c': pd = 1; psl = 0; if (cp->a2 == NULL || lastaddr || lastline()) (void)fprintf(outfile, "%s", cp->t); break; case 'd': pd = 1; goto new; case 'D': if (pd) goto new; if (psl == 0 || (p = memchr(ps, '\n', psl)) == NULL) { pd = 1; goto new; } else { psl -= (p + 1) - ps; memmove(ps, p + 1, psl); goto top; } case 'g': cspace(&PS, hs, hsl, REPLACE); break; case 'G': cspace(&PS, "\n", 1, APPEND); cspace(&PS, hs, hsl, APPEND); break; case 'h': cspace(&HS, ps, psl, REPLACE); break; case 'H': cspace(&HS, "\n", 1, APPEND); cspace(&HS, ps, psl, APPEND); break; case 'i': (void)fprintf(outfile, "%s", cp->t); break; case 'l': lputs(ps, psl); break; case 'n': if (!nflag && !pd) OUT(); flush_appends(); if (!mf_fgets(&PS, REPLACE)) exit(0); pd = 0; break; case 'N': flush_appends(); cspace(&PS, "\n", 1, APPEND); if (!mf_fgets(&PS, APPEND)) exit(0); break; case 'p': if (pd) break; OUT(); break; case 'P': if (pd) break; if ((p = memchr(ps, '\n', psl)) != NULL) { oldpsl = psl; oldpsanl = psanl; psl = p - ps; psanl = 1; } OUT(); if (p != NULL) { psl = oldpsl; psanl = oldpsanl; } break; case 'q': if (!nflag && !pd) OUT(); flush_appends(); exit(0); case 'r': if (appendx >= appendnum) if ((appends = realloc(appends, sizeof(struct s_appends) * (appendnum *= 2))) == NULL) err(1, "realloc"); appends[appendx].type = AP_FILE; appends[appendx].s = cp->t; appends[appendx].len = strlen(cp->t); appendx++; break; case 's': sdone |= substitute(cp); break; case 't': if (sdone) { sdone = 0; cp = cp->u.c; goto redirect; } break; case 'w': if (pd) break; if (cp->u.fd == -1 && (cp->u.fd = open(cp->t, O_WRONLY|O_APPEND|O_CREAT|O_TRUNC, DEFFILEMODE)) == -1) err(1, "%s", cp->t); if (write(cp->u.fd, ps, psl) != (ssize_t)psl || write(cp->u.fd, "\n", 1) != 1) err(1, "%s", cp->t); break; case 'x': /* * If the hold space is null, make it empty * but not null. Otherwise the pattern space * will become null after the swap, which is * an abnormal condition. */ if (hs == NULL) cspace(&HS, "", 0, REPLACE); tspace = PS; PS = HS; psanl = tspace.append_newline; HS = tspace; break; case 'y': if (pd || psl == 0) break; do_tr(cp->u.y); break; case ':': case '}': break; case '=': (void)fprintf(outfile, "%lu\n", linenum); } cp = cp->next; } /* for all cp */ new: if (!nflag && !pd) OUT(); flush_appends(); } /* for all lines */ } /* * TRUE if the address passed matches the current program state * (lastline, linenumber, ps). */ #define MATCH(a) \ ((a)->type == AT_RE ? regexec_e((a)->u.r, ps, 0, 1, 0, psl) : \ (a)->type == AT_LINE ? linenum == (a)->u.l : lastline()) /* * Return TRUE if the command applies to the current line. Sets the start * line for process ranges. Interprets the non-select (``!'') flag. 
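* For example, "/begin/,/end/p" records the line number in cp->startline when a1 first matches and keeps returning true until a2 matches (setting lastaddr on that closing line), while a trailing '!' as in "2!d" inverts the result via the nonsel bit.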
*/ static inline int applies(struct s_command *cp) { int r; lastaddr = 0; if (cp->a1 == NULL && cp->a2 == NULL) r = 1; else if (cp->a2) if (cp->startline > 0) { switch (cp->a2->type) { case AT_RELLINE: if (linenum - cp->startline <= cp->a2->u.l) r = 1; else { cp->startline = 0; r = 0; } break; default: if (MATCH(cp->a2)) { cp->startline = 0; lastaddr = 1; r = 1; } else if (cp->a2->type == AT_LINE && linenum > cp->a2->u.l) { /* * We missed the 2nd address due to a * branch, so just close the range and * return false. */ cp->startline = 0; r = 0; } else r = 1; } } else if (cp->a1 && MATCH(cp->a1)) { /* * If the second address is a number less than or * equal to the line number first selected, only * one line shall be selected. * -- POSIX 1003.2 * Likewise if the relative second line address is zero. */ if ((cp->a2->type == AT_LINE && linenum >= cp->a2->u.l) || (cp->a2->type == AT_RELLINE && cp->a2->u.l == 0)) lastaddr = 1; else { cp->startline = linenum; } r = 1; } else r = 0; else r = MATCH(cp->a1); return (cp->nonsel ? ! r : r); } /* * Reset the sed processor to its initial state. */ void resetstate(void) { struct s_command *cp; /* * Reset all in-range markers. */ for (cp = prog; cp; cp = cp->code == '{' ? cp->u.c : cp->next) if (cp->a2) cp->startline = 0; /* * Clear out the hold space. */ cspace(&HS, "", 0, REPLACE); } /* * substitute -- * Do substitutions in the pattern space. Currently, we build a * copy of the new pattern space in the substitute space structure * and then swap them. */ static int -substitute(const struct s_command *cp) +substitute(struct s_command *cp) { SPACE tspace; - const regex_t *re; + regex_t *re; regoff_t slen; int lastempty, n; - regoff_t le = 0; + size_t le = 0; char *s; s = ps; re = cp->u.s->re; if (re == NULL) { if (defpreg != NULL && cp->u.s->maxbref > defpreg->re_nsub) { linenum = cp->u.s->linenum; errx(1, "%lu: %s: \\%u not defined in the RE", linenum, fname, cp->u.s->maxbref); } } if (!regexec_e(re, ps, 0, 0, 0, psl)) return (0); SS.len = 0; /* Clean substitute space. */ slen = psl; n = cp->u.s->n; lastempty = 1; do { /* Copy the leading retained string. */ if (n <= 1 && (match[0].rm_so > le)) cspace(&SS, s, match[0].rm_so - le, APPEND); /* Skip zero-length matches right after other matches. */ if (lastempty || (match[0].rm_so - le) || match[0].rm_so != match[0].rm_eo) { if (n <= 1) { /* Want this match: append replacement. */ regsub(&SS, ps, cp->u.s->new); if (n == 1) n = -1; } else { /* Want a later match: append original. */ if (match[0].rm_eo - le) cspace(&SS, s, match[0].rm_eo - le, APPEND); n--; } } /* Move past this match. */ s = ps + match[0].rm_eo; slen = psl - match[0].rm_eo; le = match[0].rm_eo; /* * After a zero-length match, advance one byte, * and at the end of the line, terminate. */ if (match[0].rm_so == match[0].rm_eo) { if (*s == '\0' || *s == '\n') slen = -1; else slen--; if (*s != '\0') { cspace(&SS, s++, 1, APPEND); le++; } lastempty = 1; } else lastempty = 0; } while (n >= 0 && slen >= 0 && regexec_e(re, ps, REG_NOTBOL, 0, le, psl)); /* Did not find the requested number of matches. */ if (n > 0) return (0); /* Copy the trailing retained string. */ if (slen > 0) cspace(&SS, s, slen, APPEND); /* * Swap the substitute space and the pattern space, and make sure * that any leftover pointers into stdio memory get lost. */ tspace = PS; PS = SS; psanl = tspace.append_newline; SS = tspace; SS.space = SS.back; /* Handle the 'p' flag. */ if (cp->u.s->p) OUT(); /* Handle the 'w' flag. 
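* For example, "s/a/b/w file" appends each substituted pattern space to the named file; the descriptor is opened once on first use and cached in wfd.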
*/ if (cp->u.s->wfile && !pd) { if (cp->u.s->wfd == -1 && (cp->u.s->wfd = open(cp->u.s->wfile, O_WRONLY|O_APPEND|O_CREAT|O_TRUNC, DEFFILEMODE)) == -1) err(1, "%s", cp->u.s->wfile); if (write(cp->u.s->wfd, ps, psl) != (ssize_t)psl || write(cp->u.s->wfd, "\n", 1) != 1) err(1, "%s", cp->u.s->wfile); } return (1); } /* * do_tr -- * Perform translation ('y' command) in the pattern space. */ static void -do_tr(const struct s_tr *y) +do_tr(struct s_tr *y) { SPACE tmp; char c, *p; size_t clen, left; int i; if (MB_CUR_MAX == 1) { /* * Single-byte encoding: perform in-place translation * of the pattern space. */ for (p = ps; p < &ps[psl]; p++) *p = y->bytetab[(u_char)*p]; } else { /* * Multi-byte encoding: perform translation into the * translation space, then swap the translation and * pattern spaces. */ /* Clean translation space. */ YS.len = 0; for (p = ps, left = psl; left > 0; p += clen, left -= clen) { if ((c = y->bytetab[(u_char)*p]) != '\0') { cspace(&YS, &c, 1, APPEND); clen = 1; continue; } for (i = 0; i < y->nmultis; i++) if (left >= y->multis[i].fromlen && memcmp(p, y->multis[i].from, y->multis[i].fromlen) == 0) break; if (i < y->nmultis) { cspace(&YS, y->multis[i].to, y->multis[i].tolen, APPEND); clen = y->multis[i].fromlen; } else { cspace(&YS, p, 1, APPEND); clen = 1; } } /* Swap the translation space and the pattern space. */ tmp = PS; PS = YS; psanl = tmp.append_newline; YS = tmp; YS.space = YS.back; } } /* * Flush append requests. Always called before reading a line, * therefore it also resets the substitution done (sdone) flag. */ static void flush_appends(void) { FILE *f; int count, i; char buf[8 * 1024]; for (i = 0; i < appendx; i++) switch (appends[i].type) { case AP_STRING: fwrite(appends[i].s, sizeof(char), appends[i].len, outfile); break; case AP_FILE: /* * Read files probably shouldn't be cached. Since * it's not an error to read a non-existent file, * it's possible that another program is interacting * with the sed script through the filesystem. It * would be truly bizarre, but possible. It's probably * not that big a performance win, anyhow. */ if ((f = fopen(appends[i].s, "r")) == NULL) break; while ((count = fread(buf, sizeof(char), sizeof(buf), f))) (void)fwrite(buf, sizeof(char), count, outfile); (void)fclose(f); break; } if (ferror(outfile)) errx(1, "%s: %s", outfname, strerror(errno ? 
errno : EIO)); appendx = sdone = 0; } static void -lputs(const char *s, size_t len) +lputs(char *s, size_t len) { static const char escapes[] = "\\\a\b\f\r\t\v"; int c, col, width; const char *p; struct winsize win; static int termwidth = -1; size_t clen, i; wchar_t wc; mbstate_t mbs; if (outfile != stdout) termwidth = 60; if (termwidth == -1) { if ((p = getenv("COLUMNS")) && *p != '\0') termwidth = atoi(p); else if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &win) == 0 && win.ws_col > 0) termwidth = win.ws_col; else termwidth = 60; } if (termwidth <= 0) termwidth = 1; memset(&mbs, 0, sizeof(mbs)); col = 0; while (len != 0) { clen = mbrtowc(&wc, s, len, &mbs); if (clen == 0) clen = 1; if (clen == (size_t)-1 || clen == (size_t)-2) { wc = (unsigned char)*s; clen = 1; memset(&mbs, 0, sizeof(mbs)); } if (wc == '\n') { if (col + 1 >= termwidth) fprintf(outfile, "\\\n"); fputc('$', outfile); fputc('\n', outfile); col = 0; } else if (iswprint(wc)) { width = wcwidth(wc); if (col + width >= termwidth) { fprintf(outfile, "\\\n"); col = 0; } fwrite(s, 1, clen, outfile); col += width; } else if (wc != L'\0' && (c = wctob(wc)) != EOF && (p = strchr(escapes, c)) != NULL) { if (col + 2 >= termwidth) { fprintf(outfile, "\\\n"); col = 0; } fprintf(outfile, "\\%c", "\\abfrtv"[p - escapes]); col += 2; } else { if (col + 4 * clen >= (unsigned)termwidth) { fprintf(outfile, "\\\n"); col = 0; } for (i = 0; i < clen; i++) fprintf(outfile, "\\%03o", (int)(unsigned char)s[i]); col += 4 * clen; } s += clen; len -= clen; } if (col + 1 >= termwidth) fprintf(outfile, "\\\n"); (void)fputc('$', outfile); (void)fputc('\n', outfile); if (ferror(outfile)) errx(1, "%s: %s", outfname, strerror(errno ? errno : EIO)); } static int -regexec_e(const regex_t *preg, const char *string, int eflags, int nomatch, +regexec_e(regex_t *preg, const char *string, int eflags, int nomatch, size_t start, size_t stop) { int eval; if (preg == NULL) { if (defpreg == NULL) errx(1, "first RE may not be empty"); } else defpreg = preg; /* Set anchors */ match[0].rm_so = start; match[0].rm_eo = stop; eval = regexec(defpreg, string, nomatch ? 0 : maxnsub + 1, match, eflags | REG_STARTEND); switch(eval) { case 0: return (1); case REG_NOMATCH: return (0); } errx(1, "RE error: %s", strregerror(eval, defpreg)); /* NOTREACHED */ } /* * regsub - perform substitutions after a regexp match * Based on a routine by Henry Spencer */ static void -regsub(SPACE *sp, const char *string, const char *src) +regsub(SPACE *sp, char *string, char *src) { int len, no; char c, *dst; #define NEEDSP(reqlen) \ /* XXX What is the +1 for? */ \ if (sp->len + (reqlen) + 1 >= sp->blen) { \ sp->blen += (reqlen) + 1024; \ if ((sp->space = sp->back = realloc(sp->back, sp->blen)) \ == NULL) \ err(1, "realloc"); \ dst = sp->space + sp->len; \ } dst = sp->space + sp->len; while ((c = *src++) != '\0') { if (c == '&') no = 0; else if (c == '\\' && isdigit((unsigned char)*src)) no = *src++ - '0'; else no = -1; if (no < 0) { /* Ordinary character. */ if (c == '\\' && (*src == '\\' || *src == '&')) c = *src++; NEEDSP(1); *dst++ = c; ++sp->len; } else if (match[no].rm_so != -1 && match[no].rm_eo != -1) { len = match[no].rm_eo - match[no].rm_so; NEEDSP(len); memmove(dst, string + match[no].rm_so, len); dst += len; sp->len += len; } } NEEDSP(1); *dst = '\0'; } /* * cspace -- * Concatenate space: append the source space to the destination space, * allocating new space as necessary. 
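* For example, appending to a full backing buffer grows blen to the required total plus 1024 before the memmove(), so a run of small appends reallocates roughly once per kilobyte.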
*/ void cspace(SPACE *sp, const char *p, size_t len, enum e_spflag spflag) { size_t tlen; /* Make sure SPACE has enough memory and ramp up quickly. */ tlen = sp->len + len + 1; if (tlen > sp->blen) { sp->blen = tlen + 1024; if ((sp->space = sp->back = realloc(sp->back, sp->blen)) == NULL) err(1, "realloc"); } if (spflag == REPLACE) sp->len = 0; memmove(sp->space + sp->len, p, len); sp->space[sp->len += len] = '\0'; } /* * Close all cached opened files and report any errors */ void -cfclose(struct s_command *cp, const struct s_command *end) +cfclose(struct s_command *cp, struct s_command *end) { for (; cp != end; cp = cp->next) switch(cp->code) { case 's': if (cp->u.s->wfd != -1 && close(cp->u.s->wfd)) err(1, "%s", cp->u.s->wfile); cp->u.s->wfd = -1; break; case 'w': if (cp->u.fd != -1 && close(cp->u.fd)) err(1, "%s", cp->t); cp->u.fd = -1; break; case '{': cfclose(cp->u.c, cp->next); break; } } Index: user/alc/PQ_LAUNDRY =================================================================== --- user/alc/PQ_LAUNDRY (revision 303666) +++ user/alc/PQ_LAUNDRY (revision 303667) Property changes on: user/alc/PQ_LAUNDRY ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r303654-303666