Changeset View
Standalone View
sys/kern/kern_jail.c
Context not available. | |||||
struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison); | struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison); | ||||
LIST_HEAD(, prison_racct) allprison_racct; | LIST_HEAD(, prison_racct) allprison_racct; | ||||
int lastprid = 0; | int lastprid = 0; | ||||
int lastdeadid = 0; | |||||
static int get_next_prid(struct prison **insprp); | static int get_next_prid(struct prison **insprp); | ||||
static int get_next_deadid(struct prison **insprp); | |||||
static int do_jail_attach(struct thread *td, struct prison *pr, int drflags); | static int do_jail_attach(struct thread *td, struct prison *pr, int drflags); | ||||
static void prison_complete(void *context, int pending); | static void prison_complete(void *context, int pending); | ||||
static void prison_deref(struct prison *pr, int flags); | static void prison_deref(struct prison *pr, int flags); | ||||
bz: You could also just add PD_KILL at 0x20 and leave the other flags on the same bits? | |||||
Done Inline ActionsThe reason I did it this way was to keep a bit of separation between the action flags and the current-state flags. In fact, if I'm changing those numbers, it might even be better to start PD_LOCKED at 0x10 or even 0x100. jamie: The reason I did it this way was to keep a bit of separation between the action flags and the… | |||||
Context not available. | |||||
#endif | #endif | ||||
struct vfsopt *opt; | struct vfsopt *opt; | ||||
struct vfsoptlist *opts; | struct vfsoptlist *opts; | ||||
struct prison *pr, *deadpr, *inspr, *mypr, *ppr, *tpr; | struct prison *pr, *deadpr, *dinspr, *inspr, *mypr, *ppr, *tpr; | ||||
struct vnode *root; | struct vnode *root; | ||||
char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid; | char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid; | ||||
char *g_path, *osrelstr; | char *g_path, *osrelstr; | ||||
Context not available. | |||||
#endif | #endif | ||||
unsigned long hid; | unsigned long hid; | ||||
size_t namelen, onamelen, pnamelen; | size_t namelen, onamelen, pnamelen; | ||||
int born, created, cuflags, descend, drflags, enforce; | int created, cuflags, descend, drflags, enforce; | ||||
int error, errmsg_len, errmsg_pos; | int error, errmsg_len, errmsg_pos; | ||||
int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; | int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; | ||||
int jid, jsys, len, level; | int deadid, jid, jsys, len, level; | ||||
int childmax, osreldt, rsnum, slevel; | int childmax, osreldt, rsnum, slevel; | ||||
#if defined(INET) || defined(INET6) | #if defined(INET) || defined(INET6) | ||||
int ii, ij; | int ii, ij; | ||||
Context not available. | |||||
*/ | */ | ||||
pr = NULL; | pr = NULL; | ||||
inspr = NULL; | inspr = NULL; | ||||
deadpr = NULL; | |||||
if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { | if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { | ||||
namelc = strrchr(name, '.'); | namelc = strrchr(name, '.'); | ||||
jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); | jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10); | ||||
Context not available. | |||||
continue; | continue; | ||||
if (inspr->pr_id > jid) | if (inspr->pr_id > jid) | ||||
break; | break; | ||||
pr = inspr; | if (prison_isalive(inspr)) { | ||||
mtx_lock(&pr->pr_mtx); | pr = inspr; | ||||
drflags |= PD_LOCKED; | mtx_lock(&pr->pr_mtx); | ||||
drflags |= PD_LOCKED; | |||||
} else { | |||||
/* Note a dying jail to handle later. */ | |||||
deadpr = inspr; | |||||
} | |||||
inspr = NULL; | inspr = NULL; | ||||
break; | break; | ||||
} | } | ||||
if (pr != NULL) { | if (cuflags == JAIL_CREATE && pr != NULL) { | ||||
/* Create: jid must not exist. */ | /* | ||||
if (cuflags == JAIL_CREATE) { | * Even creators that cannot see the jail will get | ||||
/* | * EEXIST. | ||||
* Even creators that cannot see the jail will | */ | ||||
* get EEXIST. | error = EEXIST; | ||||
*/ | vfs_opterror(opts, "jail %d already exists", jid); | ||||
error = EEXIST; | goto done_deref; | ||||
vfs_opterror(opts, "jail %d already exists", | } | ||||
jid); | if ((pr == NULL) | ||||
goto done_deref; | ? cuflags == JAIL_UPDATE | ||||
} | : !prison_ischild(mypr, pr)) { | ||||
if (!prison_ischild(mypr, pr)) { | /* | ||||
/* | * Updaters get ENOENT for nonexistent jails, or for | ||||
* Updaters get ENOENT if they cannot see the | * jails they cannot see. The latter case is true | ||||
* jail. This is true even for CREATE | UPDATE, | * even for CREATE | UPDATE, which normally cannot | ||||
* which normally cannot give this error. | * give this error. | ||||
*/ | */ | ||||
error = ENOENT; | error = ENOENT; | ||||
vfs_opterror(opts, "jail %d not found", jid); | vfs_opterror(opts, "jail %d not found", jid); | ||||
goto done_deref; | goto done_deref; | ||||
} | |||||
ppr = pr->pr_parent; | |||||
if (!prison_isalive(ppr)) { | |||||
error = ENOENT; | |||||
vfs_opterror(opts, "jail %d is dying", | |||||
ppr->pr_id); | |||||
goto done_deref; | |||||
} | |||||
if (!prison_isalive(pr)) { | |||||
if (!(flags & JAIL_DYING)) { | |||||
error = ENOENT; | |||||
vfs_opterror(opts, "jail %d is dying", | |||||
jid); | |||||
goto done_deref; | |||||
} | |||||
if ((flags & JAIL_ATTACH) || | |||||
(pr_flags & PR_PERSIST)) { | |||||
/* | |||||
* A dying jail might be resurrected | |||||
* (via attach or persist), but first | |||||
* it must determine if another jail | |||||
* has claimed its name. Accomplish | |||||
* this by implicitly re-setting the | |||||
* name. | |||||
*/ | |||||
if (name == NULL) | |||||
name = prison_name(mypr, pr); | |||||
} | |||||
} | |||||
} else { | |||||
/* Update: jid must exist. */ | |||||
if (cuflags == JAIL_UPDATE) { | |||||
error = ENOENT; | |||||
vfs_opterror(opts, "jail %d not found", jid); | |||||
goto done_deref; | |||||
} | |||||
} | } | ||||
} | } | ||||
/* | /* | ||||
Context not available. | |||||
if (namelc[0] != '\0') { | if (namelc[0] != '\0') { | ||||
pnamelen = | pnamelen = | ||||
(ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; | (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; | ||||
deadpr = NULL; | |||||
FOREACH_PRISON_CHILD(ppr, tpr) { | FOREACH_PRISON_CHILD(ppr, tpr) { | ||||
if (tpr != pr && | if (tpr == pr || !prison_isalive(tpr) || | ||||
!strcmp(tpr->pr_name + pnamelen, namelc)) { | strcmp(tpr->pr_name + pnamelen, namelc)) | ||||
if (prison_isalive(tpr)) { | continue; | ||||
if (pr == NULL && | if (cuflags == JAIL_CREATE || pr != NULL) { | ||||
cuflags != JAIL_CREATE) { | /* | ||||
/* | * Create, or update(jid): name must | ||||
* Use this jail | * not exist in an active sibling jail. | ||||
* for updates. | */ | ||||
*/ | error = EEXIST; | ||||
pr = tpr; | |||||
mtx_lock(&pr->pr_mtx); | |||||
drflags |= PD_LOCKED; | |||||
break; | |||||
} | |||||
/* | |||||
* Create, or update(jid): | |||||
* name must not exist in an | |||||
* active sibling jail. | |||||
*/ | |||||
error = EEXIST; | |||||
vfs_opterror(opts, | |||||
"jail \"%s\" already exists", | |||||
name); | |||||
goto done_deref; | |||||
} | |||||
if (pr == NULL && | |||||
cuflags != JAIL_CREATE) { | |||||
deadpr = tpr; | |||||
} | |||||
} | |||||
} | |||||
/* If no active jail is found, use a dying one. */ | |||||
if (deadpr != NULL && pr == NULL) { | |||||
if (flags & JAIL_DYING) { | |||||
pr = deadpr; | |||||
mtx_lock(&pr->pr_mtx); | |||||
drflags |= PD_LOCKED; | |||||
} else if (cuflags == JAIL_UPDATE) { | |||||
error = ENOENT; | |||||
vfs_opterror(opts, | vfs_opterror(opts, | ||||
"jail \"%s\" is dying", name); | "jail \"%s\" already exists", name); | ||||
goto done_deref; | goto done_deref; | ||||
} | } | ||||
/* Use this jail for updates. */ | |||||
pr = tpr; | |||||
mtx_lock(&pr->pr_mtx); | |||||
drflags |= PD_LOCKED; | |||||
break; | |||||
} | } | ||||
/* Update: name must exist if no jid. */ | /* | ||||
else if (cuflags == JAIL_UPDATE && pr == NULL) { | * Update: name must exist if no jid is specified. | ||||
* As with the jid case, the jail must be currently | |||||
* visible, or else even CREATE | UPDATE will get | |||||
* an error. | |||||
*/ | |||||
if ((pr == NULL) | |||||
? cuflags == JAIL_UPDATE | |||||
: !prison_isalive(pr)) { | |||||
error = ENOENT; | error = ENOENT; | ||||
vfs_opterror(opts, "jail \"%s\" not found", | vfs_opterror(opts, "jail \"%s\" not found", | ||||
name); | name); | ||||
Context not available. | |||||
vfs_opterror(opts, "prison limit exceeded"); | vfs_opterror(opts, "prison limit exceeded"); | ||||
goto done_deref; | goto done_deref; | ||||
} | } | ||||
Not Done Inline ActionsI am pondering if "dying jails" should be allocated from the end of the ID space; imagine jid=2 is dying immediately, jid=3...7 are running, and jid=8 is not started yet; then you probably pick jid=8 (I assume from the function name) and when jid=8 tries to start jid 8 is already used and the confusion continues as this change kind-of even makes it worse rather than better. As I am expecting that to become an allocator problem ... and lastid .. unless we split the ID space in half or do some other nasty thing... bz: I am pondering if "dying jails" should be allocated from the end of the ID space;
imagine… | |||||
Done Inline ActionsIf by "jid=8 is not started yet" you mean that nothing at all has been done with jid=8, then yes it would be picked. If you mean it's in the process of creation and not ready yet, then jid=9 would be picked, so no problem. get_next_prid is based on the code that used to be inside kern_jail_set, moved into its function because in one iteration of this code I called it in two different places. Even though it's now only called in one, I'm keeping it in its own function because I should have been better with my code organization all along. jamie: If by "jid=8 is not started yet" you mean that nothing at all has been done with jid=8, then… | |||||
Done Inline ActionsCurrently, lastid sets a limit on the total number of jails, current or dying. With the potential renumbering of dying jails, lastid still sets a limit on the total number of jails, current and dying. So I don't expect any more of an allocator problem than we have now. Generally, I see two scenarios: JID partitioning would either cut down the number of available active jails, limit the number of dying jails (which probably wouldn't be an issue), or convert dying jails to jids greater than JAIL_MAX. jamie: Currently, lastid sets a limit on the total number of jails, current or dying. With the… | |||||
Not Done Inline ActionsI mean if my config has 50 jails and I am starting to rely on jid because the guarantee is now basically coming in and while I am starting all 50 jails one of the earlier jails already dies then the 1..50 guarantee is not there either and I still cannot use "JID" for anything unless I dynamically keep it at the end of start and with that I am no better or worse of using "name", right? So what does this change really buy me? The answer is probably nothing? The only thing is that I never started to use names and should have a long time ago? Or am I getting something wrong here? bz: I mean if my config has 50 jails and I am starting to rely on jid because the guarantee is now… | |||||
Done Inline ActionsThis isn't about relying on the jid staying under a certain number when allocated dynamically; that works exactly the same as it has before. This is about you having 50 jails with static-JID definitions like: Currently, jail 1 dies with a TCP keepalive or other such thing that keeps it in dying state, and you can "create" it again with the same jid using "-d" or "allow.dying", which really just brings it back to life. This doesn't change much from the user perspective: you can still re-create jail 1 when it's dying (with or without the now-unused allow.dying), but you'll get a new jail while the old one gets shoved off to jid 51 or whatever. Either way, the user sees "jail -c 1" working as expected. The difference, and the reason for the patch, is that now the new jail 1 is always in a consistent newly-created state, without potential problems from something in the old jail state and jail.conf. Part of this is to ease potential security problems (I don't have any examples cleaner is better). Part is with a guarantee that dead jails stay dead, more aggressive action can be taken to clean up dying jails. Notably, the potential advantages are the same for other scenarios of making dying jails totally invisible to userspace, or of not allowing static jids. jamie: This isn't about relying on the jid staying under a certain number when allocated dynamically… | |||||
Not Done Inline ActionsTo my understanding the problem you are solving is "everything is running and something gets ""restarted""". Your above explanation has the implicit understanding that no jail dies while all jails are initially started. That is not guaranteed. If I re-created your jail 1 the "now dying one" may be on jid 42 which is < 50 and suddenly jail 42..50 are on 43..51. My problem is that "during startup your change now can screw my expectations of I start <n> jails they'll get jid 1..<n>" as those jids are all unused, because the "dying jail" consumes an extra JID on "restart" given creation of <n> jails is not atomic and a "restart" can happen before all are running. So one problem solved another one introduced; hence my suggestion of moving dead jails to the end of the jail ID space rather than to the next free ID initially to reduce the likelihood of collision. If we do something like that then the startup problem also goes away and I'd be happy I think. bz: To my understanding the problem you are solving is "everything is running and something gets… | |||||
Done Inline ActionsOK, it took a while, but I think get it now :-). Yes, that would happen if you started jails 1-41, and 1 died, and then you restarted it (explicitly setting it to 1), and then you started what you thought would be 42-50 which would turn out to be 43-51. The dying jails don't get renumbered when they're dying. That only happens when a newly created jail wants to use the jid that a dying one currently occupies. So this problem scenario only happens when jails are implicitly numbered at first, and then explicitly numbered to their old jids on re-creation, and that re-creation can happen before all the jails are first created. Yes, in that scenario, the jids may not be as expected, and counting down the renumbered dying jails from the top would solve the issue. jamie: OK, it took a while, but I think get it now :-).
Yes, that would happen if you started jails 1… | |||||
if (deadpr != NULL) { | |||||
/* | |||||
* The prison being created has the same ID as a dying | |||||
* one. Handle this by giving the dying jail a new ID. | |||||
* This may cause some confusion to user space, but | |||||
* only to those listing dying jails. | |||||
*/ | |||||
deadid = get_next_deadid(&dinspr); | |||||
if (deadid == 0) { | |||||
error = EAGAIN; | |||||
vfs_opterror(opts, "no available jail IDs"); | |||||
goto done_deref; | |||||
} | |||||
mtx_lock(&deadpr->pr_mtx); | |||||
deadpr->pr_id = deadid; | |||||
mtx_unlock(&deadpr->pr_mtx); | |||||
if (dinspr == deadpr) | |||||
inspr = deadpr; | |||||
else { | |||||
inspr = TAILQ_NEXT(deadpr, pr_list); | |||||
TAILQ_REMOVE(&allprison, deadpr, pr_list); | |||||
if (dinspr != NULL) | |||||
TAILQ_INSERT_AFTER(&allprison, dinspr, | |||||
deadpr, pr_list); | |||||
else | |||||
TAILQ_INSERT_HEAD(&allprison, deadpr, | |||||
pr_list); | |||||
} | |||||
} | |||||
if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) { | if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) { | ||||
error = EAGAIN; | error = EAGAIN; | ||||
vfs_opterror(opts, "no available jail IDs"); | vfs_opterror(opts, "no available jail IDs"); | ||||
Context not available. | |||||
* Persistent prisons get an extra reference, and prisons losing their | * Persistent prisons get an extra reference, and prisons losing their | ||||
* persist flag lose that reference. | * persist flag lose that reference. | ||||
*/ | */ | ||||
born = !prison_isalive(pr); | |||||
if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) { | if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) { | ||||
if (pr_flags & PR_PERSIST) { | if (pr_flags & PR_PERSIST) { | ||||
prison_hold(pr); | prison_hold(pr); | ||||
/* | /* | ||||
* This may make a dead prison alive again, but wait | * This may be a new prison's first user reference, | ||||
* to label it as such until after OSD calls have had | * but wait to call it alive until after OSD calls | ||||
* a chance to run (and perhaps to fail). | * have had a chance to run (and perhaps to fail). | ||||
*/ | */ | ||||
refcount_acquire(&pr->pr_uref); | refcount_acquire(&pr->pr_uref); | ||||
} else { | } else { | ||||
Context not available. | |||||
* Any errors past this point will need to de-persist newly created | * Any errors past this point will need to de-persist newly created | ||||
* prisons, as well as call remove methods. | * prisons, as well as call remove methods. | ||||
*/ | */ | ||||
if (born) | if (created) | ||||
drflags |= PD_KILL; | drflags |= PD_KILL; | ||||
#ifdef RACCT | #ifdef RACCT | ||||
Context not available. | |||||
#endif | #endif | ||||
/* Let the modules do their work. */ | /* Let the modules do their work. */ | ||||
if (born) { | if (created) { | ||||
error = osd_jail_call(pr, PR_METHOD_CREATE, opts); | error = osd_jail_call(pr, PR_METHOD_CREATE, opts); | ||||
if (error) | if (error) | ||||
goto done_deref; | goto done_deref; | ||||
Context not available. | |||||
* A new prison is now ready to be seen; either it has gained a user | * A new prison is now ready to be seen; either it has gained a user | ||||
* reference via persistence, or is about to gain one via attachment. | * reference via persistence, or is about to gain one via attachment. | ||||
*/ | */ | ||||
if (born) { | if (created) { | ||||
drflags = prison_lock_xlock(pr, drflags); | drflags = prison_lock_xlock(pr, drflags); | ||||
pr->pr_state = PRISON_STATE_ALIVE; | pr->pr_state = PRISON_STATE_ALIVE; | ||||
} | } | ||||
Context not available. | |||||
return (jid); | return (jid); | ||||
} | } | ||||
/* | |||||
* Find the next available ID for a renumbered dead prison. This is the same | |||||
* as get_next_prid, but counting backward from the end of the range. | |||||
*/ | |||||
static int | |||||
get_next_deadid(struct prison **dinsprp) | |||||
{ | |||||
struct prison *dinspr; | |||||
int deadid, minid; | |||||
deadid = lastdeadid ? lastdeadid - 1 : JAIL_MAX; | |||||
/* | |||||
* Take two reverse passes through the allprison list: first | |||||
* starting with the proposed deadid, then ending with it. | |||||
*/ | |||||
for (minid = 1; minid != 0; ) { | |||||
TAILQ_FOREACH_REVERSE(dinspr, &allprison, prisonlist, pr_list) { | |||||
if (dinspr->pr_id > deadid) | |||||
continue; | |||||
if (dinspr->pr_id < deadid) { | |||||
/* Found an opening. */ | |||||
minid = 0; | |||||
break; | |||||
} | |||||
if (--deadid < minid) { | |||||
if (lastdeadid == minid || lastdeadid == 0) | |||||
{ | |||||
/* | |||||
* The entire legal range | |||||
* has been traversed | |||||
*/ | |||||
return 0; | |||||
} | |||||
/* Try again from the end. */ | |||||
deadid = JAIL_MAX; | |||||
minid = lastdeadid; | |||||
break; | |||||
} | |||||
} | |||||
if (dinspr == NULL) { | |||||
/* Found room at the beginning of the list. */ | |||||
break; | |||||
} | |||||
} | |||||
*dinsprp = dinspr; | |||||
lastdeadid = deadid; | |||||
return (deadid); | |||||
} | |||||
/* | /* | ||||
* struct jail_get_args { | * struct jail_get_args { | ||||
* struct iovec *iovp; | * struct iovec *iovp; | ||||
Context not available. |
You could also just add PD_KILL at 0x20 and leave the other flags on the same bits?