Index: projects/nfsv42/sys/amd64/amd64/pmap.c =================================================================== --- projects/nfsv42/sys/amd64/amd64/pmap.c (revision 350367) +++ projects/nfsv42/sys/amd64/amd64/pmap.c (revision 350368) @@ -1,9949 +1,9939 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * Copyright (c) 2014-2019 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Portions of this software were developed by * Konstantin Belousov under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #define AMD64_NPT_AWARE #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_ddb.h" #include "opt_pmap.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include static __inline boolean_t pmap_type_guest(pmap_t pmap) { return ((pmap->pm_type == PT_EPT) || (pmap->pm_type == PT_RVI)); } static __inline boolean_t pmap_emulate_ad_bits(pmap_t pmap) { return ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) != 0); } static __inline pt_entry_t pmap_valid_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_V; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_EMUL_V; else mask = EPT_PG_READ; break; default: panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_rw_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_RW; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_EMUL_RW; else mask = EPT_PG_WRITE; break; default: panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static pt_entry_t pg_g; static __inline pt_entry_t pmap_global_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: mask = pg_g; break; case PT_RVI: case PT_EPT: mask = 0; break; default: panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_accessed_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_A; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_READ; else mask = EPT_PG_A; break; default: panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_modified_bit(pmap_t pmap) { pt_entry_t mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = X86_PG_M; break; case PT_EPT: if (pmap_emulate_ad_bits(pmap)) mask = EPT_PG_WRITE; else mask = EPT_PG_M; break; default: panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type); } return (mask); } static __inline pt_entry_t pmap_pku_mask_bit(pmap_t pmap) { return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0); } #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int nkpt; SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0, "Number of kernel page table pages allocated on bootup"); static int ndmpdp; vm_paddr_t dmaplimit; vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS; pt_entry_t pg_nx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ static u_int64_t KPTphys; /* phys addr of kernel level 1 */ static u_int64_t KPDphys; /* phys addr of kernel level 2 */ u_int64_t KPDPphys; /* phys addr of kernel level 3 */ u_int64_t KPML4phys; /* phys addr of kernel level 4 */ static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ static int ndmpdpphys; /* number of DMPDPphys pages */ static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; /* * Data for the pv entry allocation mechanism. * Updates to pv_invl_gen are protected by the pv_list_locks[] * elements, but reads are not. */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx __exclusive_cache_line pv_chunks_mutex; static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; static u_long pv_invl_gen[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = NULL; caddr_t CADDR1 = 0; static vm_offset_t qframe = 0; static struct mtx qframe_mtx; static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ static vmem_t *large_vmem; static u_int lm_ents; #define PMAP_LARGEMAP_MAX_ADDRESS() \ (LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents) int pmap_pcid_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?"); int invpcid_works = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, "Is the invpcid instruction available ?"); int __read_frequently pti = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pti, 0, "Page Table Isolation enabled"); static vm_object_t pti_obj; static pml4_entry_t *pti_pml4; static vm_pindex_t pti_pg_idx; static bool pti_finalized; struct pmap_pkru_range { struct rs_el pkru_rs_el; u_int pkru_keyidx; int pkru_flags; }; static uma_zone_t pmap_pkru_ranges_zone; static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va); static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static void *pkru_dup_range(void *ctx, void *data); static void pkru_free_range(void *ctx, void *node); static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap); static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); static void pmap_pkru_deassign_all(pmap_t pmap); static int pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS) { int i; uint64_t res; res = 0; CPU_FOREACH(i) { res += cpuid_to_pcpu[i]->pc_pm_save_cnt; } return (sysctl_handle_64(oidp, &res, 0, req)); } SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU", "Count of saved TLB context on switch"); static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker = LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker); static struct mtx invl_gen_mtx; /* Fake lock object to satisfy turnstiles interface. */ static struct lock_object invl_gen_ts = { .lo_name = "invlts", }; static struct pmap_invl_gen pmap_invl_gen_head = { .gen = 1, .next = NULL, }; static u_long pmap_invl_gen = 1; static int pmap_invl_waiters; static struct callout pmap_invl_callout; static bool pmap_invl_callout_inited; #define PMAP_ASSERT_NOT_IN_DI() \ KASSERT(pmap_not_in_di(), ("DI already started")) static bool pmap_di_locked(void) { int tun; if ((cpu_feature2 & CPUID2_CX16) == 0) return (true); tun = 0; TUNABLE_INT_FETCH("vm.pmap.di_locked", &tun); return (tun != 0); } static int sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS) { int locked; locked = pmap_di_locked(); return (sysctl_handle_int(oidp, &locked, 0, req)); } SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked, CTLTYPE_INT | CTLFLAG_RDTUN | CTLFLAG_MPSAFE, 0, 0, sysctl_pmap_di_locked, "", "Locked delayed invalidation"); static bool pmap_not_in_di_l(void); static bool pmap_not_in_di_u(void); DEFINE_IFUNC(, bool, pmap_not_in_di, (void)) { return (pmap_di_locked() ? pmap_not_in_di_l : pmap_not_in_di_u); } static bool pmap_not_in_di_l(void) { struct pmap_invl_gen *invl_gen; invl_gen = &curthread->td_md.md_invl_gen; return (invl_gen->gen == 0); } static void pmap_thread_init_invl_gen_l(struct thread *td) { struct pmap_invl_gen *invl_gen; invl_gen = &td->td_md.md_invl_gen; invl_gen->gen = 0; } static void pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen) { struct turnstile *ts; ts = turnstile_trywait(&invl_gen_ts); if (*m_gen > atomic_load_long(invl_gen)) turnstile_wait(ts, NULL, TS_SHARED_QUEUE); else turnstile_cancel(ts); } static void pmap_delayed_invl_finish_unblock(u_long new_gen) { struct turnstile *ts; turnstile_chain_lock(&invl_gen_ts); ts = turnstile_lookup(&invl_gen_ts); if (new_gen != 0) pmap_invl_gen = new_gen; if (ts != NULL) { turnstile_broadcast(ts, TS_SHARED_QUEUE); turnstile_unpend(ts); } turnstile_chain_unlock(&invl_gen_ts); } /* * Start a new Delayed Invalidation (DI) block of code, executed by * the current thread. Within a DI block, the current thread may * destroy both the page table and PV list entries for a mapping and * then release the corresponding PV list lock before ensuring that * the mapping is flushed from the TLBs of any processors with the * pmap active. */ static void pmap_delayed_invl_start_l(void) { struct pmap_invl_gen *invl_gen; u_long currgen; invl_gen = &curthread->td_md.md_invl_gen; PMAP_ASSERT_NOT_IN_DI(); mtx_lock(&invl_gen_mtx); if (LIST_EMPTY(&pmap_invl_gen_tracker)) currgen = pmap_invl_gen; else currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen; invl_gen->gen = currgen + 1; LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link); mtx_unlock(&invl_gen_mtx); } /* * Finish the DI block, previously started by the current thread. All * required TLB flushes for the pages marked by * pmap_delayed_invl_page() must be finished before this function is * called. * * This function works by bumping the global DI generation number to * the generation number of the current thread's DI, unless there is a * pending DI that started earlier. In the latter case, bumping the * global DI generation number would incorrectly signal that the * earlier DI had finished. Instead, this function bumps the earlier * DI's generation number to match the generation number of the * current thread's DI. */ static void pmap_delayed_invl_finish_l(void) { struct pmap_invl_gen *invl_gen, *next; invl_gen = &curthread->td_md.md_invl_gen; KASSERT(invl_gen->gen != 0, ("missed invl_start")); mtx_lock(&invl_gen_mtx); next = LIST_NEXT(invl_gen, link); if (next == NULL) pmap_delayed_invl_finish_unblock(invl_gen->gen); else next->gen = invl_gen->gen; LIST_REMOVE(invl_gen, link); mtx_unlock(&invl_gen_mtx); invl_gen->gen = 0; } static bool pmap_not_in_di_u(void) { struct pmap_invl_gen *invl_gen; invl_gen = &curthread->td_md.md_invl_gen; return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0); } static void pmap_thread_init_invl_gen_u(struct thread *td) { struct pmap_invl_gen *invl_gen; invl_gen = &td->td_md.md_invl_gen; invl_gen->gen = 0; invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID; } static bool pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out) { uint64_t new_high, new_low, old_high, old_low; char res; old_low = new_low = 0; old_high = new_high = (uintptr_t)0; __asm volatile("lock;cmpxchg16b\t%1;sete\t%0" : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) : "b"(new_low), "c" (new_high) : "memory", "cc"); if (res == 0) { if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0) return (false); out->gen = old_low; out->next = (void *)old_high; } else { out->gen = new_low; out->next = (void *)new_high; } return (true); } static bool pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val, struct pmap_invl_gen *new_val) { uint64_t new_high, new_low, old_high, old_low; char res; new_low = new_val->gen; new_high = (uintptr_t)new_val->next; old_low = old_val->gen; old_high = (uintptr_t)old_val->next; __asm volatile("lock;cmpxchg16b\t%1;sete\t%0" : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high) : "b"(new_low), "c" (new_high) : "memory", "cc"); return (res); } #ifdef PV_STATS static long invl_start_restart; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_start_restart, CTLFLAG_RD, &invl_start_restart, 0, ""); static long invl_finish_restart; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD, &invl_finish_restart, 0, ""); static int invl_max_qlen; SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD, &invl_max_qlen, 0, ""); #endif static struct lock_delay_config __read_frequently di_delay; LOCK_DELAY_SYSINIT_DEFAULT(di_delay); static void pmap_delayed_invl_start_u(void) { struct pmap_invl_gen *invl_gen, *p, prev, new_prev; struct thread *td; struct lock_delay_arg lda; uintptr_t prevl; u_char pri; #ifdef PV_STATS int i, ii; #endif td = curthread; invl_gen = &td->td_md.md_invl_gen; PMAP_ASSERT_NOT_IN_DI(); lock_delay_arg_init(&lda, &di_delay); invl_gen->saved_pri = 0; pri = td->td_base_pri; if (pri > PVM) { thread_lock(td); pri = td->td_base_pri; if (pri > PVM) { invl_gen->saved_pri = pri; sched_prio(td, PVM); } thread_unlock(td); } again: PV_STAT(i = 0); for (p = &pmap_invl_gen_head;; p = prev.next) { PV_STAT(i++); prevl = atomic_load_ptr(&p->next); if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { PV_STAT(atomic_add_long(&invl_start_restart, 1)); lock_delay(&lda); goto again; } if (prevl == 0) break; prev.next = (void *)prevl; } #ifdef PV_STATS if ((ii = invl_max_qlen) < i) atomic_cmpset_int(&invl_max_qlen, ii, i); #endif if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) { PV_STAT(atomic_add_long(&invl_start_restart, 1)); lock_delay(&lda); goto again; } new_prev.gen = prev.gen; new_prev.next = invl_gen; invl_gen->gen = prev.gen + 1; /* Formal fence between store to invl->gen and updating *p. */ atomic_thread_fence_rel(); /* * After inserting an invl_gen element with invalid bit set, * this thread blocks any other thread trying to enter the * delayed invalidation block. Do not allow to remove us from * the CPU, because it causes starvation for other threads. */ critical_enter(); /* * ABA for *p is not possible there, since p->gen can only * increase. So if the *p thread finished its di, then * started a new one and got inserted into the list at the * same place, its gen will appear greater than the previously * read gen. */ if (!pmap_di_store_invl(p, &prev, &new_prev)) { critical_exit(); PV_STAT(atomic_add_long(&invl_start_restart, 1)); lock_delay(&lda); goto again; } /* * There we clear PMAP_INVL_GEN_NEXT_INVALID in * invl_gen->next, allowing other threads to iterate past us. * pmap_di_store_invl() provides fence between the generation * write and the update of next. */ invl_gen->next = NULL; critical_exit(); } static bool pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen, struct pmap_invl_gen *p) { struct pmap_invl_gen prev, new_prev; u_long mygen; /* * Load invl_gen->gen after setting invl_gen->next * PMAP_INVL_GEN_NEXT_INVALID. This prevents larger * generations to propagate to our invl_gen->gen. Lock prefix * in atomic_set_ptr() worked as seq_cst fence. */ mygen = atomic_load_long(&invl_gen->gen); if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen) return (false); KASSERT(prev.gen < mygen, ("invalid di gen sequence %lu %lu", prev.gen, mygen)); new_prev.gen = mygen; new_prev.next = (void *)((uintptr_t)invl_gen->next & ~PMAP_INVL_GEN_NEXT_INVALID); /* Formal fence between load of prev and storing update to it. */ atomic_thread_fence_rel(); return (pmap_di_store_invl(p, &prev, &new_prev)); } static void pmap_delayed_invl_finish_u(void) { struct pmap_invl_gen *invl_gen, *p; struct thread *td; struct lock_delay_arg lda; uintptr_t prevl; td = curthread; invl_gen = &td->td_md.md_invl_gen; KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0")); KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0, ("missed invl_start: INVALID")); lock_delay_arg_init(&lda, &di_delay); again: for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) { prevl = atomic_load_ptr(&p->next); if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) { PV_STAT(atomic_add_long(&invl_finish_restart, 1)); lock_delay(&lda); goto again; } if ((void *)prevl == invl_gen) break; } /* * It is legitimate to not find ourself on the list if a * thread before us finished its DI and started it again. */ if (__predict_false(p == NULL)) { PV_STAT(atomic_add_long(&invl_finish_restart, 1)); lock_delay(&lda); goto again; } critical_enter(); atomic_set_ptr((uintptr_t *)&invl_gen->next, PMAP_INVL_GEN_NEXT_INVALID); if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) { atomic_clear_ptr((uintptr_t *)&invl_gen->next, PMAP_INVL_GEN_NEXT_INVALID); critical_exit(); PV_STAT(atomic_add_long(&invl_finish_restart, 1)); lock_delay(&lda); goto again; } critical_exit(); if (atomic_load_int(&pmap_invl_waiters) > 0) pmap_delayed_invl_finish_unblock(0); if (invl_gen->saved_pri != 0) { thread_lock(td); sched_prio(td, invl_gen->saved_pri); thread_unlock(td); } } #ifdef DDB DB_SHOW_COMMAND(di_queue, pmap_di_queue) { struct pmap_invl_gen *p, *pn; struct thread *td; uintptr_t nextl; bool first; for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn, first = false) { nextl = atomic_load_ptr(&p->next); pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID); td = first ? NULL : __containerof(p, struct thread, td_md.md_invl_gen); db_printf("gen %lu inv %d td %p tid %d\n", p->gen, (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td, td != NULL ? td->td_tid : -1); } } #endif #ifdef PV_STATS static long invl_wait; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0, "Number of times DI invalidation blocked pmap_remove_all/write"); static long invl_wait_slow; SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD, &invl_wait_slow, 0, "Number of slow invalidation waits for lockless DI"); #endif static u_long * pmap_delayed_invl_genp(vm_page_t m) { return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]); } static void pmap_delayed_invl_callout_func(void *arg __unused) { if (atomic_load_int(&pmap_invl_waiters) == 0) return; pmap_delayed_invl_finish_unblock(0); } static void pmap_delayed_invl_callout_init(void *arg __unused) { if (pmap_di_locked()) return; callout_init(&pmap_invl_callout, 1); pmap_invl_callout_inited = true; } SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_delayed_invl_callout_init, NULL); /* * Ensure that all currently executing DI blocks, that need to flush * TLB for the given page m, actually flushed the TLB at the time the * function returned. If the page m has an empty PV list and we call * pmap_delayed_invl_wait(), upon its return we know that no CPU has a * valid mapping for the page m in either its page table or TLB. * * This function works by blocking until the global DI generation * number catches up with the generation number associated with the * given page m and its PV list. Since this function's callers * typically own an object lock and sometimes own a page lock, it * cannot sleep. Instead, it blocks on a turnstile to relinquish the * processor. */ static void pmap_delayed_invl_wait_l(vm_page_t m) { u_long *m_gen; #ifdef PV_STATS bool accounted = false; #endif m_gen = pmap_delayed_invl_genp(m); while (*m_gen > pmap_invl_gen) { #ifdef PV_STATS if (!accounted) { atomic_add_long(&invl_wait, 1); accounted = true; } #endif pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen); } } static void pmap_delayed_invl_wait_u(vm_page_t m) { u_long *m_gen; struct lock_delay_arg lda; bool fast; fast = true; m_gen = pmap_delayed_invl_genp(m); lock_delay_arg_init(&lda, &di_delay); while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { if (fast || !pmap_invl_callout_inited) { PV_STAT(atomic_add_long(&invl_wait, 1)); lock_delay(&lda); fast = false; } else { /* * The page's invalidation generation number * is still below the current thread's number. * Prepare to block so that we do not waste * CPU cycles or worse, suffer livelock. * * Since it is impossible to block without * racing with pmap_delayed_invl_finish_u(), * prepare for the race by incrementing * pmap_invl_waiters and arming a 1-tick * callout which will unblock us if we lose * the race. */ atomic_add_int(&pmap_invl_waiters, 1); /* * Re-check the current thread's invalidation * generation after incrementing * pmap_invl_waiters, so that there is no race * with pmap_delayed_invl_finish_u() setting * the page generation and checking * pmap_invl_waiters. The only race allowed * is for a missed unblock, which is handled * by the callout. */ if (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) { callout_reset(&pmap_invl_callout, 1, pmap_delayed_invl_callout_func, NULL); PV_STAT(atomic_add_long(&invl_wait_slow, 1)); pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen_head.gen); } atomic_add_int(&pmap_invl_waiters, -1); } } } DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *)) { return (pmap_di_locked() ? pmap_thread_init_invl_gen_l : pmap_thread_init_invl_gen_u); } DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void)) { return (pmap_di_locked() ? pmap_delayed_invl_start_l : pmap_delayed_invl_start_u); } DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void)) { return (pmap_di_locked() ? pmap_delayed_invl_finish_l : pmap_delayed_invl_finish_u); } DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t)) { return (pmap_di_locked() ? pmap_delayed_invl_wait_l : pmap_delayed_invl_wait_u); } /* * Mark the page m's PV list as participating in the current thread's * DI block. Any threads concurrently using m's PV list to remove or * restrict all mappings to m will wait for the current thread's DI * block to complete before proceeding. * * The function works by setting the DI generation number for m's PV * list to at least the DI generation number of the current thread. * This forces a caller of pmap_delayed_invl_wait() to block until * current thread calls pmap_delayed_invl_finish(). */ static void pmap_delayed_invl_page(vm_page_t m) { u_long gen, *m_gen; rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); gen = curthread->td_md.md_invl_gen.gen; if (gen == 0) return; m_gen = pmap_delayed_invl_genp(m); if (*m_gen < gen) *m_gen = gen; } /* * Crashdump maps. */ static caddr_t crashdumpmap; /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static int popcnt_pc_map_pq(uint64_t *map); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, struct rwlock **lockp); #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp); #endif static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool noflush); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va); static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp); static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted); static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva); static void pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static vm_page_t pmap_large_map_getptp_unlocked(void); static vm_paddr_t pmap_large_map_kextract(vm_offset_t va); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask); #if VM_NRESERVLEVEL > 0 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp); #endif static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask); static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec); static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va); static pd_entry_t *pmap_pti_pde(vm_offset_t va); static void pmap_pti_wire_pte(void *pte); static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free); static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pd_entry_t *pde, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); /********************/ /* Inline functions */ /********************/ /* Return a non-clipped PD index for a given VA */ static __inline vm_pindex_t pmap_pde_pindex(vm_offset_t va) { return (va >> PDRSHIFT); } /* Return a pointer to the PML4 slot that corresponds to a VA */ static __inline pml4_entry_t * pmap_pml4e(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_pml4[pmap_pml4e_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) { pdp_entry_t *pdpe; pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); return (&pdpe[pmap_pdpe_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pdpe(pmap_t pmap, vm_offset_t va) { pml4_entry_t *pml4e; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pml4e = pmap_pml4e(pmap, va); if ((*pml4e & PG_V) == 0) return (NULL); return (pmap_pml4e_to_pdpe(pml4e, va)); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) { pd_entry_t *pde; pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); return (&pde[pmap_pde_index(va)]); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe == NULL || (*pdpe & PG_V) == 0) return (NULL); return (pmap_pdpe_to_pde(pdpe, va)); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) { pt_entry_t *pte; pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); return (&pte[pmap_pte_index(va)]); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pt_entry_t PG_V; PG_V = pmap_valid_bit(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) return (NULL); if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ return ((pt_entry_t *)pde); return (pmap_pde_to_pte(pde, va)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); return (PTmap + ((va >> PAGE_SHIFT) & mask)); } static __inline pd_entry_t * vtopde(vm_offset_t va) { u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); return (PDmap + ((va >> PDRSHIFT) & mask)); } static u_int64_t allocpages(vm_paddr_t *firstaddr, int n) { u_int64_t ret; ret = *firstaddr; bzero((void *)ret, n * PAGE_SIZE); *firstaddr += n * PAGE_SIZE; return (ret); } CTASSERT(powerof2(NDMPML4E)); /* number of kernel PDP slots */ #define NKPDPE(ptpgs) howmany(ptpgs, NPDEPG) static void nkpt_init(vm_paddr_t addr) { int pt_pages; #ifdef NKPT pt_pages = NKPT; #else pt_pages = howmany(addr, 1 << PDRSHIFT); pt_pages += NKPDPE(pt_pages); /* * Add some slop beyond the bare minimum required for bootstrapping * the kernel. * * This is quite important when allocating KVA for kernel modules. * The modules are required to be linked in the negative 2GB of * the address space. If we run out of KVA in this region then * pmap_growkernel() will need to allocate page table pages to map * the entire 512GB of KVA space which is an unnecessary tax on * physical memory. * * Secondly, device memory mapped as part of setting up the low- * level console(s) is taken from KVA, starting at virtual_avail. * This is because cninit() is called after pmap_bootstrap() but * before vm_init() and pmap_init(). 20MB for a frame buffer is * not uncommon. */ pt_pages += 32; /* 64MB additional slop. */ #endif nkpt = pt_pages; } /* * Returns the proper write/execute permission for a physical page that is * part of the initial boot allocations. * * If the page has kernel text, it is marked as read-only. If the page has * kernel read-only data, it is marked as read-only/not-executable. If the * page has only read-write data, it is marked as read-write/not-executable. * If the page is below/above the kernel range, it is marked as read-write. * * This function operates on 2M pages, since we map the kernel space that * way. * * Note that this doesn't currently provide any protection for modules. */ static inline pt_entry_t bootaddr_rwx(vm_paddr_t pa) { /* * Everything in the same 2M page as the start of the kernel * should be static. On the other hand, things in the same 2M * page as the end of the kernel could be read-write/executable, * as the kernel image is not guaranteed to end on a 2M boundary. */ if (pa < trunc_2mpage(btext - KERNBASE) || pa >= trunc_2mpage(_end - KERNBASE)) return (X86_PG_RW); /* * The linker should ensure that the read-only and read-write * portions don't share the same 2M page, so this shouldn't * impact read-only data. However, in any case, any page with * read-write data needs to be read-write. */ if (pa >= trunc_2mpage(brwsection - KERNBASE)) return (X86_PG_RW | pg_nx); /* * Mark any 2M page containing kernel text as read-only. Mark * other pages with read-only data as read-only and not executable. * (It is likely a small portion of the read-only data section will * be marked as read-only, but executable. This should be acceptable * since the read-only protection will keep the data from changing.) * Note that fixups to the .text section will still work until we * set CR0.WP. */ if (pa < round_2mpage(etext - KERNBASE)) return (0); return (pg_nx); } static void create_pagetables(vm_paddr_t *firstaddr) { int i, j, ndm1g, nkpdpe, nkdmpde; pd_entry_t *pd_p; pdp_entry_t *pdp_p; pml4_entry_t *p4_p; uint64_t DMPDkernphys; /* Allocate page table pages for the direct map */ ndmpdp = howmany(ptoa(Maxmem), NBPDP); if (ndmpdp < 4) /* Minimum 4GB of dirmap */ ndmpdp = 4; ndmpdpphys = howmany(ndmpdp, NPDPEPG); if (ndmpdpphys > NDMPML4E) { /* * Each NDMPML4E allows 512 GB, so limit to that, * and then readjust ndmpdp and ndmpdpphys. */ printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); Maxmem = atop(NDMPML4E * NBPML4); ndmpdpphys = NDMPML4E; ndmpdp = NDMPML4E * NPDEPG; } DMPDPphys = allocpages(firstaddr, ndmpdpphys); ndm1g = 0; if ((amd_feature & AMDID_PAGE1GB) != 0) { /* * Calculate the number of 1G pages that will fully fit in * Maxmem. */ ndm1g = ptoa(Maxmem) >> PDPSHIFT; /* * Allocate 2M pages for the kernel. These will be used in * place of the first one or more 1G pages from ndm1g. */ nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP); DMPDkernphys = allocpages(firstaddr, nkdmpde); } if (ndm1g < ndmpdp) DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; /* Allocate pages */ KPML4phys = allocpages(firstaddr, 1); KPDPphys = allocpages(firstaddr, NKPML4E); /* * Allocate the initial number of kernel page table pages required to * bootstrap. We defer this until after all memory-size dependent * allocations are done (e.g. direct map), so that we don't have to * build in too much slop in our estimate. * * Note that when NKPML4E > 1, we have an empty page underneath * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed) * pages. (pmap_enter requires a PD page to exist for each KPML4E.) */ nkpt_init(*firstaddr); nkpdpe = NKPDPE(nkpt); KPTphys = allocpages(firstaddr, nkpt); KPDphys = allocpages(firstaddr, nkpdpe); /* * Connect the zero-filled PT pages to their PD entries. This * implicitly maps the PT pages at their correct locations within * the PTmap. */ pd_p = (pd_entry_t *)KPDphys; for (i = 0; i < nkpt; i++) pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; /* * Map from physical address zero to the end of loader preallocated * memory using 2MB pages. This replaces some of the PD entries * created above. */ for (i = 0; (i << PDRSHIFT) < KERNend; i++) /* Preset PG_M and PG_A because demotion expects it. */ pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT); /* * Because we map the physical blocks in 2M pages, adjust firstaddr * to record the physical blocks we've actually mapped into kernel * virtual address space. */ if (*firstaddr < round_2mpage(KERNend)) *firstaddr = round_2mpage(KERNend); /* And connect up the PD to the PDP (leaving room for L4 pages) */ pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE)); for (i = 0; i < nkpdpe; i++) pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V; /* * Now, set up the direct map region using 2MB and/or 1GB pages. If * the end of physical memory is not aligned to a 1GB page boundary, * then the residual physical memory is mapped with 2MB pages. Later, * if pmap_mapdev{_attr}() uses the direct map for non-write-back * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings * that are partially used. */ pd_p = (pd_entry_t *)DMPDphys; for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) { pd_p[j] = (vm_paddr_t)i << PDRSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx; } pdp_p = (pdp_entry_t *)DMPDPphys; for (i = 0; i < ndm1g; i++) { pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; /* Preset PG_M and PG_A because demotion expects it. */ pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx; } for (j = 0; i < ndmpdp; i++, j++) { pdp_p[i] = DMPDphys + ptoa(j); pdp_p[i] |= X86_PG_RW | X86_PG_V; } /* * Instead of using a 1G page for the memory containing the kernel, * use 2M pages with appropriate permissions. (If using 1G pages, * this will partially overwrite the PDPEs above.) */ if (ndm1g) { pd_p = (pd_entry_t *)DMPDkernphys; for (i = 0; i < (NPDEPG * nkdmpde); i++) pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | pg_nx | bootaddr_rwx(i << PDRSHIFT); for (i = 0; i < nkdmpde; i++) pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW | X86_PG_V; } /* And recursively map PML4 to itself in order to get PTmap */ p4_p = (pml4_entry_t *)KPML4phys; p4_p[PML4PML4I] = KPML4phys; p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx; /* Connect the Direct Map slot(s) up to the PML4. */ for (i = 0; i < ndmpdpphys; i++) { p4_p[DMPML4I + i] = DMPDPphys + ptoa(i); p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V; } /* Connect the KVA slots up to the PML4 */ for (i = 0; i < NKPML4E; i++) { p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; } } /* * Bootstrap the system enough to run with virtual memory. * * On amd64 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t *firstaddr) { vm_offset_t va; pt_entry_t *pte; uint64_t cr4; u_long res; int i; KERNend = *firstaddr; res = atop(KERNend - (vm_paddr_t)kernphys); if (!pti) pg_g = X86_PG_G; /* * Create an initial set of page tables to run the kernel in. */ create_pagetables(firstaddr); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); /* * Account for the virtual addresses mapped by create_pagetables(). */ virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Enable PG_G global pages, then switch to the kernel page * table from the bootstrap page table. After the switch, it * is possible to enable SMEP and SMAP since PG_U bits are * correct now. */ cr4 = rcr4(); cr4 |= CR4_PGE; load_cr4(cr4); load_cr3(KPML4phys); if (cpu_stdext_feature & CPUID_STDEXT_SMEP) cr4 |= CR4_SMEP; if (cpu_stdext_feature & CPUID_STDEXT_SMAP) cr4 |= CR4_SMAP; load_cr4(cr4); /* * Initialize the kernel pmap (which is statically allocated). * Count bootstrap data as being resident in case any of this data is * later unmapped (using pmap_remove()) and freed. */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys); kernel_pmap->pm_cr3 = KPML4phys; kernel_pmap->pm_ucr3 = PMAP_NO_CR3; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); kernel_pmap->pm_stats.resident_count = res; kernel_pmap->pm_flags = pmap_flags; /* * Initialize the TLB invalidations generation number lock. */ mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * Crashdump maps. The first page is reused as CMAP1 for the * memory test. */ SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS) CADDR1 = crashdumpmap; virtual_avail = va; /* * Initialize the PAT MSR. * pmap_init_pat() clears and sets CR4_PGE, which, as a * side-effect, invalidates stale PG_G TLB entries that might * have been created in our pre-boot environment. */ pmap_init_pat(); /* Initialize TLB Context Id. */ if (pmap_pcid_enabled) { for (i = 0; i < MAXCPU; i++) { kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN; kernel_pmap->pm_pcids[i].pm_gen = 1; } /* * PMAP_PCID_KERN + 1 is used for initialization of * proc0 pmap. The pmap' pcid state might be used by * EFIRT entry before first context switch, so it * needs to be valid. */ PCPU_SET(pcid_next, PMAP_PCID_KERN + 2); PCPU_SET(pcid_gen, 1); /* * pcpu area for APs is zeroed during AP startup. * pc_pcid_next and pc_pcid_gen are initialized by AP * during pcpu setup. */ load_cr4(rcr4() | CR4_PCIDE); } } /* * Setup the PAT MSR. */ void pmap_init_pat(void) { uint64_t pat_msr; u_long cr0, cr4; int i; /* Bail if this CPU doesn't implement PAT. */ if ((cpu_feature & CPUID_PAT) == 0) panic("no PAT??"); /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = -1; pat_index[PAT_WRITE_BACK] = 0; pat_index[PAT_WRITE_THROUGH] = 1; pat_index[PAT_UNCACHEABLE] = 3; pat_index[PAT_WRITE_COMBINING] = 6; pat_index[PAT_WRITE_PROTECTED] = 5; pat_index[PAT_UNCACHED] = 2; /* * Initialize default PAT entries. * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * * Leave 4 and 7 as WB and UC. Note that a recursive page table * mapping for a 2M page uses a PAT value with the bit 3 set due * to its overload with PG_PS. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING) | PAT_VALUE(7, PAT_UNCACHEABLE); /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. */ load_cr0(cr0); load_cr4(cr4); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t m, mpte; vm_size_t s; int error, i, pv_npg, ret, skz63; /* L1TF, reserve page @0 unconditionally */ vm_page_blacklist_add(0, bootverbose); /* Detect bare-metal Skylake Server and Skylake-X. */ if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL && CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) { /* * Skylake-X errata SKZ63. Processor May Hang When * Executing Code In an HLE Transaction Region between * 40000000H and 403FFFFFH. * * Mark the pages in the range as preallocated. It * seems to be impossible to distinguish between * Skylake Server and Skylake X. */ skz63 = 1; TUNABLE_INT_FETCH("hw.skz63_enable", &skz63); if (skz63 != 0) { if (bootverbose) printf("SKZ63: skipping 4M RAM starting " "at physical 1G\n"); for (i = 0; i < atop(0x400000); i++) { ret = vm_page_blacklist_add(0x40000000 + ptoa(i), FALSE); if (!ret && bootverbose) printf("page at %#lx already used\n", 0x40000000 + ptoa(i)); } } } /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ PMAP_LOCK(kernel_pmap); for (i = 0; i < nkpt; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = pmap_pde_pindex(KERNBASE) + i; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); mpte->wire_count = 1; /* * Collect the page table pages that were replaced by a 2MB * page in create_pagetables(). They are zero filled. */ if (i << PDRSHIFT < KERNend && pmap_insert_pt_page(kernel_pmap, mpte, false)) panic("pmap_init: pmap_insert_pt_page failed"); } PMAP_UNLOCK(kernel_pmap); vm_wire_add(nkpt); /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; } /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); pmap_initialized = 1; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; /* Make the direct map consistent */ if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) { (void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa), ppim->sz, ppim->mode); } if (!bootverbose) continue; printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i, ppim->pa, ppim->va, ppim->sz, ppim->mode); } mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN); error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, (vmem_addr_t *)&qframe); if (error != 0) panic("qframe allocation failed"); lm_ents = 8; TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents); if (lm_ents > LMEPML4I - LMSPML4I + 1) lm_ents = LMEPML4I - LMSPML4I + 1; if (bootverbose) printf("pmap: large map %u PML4 slots (%lu Gb)\n", lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024)); if (lm_ents != 0) { large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS, (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK); if (large_vmem == NULL) { printf("pmap: cannot create large map\n"); lm_ents = 0; } for (i = 0; i < lm_ents; i++) { m = pmap_large_map_getptp_unlocked(); kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | VM_PAGE_TO_PHYS(m); } } } static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, "2MB page mapping counters"); static u_long pmap_pde_demotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pde_demotions, 0, "2MB page demotions"); static u_long pmap_pde_mappings; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pde_mappings, 0, "2MB page mappings"); static u_long pmap_pde_p_failures; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pde_p_failures, 0, "2MB page promotion failures"); static u_long pmap_pde_promotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pde_promotions, 0, "2MB page promotions"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0, "1GB page mapping counters"); static u_long pmap_pdpe_demotions; SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pdpe_demotions, 0, "1GB page demotions"); /*************************************************** * Low level helper routines..... ***************************************************/ static pt_entry_t pmap_swap_pat(pmap_t pmap, pt_entry_t entry) { int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT; switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* Verify that both PAT bits are not set at the same time */ KASSERT((entry & x86_pat_bits) != x86_pat_bits, ("Invalid PAT bits in entry %#lx", entry)); /* Swap the PAT bits if one of them is set */ if ((entry & x86_pat_bits) != 0) entry ^= x86_pat_bits; break; case PT_EPT: /* * Nothing to do - the memory attributes are represented * the same way for regular pages and superpages. */ break; default: panic("pmap_switch_pat_bits: bad pm_type %d", pmap->pm_type); } return (entry); } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= 0 && mode < PAT_INDEX_SIZE && pat_index[(int)mode] >= 0); } /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (!pmap_is_valid_memattr(pmap, mode)) panic("Unknown caching mode %d\n", mode); switch (pmap->pm_type) { case PT_X86: case PT_RVI: /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; break; case PT_EPT: cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode); break; default: panic("unsupported pmap type %d", pmap->pm_type); } return (cache_bits); } static int pmap_cache_mask(pmap_t pmap, boolean_t is_pde) { int mask; switch (pmap->pm_type) { case PT_X86: case PT_RVI: mask = is_pde ? X86_PG_PDE_CACHE : X86_PG_PTE_CACHE; break; case PT_EPT: mask = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7); break; default: panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); } return (mask); } bool pmap_ps_enabled(pmap_t pmap) { return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); } static void pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde) { switch (pmap->pm_type) { case PT_X86: break; case PT_RVI: case PT_EPT: /* * XXX * This is a little bogus since the generation number is * supposed to be bumped up when a region of the address * space is invalidated in the page tables. * * In this case the old PDE entry is valid but yet we want * to make sure that any mappings using the old entry are * invalidated in the TLB. * * The reason this works as expected is because we rendezvous * "all" host cpus and force any vcpu context to exit as a * side-effect. */ atomic_add_acq_long(&pmap->pm_eptgen, 1); break; default: panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type); } pde_store(pde, newpde); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde) { pt_entry_t PG_G; if (pmap_type_guest(pmap)) return; KASSERT(pmap->pm_type == PT_X86, ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type)); PG_G = pmap_global_bit(pmap); if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); else { /* * Promotion: flush every 4KB page mapping from the TLB, * including any global (PG_G) mappings. */ invltlb_glob(); } } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. */ /* * Interrupt the cpus that are executing in the guest context. * This will force the vcpu to exit and the cached EPT mappings * will be invalidated by the host before the next vmresume. */ static __inline void pmap_invalidate_ept(pmap_t pmap) { int ipinum; sched_pin(); KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), ("pmap_invalidate_ept: absurd pm_active")); /* * The TLB mappings associated with a vcpu context are not * flushed each time a different vcpu is chosen to execute. * * This is in contrast with a process's vtop mappings that * are flushed from the TLB on each context switch. * * Therefore we need to do more than just a TLB shootdown on * the active cpus in 'pmap->pm_active'. To do this we keep * track of the number of invalidations performed on this pmap. * * Each vcpu keeps a cache of this counter and compares it * just before a vmresume. If the counter is out-of-date an * invept will be done to flush stale mappings from the TLB. */ atomic_add_acq_long(&pmap->pm_eptgen, 1); /* * Force the vcpu to exit and trap back into the hypervisor. */ ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK; ipi_selected(pmap->pm_active, ipinum); sched_unpin(); } static cpuset_t pmap_invalidate_cpu_mask(pmap_t pmap) { return (pmap == kernel_pmap ? all_cpus : pmap->pm_active); } static inline void pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va, const bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { if (pmap->pm_ucr3 != PMAP_NO_CR3) { /* * Because pm_pcid is recalculated on a * context switch, we must disable switching. * Otherwise, we might use a stale value * below. */ critical_enter(); pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works1) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = va; invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlpg(ucr3, kcr3, va); } critical_exit(); } } else pmap->pm_pcids[cpuid].pm_gen = 0; CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } /* * The fence is between stores to pm_gen and the read of the * pm_active mask. We need to ensure that it is impossible * for us to miss the bit update in pm_active and * simultaneously observe a non-zero pm_gen in * pmap_activate_sw(), otherwise TLB update is missed. * Without the fence, IA32 allows such an outcome. Note that * pm_active is updated by a locked operation, which provides * the reciprocal fence. */ atomic_thread_fence_seq_cst(); } static void pmap_invalidate_page_pcid_invpcid(pmap_t pmap, vm_offset_t va) { pmap_invalidate_page_pcid(pmap, va, true); } static void pmap_invalidate_page_pcid_noinvpcid(pmap_t pmap, vm_offset_t va) { pmap_invalidate_page_pcid(pmap, va, false); } static void pmap_invalidate_page_nopcid(pmap_t pmap, vm_offset_t va) { } DEFINE_IFUNC(static, void, pmap_invalidate_page_mode, (pmap_t, vm_offset_t)) { if (pmap_pcid_enabled) return (invpcid_works ? pmap_invalidate_page_pcid_invpcid : pmap_invalidate_page_pcid_noinvpcid); return (pmap_invalidate_page_nopcid); } void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); sched_pin(); if (pmap == kernel_pmap) { invlpg(va); } else { if (pmap == PCPU_GET(curpmap)) invlpg(va); pmap_invalidate_page_mode(pmap, va); } smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap); sched_unpin(); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) static void pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, const bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { if (pmap->pm_ucr3 != PMAP_NO_CR3) { critical_enter(); pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works1) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = sva; for (; d.addr < eva; d.addr += PAGE_SIZE) invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); } critical_exit(); } } else pmap->pm_pcids[cpuid].pm_gen = 0; CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } /* See the comment in pmap_invalidate_page_pcid(). */ atomic_thread_fence_seq_cst(); } static void pmap_invalidate_range_pcid_invpcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_range_pcid(pmap, sva, eva, true); } static void pmap_invalidate_range_pcid_noinvpcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_range_pcid(pmap, sva, eva, false); } static void pmap_invalidate_range_nopcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { } DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t, vm_offset_t)) { if (pmap_pcid_enabled) return (invpcid_works ? pmap_invalidate_range_pcid_invpcid : pmap_invalidate_range_pcid_noinvpcid); return (pmap_invalidate_range_nopcid); } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; } if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); sched_pin(); if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } else { if (pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } pmap_invalidate_range_mode(pmap, sva, eva); } smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap); sched_unpin(); } static inline void pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; u_int cpuid, i; if (pmap == kernel_pmap) { if (invpcid_works1) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } } else { cpuid = PCPU_GET(cpuid); if (pmap == PCPU_GET(curpmap)) { critical_enter(); pcid = pmap->pm_pcids[cpuid].pm_pcid; if (invpcid_works1) { d.pcid = pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); if (pmap->pm_ucr3 != PMAP_NO_CR3) { d.pcid |= PMAP_PCID_USER_PT; invpcid(&d, INVPCID_CTX); } } else { kcr3 = pmap->pm_cr3 | pcid; ucr3 = pmap->pm_ucr3; if (ucr3 != PMAP_NO_CR3) { ucr3 |= pcid | PMAP_PCID_USER_PT; pmap_pti_pcid_invalidate(ucr3, kcr3); } else { load_cr3(kcr3); } } critical_exit(); } else pmap->pm_pcids[cpuid].pm_gen = 0; CPU_FOREACH(i) { if (cpuid != i) pmap->pm_pcids[i].pm_gen = 0; } } /* See the comment in pmap_invalidate_page_pcid(). */ atomic_thread_fence_seq_cst(); } static void pmap_invalidate_all_pcid_invpcid(pmap_t pmap) { pmap_invalidate_all_pcid(pmap, true); } static void pmap_invalidate_all_pcid_noinvpcid(pmap_t pmap) { pmap_invalidate_all_pcid(pmap, false); } static void pmap_invalidate_all_nopcid(pmap_t pmap) { if (pmap == kernel_pmap) invltlb_glob(); else if (pmap == PCPU_GET(curpmap)) invltlb(); } DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t)) { if (pmap_pcid_enabled) return (invpcid_works ? pmap_invalidate_all_pcid_invpcid : pmap_invalidate_all_pcid_noinvpcid); return (pmap_invalidate_all_nopcid); } void pmap_invalidate_all(pmap_t pmap) { if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); sched_pin(); pmap_invalidate_all_mode(pmap); smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap); sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ pmap_t pmap; vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_action(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pmap_update_pde_store(act->pmap, act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->pmap, act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. */ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap || pmap_type_guest(pmap)) active = all_cpus; else { active = pmap->pm_active; } if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pmap = pmap; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, smp_no_rendezvous_barrier, pmap_update_pde_action, pmap_update_pde_teardown, &act); } else { pmap_update_pde_store(pmap, pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(pmap, va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, invalidation functions. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { invlpg(va); if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { critical_enter(); pcid = pmap->pm_pcids[0].pm_pcid; if (invpcid_works) { d.pcid = pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = va; invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlpg(ucr3, kcr3, va); } critical_exit(); } } else if (pmap_pcid_enabled) pmap->pm_pcids[0].pm_gen = 0; } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct invpcid_descr d; vm_offset_t addr; uint64_t kcr3, ucr3; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled && pmap->pm_ucr3 != PMAP_NO_CR3) { critical_enter(); if (invpcid_works) { d.pcid = pmap->pm_pcids[0].pm_pcid | PMAP_PCID_USER_PT; d.pad = 0; d.addr = sva; for (; d.addr < eva; d.addr += PAGE_SIZE) invpcid(&d, INVPCID_ADDR); } else { kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0]. pm_pcid | CR3_PCID_SAVE; ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0]. pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); } critical_exit(); } } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } } void pmap_invalidate_all(pmap_t pmap) { struct invpcid_descr d; uint64_t kcr3, ucr3; if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) { pmap->pm_eptgen++; return; } KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: unknown type %d", pmap->pm_type)); if (pmap == kernel_pmap) { if (pmap_pcid_enabled && invpcid_works) { bzero(&d, sizeof(d)); invpcid(&d, INVPCID_CTXGLOB); } else { invltlb_glob(); } } else if (pmap == PCPU_GET(curpmap)) { if (pmap_pcid_enabled) { critical_enter(); if (invpcid_works) { d.pcid = pmap->pm_pcids[0].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); if (pmap->pm_ucr3 != PMAP_NO_CR3) { d.pcid |= PMAP_PCID_USER_PT; invpcid(&d, INVPCID_CTX); } } else { kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid; if (pmap->pm_ucr3 != PMAP_NO_CR3) { ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[ 0].pm_pcid | PMAP_PCID_USER_PT; pmap_pti_pcid_invalidate(ucr3, kcr3); } else load_cr3(kcr3); } critical_exit(); } else { invltlb(); } } else if (pmap_pcid_enabled) { pmap->pm_pcids[0].pm_gen = 0; } } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { pmap_update_pde_store(pmap, pde, newpde); if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) pmap_update_pde_invalidate(pmap, va, newpde); else pmap->pm_pcids[0].pm_gen = 0; } #endif /* !SMP */ static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { /* * When the PDE has PG_PROMOTED set, the 2MB page mapping was created * by a promotion that did not invalidate the 512 4KB page mappings * that might exist in the TLB. Consequently, at this point, the TLB * may hold both 4KB and 2MB page mappings for the address range [va, * va + NBPDR). Therefore, the entire range must be invalidated here. * In contrast, when PG_PROMOTED is clear, the TLB will not hold any * 4KB page mappings for the address range [va, va + NBPDR), and so a * single INVLPG suffices to invalidate the 2MB page mapping from the * TLB. */ if ((pde & PG_PROMOTED) != 0) pmap_invalidate_range(pmap, va, va + NBPDR - 1); else pmap_invalidate_page(pmap, va); } DEFINE_IFUNC(, void, pmap_invalidate_cache_range, (vm_offset_t sva, vm_offset_t eva)) { if ((cpu_feature & CPUID_SS) != 0) return (pmap_invalidate_cache_range_selfsnoop); if ((cpu_feature & CPUID_CLFSH) != 0) return (pmap_force_invalidate_cache_range); return (pmap_invalidate_cache_range_all); } #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) static void pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva) { KASSERT((sva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: sva not page-aligned")); KASSERT((eva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: eva not page-aligned")); } static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_cache_range_check_align(sva, eva); } void pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) { sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1); /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC range. The * local APIC is always uncached, so we don't need to flush * for that range anyway. */ if (pmap_kextract(sva) == lapic_paddr) return; if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) { /* * Do per-cache line flush. Use the sfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache * coherence domain. */ sfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); sfence(); } else { /* * Writes are ordered by CLFLUSH on Intel CPUs. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } } static void pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_cache_range_check_align(sva, eva); pmap_invalidate_cache(); } /* * Remove the specified set of pages from the data and instruction caches. * * In contrast to pmap_invalidate_cache_range(), this function does not * rely on the CPU's self-snoop feature, because it is intended for use * when moving pages into a different cache domain. */ void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { vm_offset_t daddr, eva; int i; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || ((cpu_feature & CPUID_CLFSH) == 0 && !useclflushopt)) pmap_invalidate_cache(); else { if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (i = 0; i < count; i++) { daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); eva = daddr + PAGE_SIZE; for (; daddr < eva; daddr += cpu_clflush_line_size) { if (useclflushopt) clflushopt(daddr); else clflush(daddr); } } if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } } void pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_cache_range_check_align(sva, eva); if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) == 0) { pmap_force_invalidate_cache_range(sva, eva); return; } /* See comment in pmap_force_invalidate_cache_range(). */ if (pmap_kextract(sva) == lapic_paddr) return; sfence(); for (; sva < eva; sva += cpu_clflush_line_size) clwb(sva); sfence(); } void pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr) { pt_entry_t *pte; vm_offset_t vaddr; int error, pte_bits; KASSERT((spa & PAGE_MASK) == 0, ("pmap_flush_cache_phys_range: spa not page-aligned")); KASSERT((epa & PAGE_MASK) == 0, ("pmap_flush_cache_phys_range: epa not page-aligned")); if (spa < dmaplimit) { pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN( dmaplimit, epa))); if (dmaplimit >= epa) return; spa = dmaplimit; } pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW | X86_PG_V; error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); pte = vtopte(vaddr); for (; spa < epa; spa += PAGE_SIZE) { sched_pin(); pte_store(pte, spa | pte_bits); invlpg(vaddr); /* XXXKIB sfences inside flush_cache_range are excessive */ pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE); sched_unpin(); } vmem_free(kernel_arena, vaddr, PAGE_SIZE); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte, PG_V; vm_paddr_t pa; pa = 0; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { if ((*pdpe & PG_PS) != 0) pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK); else { pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & PG_V) != 0) { if ((*pde & PG_PS) != 0) { pa = (*pde & PG_PS_FRAME) | (va & PDRMASK); } else { pte = pmap_pde_to_pte(pde, va); pa = (*pte & PG_FRAME) | (va & PAGE_MASK); } } } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde, *pdep; pt_entry_t pte, PG_RW, PG_V; vm_paddr_t pa; vm_page_t m; pa = 0; m = NULL; PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, va); if (pdep != NULL && (pde = *pdep)) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { if (vm_page_pa_tryrelock(pmap, (pde & PG_PS_FRAME) | (va & PDRMASK), &pa)) goto retry; m = PHYS_TO_VM_PAGE(pa); } } else { pte = *pmap_pde_to_pte(pdep, va); if ((pte & PG_V) && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, &pa)) goto retry; m = PHYS_TO_VM_PAGE(pa); } } if (m != NULL) vm_page_wire(m); } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t pde; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else if (LARGEMAP_MIN_ADDRESS <= va && va < PMAP_LARGEMAP_MAX_ADDRESS()) { pa = pmap_large_map_kextract(va); } else { pde = *vtopde(va); if (pde & PG_PS) { pa = (pde & PG_PS_FRAME) | (va & PDRMASK); } else { /* * Beware of a concurrent promotion that changes the * PDE at this point! For example, vtopte() must not * be used to access the PTE because it would use the * new PDE. It is, however, safe to use the old PDE * because the page table page is preserved by the * promotion. */ pa = *pmap_pde_to_pte(&pde, va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); } } return (pa); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; int cache_bits; pte = vtopte(va); cache_bits = pmap_cache_bits(kernel_pmap, mode, 0); pte_store(pte, pa | X86_PG_RW | X86_PG_V | pg_g | cache_bits); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; int cache_bits; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; cache_bits = pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); pa = VM_PAGE_TO_PHYS(m) | cache_bits; if ((*pte & (PG_FRAME | X86_PG_PTE_CACHE)) != pa) { oldpte |= *pte; pte_store(pte, pa | pg_g | pg_nx | X86_PG_RW | X86_PG_V); } pte++; } if (__predict_false((oldpte & X86_PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va)); pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "mpte" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va))); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= (NUPDE + NUPDPE)) { /* PDP page */ pml4_entry_t *pml4; pml4 = pmap_pml4e(pmap, va); *pml4 = 0; if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) { pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)]; *pml4 = 0; } } else if (m->pindex >= NUPDE) { /* PD page */ pdp_entry_t *pdp; pdp = pmap_pdpe(pmap, va); *pdp = 0; } else { /* PTE page */ pd_entry_t *pd; pd = pmap_pde(pmap, va); *pd = 0; } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUPDE) { /* We just released a PT, unhold the matching PD */ vm_page_t pdpg; pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdpg, free); } if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { /* We just released a PD, unhold the matching PDP */ vm_page_t pdppg; pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdppg, free); } /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { struct proc *p; struct thread *td; int i; PMAP_LOCK_INIT(pmap); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); pmap->pm_pml4u = NULL; pmap->pm_cr3 = KPML4phys; /* hack to keep pmap_pti_pcid_invalidate() alive */ pmap->pm_ucr3 = PMAP_NO_CR3; pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = pmap_flags; CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1; pmap->pm_pcids[i].pm_gen = 1; } pmap_activate_boot(pmap); td = curthread; if (pti) { p = td->td_proc; PROC_LOCK(p); p->p_md.md_flags |= P_MD_KPTI; PROC_UNLOCK(p); } pmap_thread_init_invl_gen(td); if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { pmap_pkru_ranges_zone = uma_zcreate("pkru ranges", sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } } void pmap_pinit_pml4(vm_page_t pml4pg) { pml4_entry_t *pm_pml4; int i; pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); /* Wire in kernel global address entries. */ for (i = 0; i < NKPML4E; i++) { pm_pml4[KPML4BASE + i] = (KPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V; } for (i = 0; i < ndmpdpphys; i++) { pm_pml4[DMPML4I + i] = (DMPDPphys + ptoa(i)) | X86_PG_RW | X86_PG_V; } /* install self-referential address mapping entry(s) */ pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; /* install large map entries if configured */ for (i = 0; i < lm_ents; i++) pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i]; } static void pmap_pinit_pml4_pti(vm_page_t pml4pg) { pml4_entry_t *pm_pml4; int i; pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); for (i = 0; i < NPML4EPG; i++) pm_pml4[i] = pti_pml4[i]; } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) { vm_page_t pml4pg, pml4pgu; vm_paddr_t pml4phys; int i; /* * allocate the page directory page */ pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); pml4phys = VM_PAGE_TO_PHYS(pml4pg); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; } pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ pmap->pm_ucr3 = PMAP_NO_CR3; pmap->pm_pml4u = NULL; pmap->pm_type = pm_type; if ((pml4pg->flags & PG_ZERO) == 0) pagezero(pmap->pm_pml4); /* * Do not install the host kernel mappings in the nested page * tables. These mappings are meaningless in the guest physical * address space. * Install minimal kernel mappings in PTI case. */ if (pm_type == PT_X86) { pmap->pm_cr3 = pml4phys; pmap_pinit_pml4(pml4pg); if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) { pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP( VM_PAGE_TO_PHYS(pml4pgu)); pmap_pinit_pml4_pti(pml4pgu); pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu); } if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { rangeset_init(&pmap->pm_pkru, pkru_dup_range, pkru_free_range, pmap, M_NOWAIT); } } pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = flags; pmap->pm_eptgen = 0; return (1); } int pmap_pinit(pmap_t pmap) { return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, pdppg, pdpg; pt_entry_t PG_A, PG_M, PG_RW, PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); PMAP_ASSERT_NOT_IN_DI(); vm_wait(NULL); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= (NUPDE + NUPDPE)) { pml4_entry_t *pml4, *pml4u; vm_pindex_t pml4index; /* Wire up a new PDPE page */ pml4index = ptepindex - (NUPDE + NUPDPE); pml4 = &pmap->pm_pml4[pml4index]; *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) { /* * PTI: Make all user-space mappings in the * kernel-mode page table no-execute so that * we detect any programming errors that leave * the kernel-mode page table active on return * to user space. */ if (pmap->pm_ucr3 != PMAP_NO_CR3) *pml4 |= pg_nx; pml4u = &pmap->pm_pml4u[pml4index]; *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } } else if (ptepindex >= NUPDE) { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; /* Wire up a new PDE page */ pdpindex = ptepindex - NUPDE; pml4index = pdpindex >> NPML4EPGSHIFT; pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pdp, recurse */ if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to pdp page */ pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); pdppg->wire_count++; } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); /* Now find the pdp page */ pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pd; /* Wire up a new PTE page */ pdpindex = ptepindex >> NPDPEPGSHIFT; pml4index = pdpindex >> NPML4EPGSHIFT; /* First, find the pdp and check that its valid. */ pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; } else { pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { /* Add reference to the pd page */ pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); pdpg->wire_count++; } } pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); /* Now we know where the page directory page is */ pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t pdpindex, ptepindex; pdp_entry_t *pdpe, PG_V; vm_page_t pdpg; PG_V = pmap_valid_bit(pmap); retry: pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { /* Add a reference to the pd page. */ pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); pdpg->wire_count++; } else { /* Allocate a pd page. */ ptepindex = pmap_pde_pindex(va); pdpindex = ptepindex >> NPDPEPGSHIFT; pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); if (pdpg == NULL && lockp != NULL) goto retry; } return (pdpg); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pd, PG_V; vm_page_t m; PG_V = pmap_valid_bit(pmap); /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); retry: /* * Get the page directory entry */ pd = pmap_pde(pmap, va); /* * This supports switching from a 2MB page to a * normal 4K page. */ if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { /* * Invalidation of the 2MB page mapping may have caused * the deallocation of the underlying PD page. */ pd = NULL; } } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (pd != NULL && (*pd & PG_V) != 0) { m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_allocpte(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); for (i = 0; i < NKPML4E; i++) /* KVA */ pmap->pm_pml4[KPML4BASE + i] = 0; for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ pmap->pm_pml4[DMPML4I + i] = 0; pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ for (i = 0; i < lm_ents; i++) /* Large Map */ pmap->pm_pml4[LMSPML4I + i] = 0; vm_page_unwire_noq(m); vm_page_free_zero(m); if (pmap->pm_pml4u != NULL) { m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u)); vm_page_unwire_noq(m); vm_page_free(m); } if (pmap->pm_type == PT_X86 && (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) rangeset_fini(&pmap->pm_pkru); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *pde, newpdir; pdp_entry_t *pdpe; mtx_assert(&kernel_map->system_mtx, MA_OWNED); /* * Return if "addr" is within the range of kernel page table pages * that were preallocated during pmap bootstrap. Moreover, leave * "kernel_vm_end" and the kernel page table as they were. * * The correctness of this action is based on the following * argument: vm_map_insert() allocates contiguous ranges of the * kernel virtual address space. It calls this function if a range * ends after "kernel_vm_end". If the kernel is mapped between * "kernel_vm_end" and "addr", then the range cannot begin at * "kernel_vm_end". In fact, its beginning address cannot be less * than the kernel. Thus, there is no immediate need to allocate * any new kernel page table pages between "kernel_vm_end" and * "KERNBASE". */ if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR) return; addr = roundup2(addr, NBPDR); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end); if ((*pdpe & X86_PG_V) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); *pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M); continue; /* try again */ } pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end); if ((*pde & X86_PG_V) != 0) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end), VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; pde_store(pde, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif static void reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) { if (pmap == NULL) return; pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); if (start_di) pmap_delayed_invl_finish(); } /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { struct pv_chunk *pc, *pc_marker, *pc_marker_end; struct pv_chunk_header pc_marker_b, pc_marker_end_b; struct md_page *pvh; pd_entry_t *pde; pmap_t next_pmap, pmap; pt_entry_t *pte, tpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed; bool start_di; static int active_reclaims = 0; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; m_pc = NULL; PG_G = PG_A = PG_M = PG_RW = 0; SLIST_INIT(&free); bzero(&pc_marker_b, sizeof(pc_marker_b)); bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); pc_marker = (struct pv_chunk *)&pc_marker_b; pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; /* * A delayed invalidation block should already be active if * pmap_advise() or pmap_remove() called this function by way * of pmap_demote_pde_locked(). */ start_di = pmap_not_in_di(); mtx_lock(&pv_chunks_mutex); active_reclaims++; TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && SLIST_EMPTY(&free)) { next_pmap = pc->pc_pmap; if (next_pmap == NULL) { /* * The next chunk is a marker. However, it is * not our marker, so active_reclaims must be * > 1. Consequently, the next_chunk code * will not rotate the pv_chunks list. */ goto next_chunk; } mtx_unlock(&pv_chunks_mutex); /* * A pv_chunk can only be removed from the pc_lru list * when both pc_chunks_mutex is owned and the * corresponding pmap is locked. */ if (pmap != next_pmap) { reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); pmap = next_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) { RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); if (start_di) pmap_delayed_invl_start(); mtx_lock(&pv_chunks_mutex); continue; } else if (pmap != locked_pmap) { if (PMAP_TRYLOCK(pmap)) { if (start_di) pmap_delayed_invl_start(); mtx_lock(&pv_chunks_mutex); continue; } else { pmap = NULL; /* pmap is not locked */ mtx_lock(&pv_chunks_mutex); pc = TAILQ_NEXT(pc_marker, pc_lru); if (pc == NULL || pc->pc_pmap != next_pmap) continue; goto next_chunk; } } else if (start_di) pmap_delayed_invl_start(); PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfq(inuse); pv = &pc->pc_pventry[field * 64 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_W) != 0) continue; tpte = pte_load_clear(pte); if ((tpte & PG_G) != 0) pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_delayed_invl_page(m); pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, *pde, &free); freed++; } } if (freed == 0) { mtx_lock(&pv_chunks_mutex); goto next_chunk; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && pc->pc_map[2] == PC_FREE2) { PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); mtx_lock(&pv_chunks_mutex); /* One freed pv entry in locked_pmap is sufficient. */ if (pmap == locked_pmap) break; next_chunk: TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); if (active_reclaims == 1 && pmap != NULL) { /* * Rotate the pv chunks list so that we do not * scan the same pv chunks that could not be * freed (because they contained a wired * and/or superpage mapping) on every * invocation of reclaim_pv_chunk(). */ while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { MPASS(pc->pc_pmap != NULL); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); } } } TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); active_reclaims--; mtx_unlock(&pv_chunks_mutex); reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); if (m_pc == NULL && !SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Returns the number of one bits within the given PV chunk map. * * The erratas for Intel processors state that "POPCNT Instruction May * Take Longer to Execute Than Expected". It is believed that the * issue is the spurious dependency on the destination register. * Provide a hint to the register rename logic that the destination * value is overwritten, by clearing it, as suggested in the * optimization manual. It should be cheap for unaffected processors * as well. * * Reference numbers for erratas are * 4th Gen Core: HSD146 * 5th Gen Core: BDM85 * 6th Gen Core: SKL029 */ static int popcnt_pc_map_pq(uint64_t *map) { u_long result, tmp; __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" : "=&r" (result), "=&r" (tmp) : "m" (map[0]), "m" (map[1]), "m" (map[2])); return (result); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; vm_page_t m; int avail, free; bool reclaimed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { #ifndef __POPCNT__ if ((cpu_feature2 & CPUID2_POPCNT) == 0) bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); else #endif free = popcnt_pc_map_pq(pc->pc_map); if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining NPTEPG - 1 pv entries. */ PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); va_last = va + NBPDR - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); } #if VM_NRESERVLEVEL > 0 /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_2mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. */ static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = pde & PG_PS_FRAME; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page * mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { struct rwlock *lock; boolean_t rv; lock = NULL; rv = pmap_demote_pde_locked(pmap, pde, va, &lock); if (lock != NULL) rw_wunlock(lock); return (rv); } static void pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused) { #ifdef INVARIANTS #ifdef DIAGNOSTIC pt_entry_t *xpte, *ypte; for (xpte = firstpte; xpte < firstpte + NPTEPG; xpte++, newpte += PAGE_SIZE) { if ((*xpte & PG_FRAME) != (newpte & PG_FRAME)) { printf("pmap_demote_pde: xpte %zd and newpte map " "different pages: found %#lx, expected %#lx\n", xpte - firstpte, *xpte, newpte); printf("page table dump\n"); for (ypte = firstpte; ypte < firstpte + NPTEPG; ypte++) printf("%zd %#lx\n", ypte - firstpte, *ypte); panic("firstpte"); } } #else KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); #endif #endif } static void pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t oldpde, struct rwlock **lockp) { struct spglist free; vm_offset_t sva; SLIST_INIT(&free); sva = trunc_2mpage(va); pmap_remove_pde(pmap, pde, sva, &free, lockp); if ((oldpde & pmap_global_bit(pmap)) == 0) pmap_invalidate_pde_page(pmap, sva, oldpde); vm_page_free_pages_toq(&free, true); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p", va, pmap); } static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V; vm_paddr_t mptepa; vm_page_t mpte; int PG_PTE_CACHE; bool in_kernel; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PG_PKU_MASK = pmap_pku_mask_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); in_kernel = va >= VM_MAXUSER_ADDRESS; oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); /* * Invalidate the 2MB page mapping and return "failure" if the * mapping was never accessed. */ if ((oldpde & PG_A) == 0) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: a wired mapping is missing PG_A")); pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); return (FALSE); } mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * If the page table page is missing and the mapping * is for a kernel address, the mapping must belong to * the direct map. Page table pages are preallocated * for every other part of the kernel address space, * so the direct map region is the only part of the * kernel address space that must be handled here. */ KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS), ("pmap_demote_pde: No saved mpte for va %#lx", va)); /* * If the 2MB page mapping belongs to the direct map * region of the kernel's address space, then the page * allocation request specifies the highest possible * priority (VM_ALLOC_INTERRUPT). Otherwise, the * priority is normal. */ mpte = vm_page_alloc(NULL, pmap_pde_pindex(va), (in_kernel ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); /* * If the allocation of the new page table page fails, * invalidate the 2MB page mapping and return "failure". */ if (mpte == NULL) { pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); return (FALSE); } if (!in_kernel) { mpte->wire_count = NPTEPG; pmap_resident_count_inc(pmap, 1); } } mptepa = VM_PAGE_TO_PHYS(mpte); firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa); newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; newpte = pmap_swap_pat(pmap, newpte); /* * If the page table page is not leftover from an earlier promotion, * initialize it. */ if (mpte->valid == 0) pmap_fill_ptp(firstpte, newpte); pmap_demote_pde_check(firstpte, newpte); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the PDE and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_pde() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldpde & PG_MANAGED) != 0) reserve_pv_entries(pmap, NPTEPG - 1, lockp); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. */ if (in_kernel) pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* * Demote the PV entry. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); atomic_add_long(&pmap_pde_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p", va, pmap); return (TRUE); } /* * pmap_remove_kernel_pde: Remove a kernel superpage mapping. */ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (mpte->valid != 0) pagezero((void *)PHYS_TO_DMAP(mptepa)); /* * Demote the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pde_store(pde, newpde); /* * Invalidate a stale recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 2mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); if (oldpde & PG_MANAGED) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_page(m); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pde: pte page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free)); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t oldpte, PG_A, PG_M, PG_RW; vm_page_t m; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_delayed_invl_page(m); } return (pmap_unuse_pt(pmap, va, ptepde, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free) { struct rwlock *lock; pt_entry_t *pte, PG_V; PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((*pde & PG_V) == 0) return; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) return; lock = NULL; pmap_remove_pte(pmap, pte, va, *pde, free, &lock); if (lock != NULL) rw_wunlock(lock); pmap_invalidate_page(pmap, va); } /* * Removes the specified range of addresses from the page table page. */ static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) { pt_entry_t PG_G, *pte; vm_offset_t va; bool anyvalid; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_G = pmap_global_bit(pmap); anyvalid = false; va = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++, sva += PAGE_SIZE) { if (*pte == 0) { if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } continue; } if ((*pte & PG_G) == 0) anyvalid = true; else if (va == eva) va = sva; if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { sva += PAGE_SIZE; break; } } if (va != eva) pmap_invalidate_range(pmap, va, sva); return (anyvalid); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t PG_G, PG_V; struct spglist free; int anyvalid; PG_G = pmap_global_bit(pmap); PG_V = pmap_valid_bit(pmap); /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); pmap_delayed_invl_start(); PMAP_LOCK(pmap); pmap_pkru_on_remove(pmap, sva, eva); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pde = pmap_pde(pmap, sva); if (pde && (*pde & PG_PS) == 0) { pmap_remove_page(pmap, sva, pde, &free); goto out; } } lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, pde, sva, &free, &lock); continue; } else if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { /* The large page mapping was destroyed. */ continue; } else ptpaddr = *pde; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) anyvalid = 1; } if (lock != NULL) rw_wunlock(lock); out: if (anyvalid) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_delayed_invl_finish(); vm_page_free_pages_toq(&free, true); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; pd_entry_t *pde; vm_offset_t va; struct spglist free; int pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry: rw_wlock(lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pde = pmap_pde(pmap, va); (void)pmap_demote_pde_locked(pmap, pde, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pmap_resident_count_dec(pmap, 1); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, *pde, &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); pmap_delayed_invl_wait(m); vm_page_free_pages_toq(&free, true); } /* * pmap_protect_pde: do the things to protect a 2mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_page_t m, mt; boolean_t anychanged; pt_entry_t PG_G, PG_M, PG_RW; PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 2mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if ((prot & VM_PROT_WRITE) == 0) { if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } newpde &= ~(PG_RW | PG_M); } if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; if (newpde != oldpde) { /* * As an optimization to future operations on this PDE, clear * PG_PROMOTED. The impending invalidation will remove any * lingering 4KB page mappings from the TLB. */ if (!atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED)) goto retry; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V; boolean_t anychanged; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); anychanged = FALSE; /* * Although this function delays and batches the invalidation * of stale TLB entries, it does not need to call * pmap_delayed_invl_start() and * pmap_delayed_invl_finish(), because it does not * ordinarily destroy mappings. Stale TLB entries from * protection-only changes need only be invalidated before the * pmap lock is released, because protection-only changes do * not destroy PV entries. Even operations that iterate over * a physical page's PV list of mappings, like * pmap_remove_write(), acquire the pmap lock for each * mapping. Consequently, for protection-only changes, the * pmap lock suffices to synchronize both page table and TLB * updates. * * This function only destroys a mapping if pmap_demote_pde() * fails. In that case, stale TLB entries are immediately * invalidated. */ PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, pde, sva, prot)) anychanged = TRUE; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { pt_entry_t obits, pbits; vm_page_t m; retry: obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; if (pbits != obits) { if (!atomic_cmpset_long(pte, obits, pbits)) goto retry; if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } #if VM_NRESERVLEVEL > 0 /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single page table page (PTP) to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, struct rwlock **lockp) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK; vm_page_t mpte; int PG_PTE_CACHE; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PG_PKU_MASK = pmap_pku_mask_bit(pmap); PG_PTE_CACHE = pmap_cache_mask(pmap, 0); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2MB page. */ firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx" " in pmap %p", (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK), pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == pmap_pde_pindex(va), ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte, true)) { atomic_add_long(&pmap_pde_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); /* * Propagate the PAT index to its proper position. */ newpde = pmap_swap_pat(pmap, newpde); /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else pde_store(pde, PG_PROMOTED | PG_PS | newpde); atomic_add_long(&pmap_pde_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx" " in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. * * When destroying both a page table and PV entry, this function * performs the TLB invalidation before releasing the PV list * lock, so we do not need pmap_delayed_invl_page() calls here. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pd_entry_t *pde; pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; int rv; boolean_t nosleep; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = trunc_page(va); KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va)); KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); KASSERT((flags & PMAP_ENTER_RESERVED) == 0, ("pmap_enter: flags %u has reserved bits set", flags)); pa = VM_PAGE_TO_PHYS(m); newpte = (pt_entry_t)(pa | PG_A | PG_V); if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if ((prot & VM_PROT_WRITE) != 0) newpte |= PG_RW; KASSERT((newpte & (PG_M | PG_RW)) != PG_M, ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if ((flags & PMAP_ENTER_WIRED) != 0) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= PG_G; newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); /* * Set modified bit gratuitously for writeable mappings if * the page is unmanaged. We do not want to take a fault * to do the dirty bit accounting for these mappings. */ if ((m->oflags & VPO_UNMANAGED) != 0) { if ((newpte & PG_RW) != 0) newpte |= PG_M; } else newpte |= PG_MANAGED; lock = NULL; PMAP_LOCK(pmap); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); goto out; } mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ retry: pde = pmap_pde(pmap, va); if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || pmap_demote_pde_locked(pmap, pde, va, &lock))) { pte = pmap_pde_to_pte(pde, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); mpte->wire_count++; } } else if (va < VM_MAXUSER_ADDRESS) { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { rv = KERN_RESOURCE_SHORTAGE; goto out; } goto retry; } else panic("pmap_enter: invalid page directory va=%#lx", va); origpte = *pte; pv = NULL; if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) newpte |= pmap_pkru_get(pmap, va); /* * Is the specified virtual address already mapped? */ if ((origpte & PG_V) != 0) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) pmap->pm_stats.wired_count++; else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ opa = origpte & PG_FRAME; if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((origpte & PG_MANAGED) != 0 && (newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) goto unchanged; goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. This ensures that all threads sharing the * pmap keep a consistent view of the mapping, which is * necessary for the correct handling of COW faults. It * also permits reuse of the old mapping's PV entry, * avoiding an allocation. * * For consistency, handle unmanaged mappings the same way. */ origpte = pte_load_clear(pte); KASSERT((origpte & PG_FRAME) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((origpte & PG_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(om); if ((origpte & PG_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); KASSERT(pv != NULL, ("pmap_enter: no PV entry for %#lx", va)); if ((newpte & PG_MANAGED) == 0) free_pv_entry(pmap, pv); if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } if ((origpte & PG_A) != 0) pmap_invalidate_page(pmap, va); origpte = 0; } else { /* * Increment the counters. */ if ((newpte & PG_W) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((newpte & PG_MANAGED) != 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Update the PTE. */ if ((origpte & PG_V) != 0) { validate: origpte = pte_load_store(pte, newpte); KASSERT((origpte & PG_FRAME) == pa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(m); /* * Although the PTE may still have PG_RW set, TLB * invalidation may nonetheless be required because * the PTE no longer has PG_M set. */ } else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { /* * This PTE change does not require TLB invalidation. */ goto unchanged; } if ((origpte & PG_A) != 0) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte); unchanged: #if VM_NRESERVLEVEL > 0 /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->wire_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va, &lock); #endif rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. Returns true * if successful. Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t newpde; pt_entry_t PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PG_V = pmap_valid_bit(pmap); newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) newpde |= PG_MANAGED; if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pd_entry_t oldpde, *pde; pt_entry_t PG_G, PG_RW, PG_V; vm_page_t mt, pdpg; KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0, ("pmap_enter_pde: cannot create wired user mapping")); PG_G = pmap_global_bit(pmap); PG_RW = pmap_rw_bit(pmap); KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW, ("pmap_enter_pde: newpde is missing PG_M")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pdpg = pmap_allocpde(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } /* * If pkru is not same for the whole pde range, return failure * and let vm_fault() cope. Check after pde allocation, since * it could sleep. */ if (!pmap_pkru_same(pmap, va, va + NBPDR)) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } return (KERN_FAILURE); } if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) { newpde &= ~X86_PG_PKU_MASK; newpde |= pmap_pkru_get(pmap, va); } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); pde = &pde[pmap_pde_index(va)]; oldpde = *pde; if ((oldpde & PG_V) != 0) { KASSERT(pdpg->wire_count > 1, ("pmap_enter_pde: pdpg's wire count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { pdpg->wire_count--; CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_FAILURE); } /* Break the existing mapping(s). */ SLIST_INIT(&free); if ((oldpde & PG_PS) != 0) { /* * The reference to the PD page that was acquired by * pmap_allocpde() ensures that it won't be freed. * However, if the PDE resulted from a promotion, then * a reserved PT page could be freed. */ (void)pmap_remove_pde(pmap, pde, va, &free, lockp); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, va, oldpde); } else { pmap_delayed_invl_start(); if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, lockp)) pmap_invalidate_all(pmap); pmap_delayed_invl_finish(); } vm_page_free_pages_toq(&free, true); if (va >= VM_MAXUSER_ADDRESS) { /* * Both pmap_remove_pde() and pmap_remove_ptes() will * leave the kernel page table page zero filled. */ mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_pde: trie insert failed"); } else KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", pde)); } if ((newpde & PG_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, pdpg, &free)) { /* * Although "va" is not mapped, paging- * structure caches could nonetheless have * entries that refer to the freed page table * pages. Invalidate those entries. */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((newpde & PG_RW) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } } /* * Increment counters. */ if ((newpde & PG_W) != 0) pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); /* * Map the superpage. (This is not a promoted mapping; there will not * be any lingering 4KB page mappings in the TLB.) */ pde_store(pde, newpde); atomic_add_long(&pmap_pde_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; pt_entry_t newpte, *pte, PG_V; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); PG_V = pmap_valid_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t ptepindex; pd_entry_t *ptepa; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (ptepa && (*ptepa & PG_V) != 0) { if (*ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); mpte->wire_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_allocpte(pmap, ptepindex, NULL); if (mpte == NULL) return (mpte); } } pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); pte = &pte[pmap_pte_index(va)]; } else { mpte = NULL; pte = vtopte(va); } if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, mpte, &free)) { /* * Although "va" is not mapped, paging- * structure caches could nonetheless have * entries that refer to the freed page table * pages. Invalidate those entries. */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); newpte = VM_PAGE_TO_PHYS(m) | PG_V | pmap_cache_bits(pmap, m->md.pat_mode, 0); if ((m->oflags & VPO_UNMANAGED) == 0) newpte |= PG_MANAGED; if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U | pmap_pkru_get(pmap, va); pte_store(pte, newpte); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa, ptepa; vm_page_t p, pdpg; int pat_mode; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!pmap_ps_enabled(pmap)) return; if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2MB pages. Since "ptepa" is 2M aligned and * "size" is a multiple of 2M, adding the PAT setting to "pa" * will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pdpg = pmap_allocpde(pmap, addr, NULL); if (pdpg == NULL) { /* * The creation of mappings below is only an * optimization. If a page directory page * cannot be allocated without blocking, * continue on to the next mapping rather than * blocking. */ addr += NBPDR; continue; } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); pde = &pde[pmap_pde_index(addr)]; if ((*pde & PG_V) == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE); atomic_add_long(&pmap_pde_mappings, 1); } else { /* Continue on if the PDE is already valid. */ pdpg->wire_count--; KASSERT(pdpg->wire_count > 0, ("pmap_object_init_pt: missing reference " "to page directory page, va: 0x%lx", addr)); } addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware * feature, so there is no need to invalidate any TLB entries. * Since pmap_demote_pde() for the wired entry must never fail, * pmap_delayed_invl_start()/finish() calls around the * function are not needed. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte, PG_V; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == va_next && eva >= va_next) { atomic_clear_long(pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. */ atomic_clear_long(pte, PG_W); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct rwlock *lock; struct spglist free; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde, srcptepaddr; pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte; vm_offset_t addr, end_addr, va_next; vm_page_t dst_pdpg, dstmpte, srcmpte; if (dst_addr != src_addr) return; if (dst_pmap->pm_type != src_pmap->pm_type) return; /* * EPT page table entries that require emulation of A/D bits are * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit * (aka EPT_PG_EXECUTE) could still be set. Since some EPT * implementations flag an EPT misconfiguration for exec-only * mappings we skip this function entirely for emulated pmaps. */ if (pmap_emulate_ad_bits(dst_pmap)) return; end_addr = src_addr + len; lock = NULL; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } PG_A = pmap_accessed_bit(dst_pmap); PG_M = pmap_modified_bit(dst_pmap); PG_V = pmap_valid_bit(dst_pmap); for (addr = src_addr; addr < end_addr; addr = va_next) { KASSERT(addr < UPT_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy page tables")); pml4e = pmap_pml4e(src_pmap, addr); if ((*pml4e & PG_V) == 0) { va_next = (addr + NBPML4) & ~PML4MASK; if (va_next < addr) va_next = end_addr; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, addr); if ((*pdpe & PG_V) == 0) { va_next = (addr + NBPDP) & ~PDPMASK; if (va_next < addr) va_next = end_addr; continue; } va_next = (addr + NBPDR) & ~PDRMASK; if (va_next < addr) va_next = end_addr; pde = pmap_pdpe_to_pde(pdpe, addr); srcptepaddr = *pde; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; dst_pdpg = pmap_allocpde(dst_pmap, addr, NULL); if (dst_pdpg == NULL) break; pde = (pd_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg)); pde = &pde[pmap_pde_index(addr)]; if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM, &lock))) { *pde = srcptepaddr & ~PG_W; pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE); atomic_add_long(&pmap_pde_mappings, 1); } else dst_pdpg->wire_count--; continue; } srcptepaddr &= PG_FRAME; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); src_pte = &src_pte[pmap_pte_index(addr)]; dstmpte = NULL; for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { ptetemp = *src_pte; /* * We only virtual copy managed pages. */ if ((ptetemp & PG_MANAGED) == 0) continue; if (dstmpte != NULL) { KASSERT(dstmpte->pindex == pmap_pde_pindex(addr), ("dstmpte pindex/addr mismatch")); dstmpte->wire_count++; } else if ((dstmpte = pmap_allocpte(dst_pmap, addr, NULL)) == NULL) goto out; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_pte_index(addr)]; if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { /* * Clear the wired, modified, and accessed * (referenced) bits during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); pmap_resident_count_inc(dst_pmap, 1); } else { SLIST_INIT(&free); if (pmap_unwire_ptp(dst_pmap, addr, dstmpte, &free)) { /* * Although "addr" is not mapped, * paging-structure caches could * nonetheless have entries that refer * to the freed page table pages. * Invalidate those entries. */ pmap_invalidate_page(dst_pmap, addr); vm_page_free_pages_toq(&free, true); } goto out; } /* Have we copied all of the valid mappings? */ if (dstmpte->wire_count >= srcmpte->wire_count) break; } } out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } int pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap) { int error; if (dst_pmap->pm_type != src_pmap->pm_type || dst_pmap->pm_type != PT_X86 || (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) return (0); for (;;) { if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } error = pmap_pkru_copy(dst_pmap, src_pmap); /* Clean up partial copy on failure due to no memory. */ if (error == ENOMEM) pmap_pkru_deassign_all(dst_pmap); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); if (error != ENOMEM) break; vm_wait(NULL); } return (error); } /* * Zero the specified hardware page. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * Zero an an area within a single hardware page. off and size must not * cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * Copy 1 specified hardware page to another. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t pages[2]; vm_offset_t vaddr[2], a_pg_offset, b_pg_offset; int cnt; boolean_t mapped; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; pages[0] = ma[a_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; pages[1] = mb[b_offset >> PAGE_SHIFT]; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); mapped = pmap_map_io_transient(pages, vaddr, 2, FALSE); a_cp = (char *)vaddr[0] + a_pg_offset; b_cp = (char *)vaddr[1] + b_pg_offset; bcopy(a_cp, b_cp, cnt); if (__predict_false(mapped)) pmap_unmap_io_transient(pages, vaddr, 2, FALSE); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pde(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 2mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { struct rwlock *lock; boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_runlock(lock); return (rv); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. * * Although this function destroys all of the pmap's managed, * non-wired mappings, it can delay and batch the invalidation of TLB * entries without calling pmap_delayed_invl_start() and * pmap_delayed_invl_finish(). Because the pmap is not active on * any other processor, none of these TLB entries will ever be used * before their eventual invalidation. Consequently, there is no need * for either pmap_remove_all() or pmap_remove_write() to wait for * that eventual TLB invalidation. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t ptepde; pt_entry_t *pte, tpte; pt_entry_t PG_M, PG_RW, PG_V; struct spglist free; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx; boolean_t superpage; vm_paddr_t pa; /* * Assert that the given pmap is only active on the current * CPU. Unfortunately, we cannot block another CPU from * activating the pmap while this function is executing. */ KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap)); #ifdef INVARIANTS { cpuset_t other_cpus; other_cpus = all_cpus; critical_enter(); CPU_CLR(PCPU_GET(cpuid), &other_cpus); CPU_AND(&other_cpus, &pmap->pm_active); critical_exit(); KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap)); } #endif lock = NULL; PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfq(inuse); bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pdpe(pmap, pv->pv_va); ptepde = *pte; pte = pmap_pdpe_to_pde(pte, pv->pv_va); tpte = *pte; if ((tpte & (PG_PS | PG_V)) == PG_V) { superpage = FALSE; ptepde = tpte; pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & PG_FRAME); pte = &pte[pmap_pte_index(pv->pv_va)]; tpte = *pte; } else { /* * Keep track whether 'tpte' is a * superpage explicitly instead of * relying on PG_PS being set. * * This is because PG_PS is numerically * identical to PG_PTE_PAT and thus a * regular page could be mistaken for * a superpage. */ superpage = TRUE; } if ((tpte & PG_V) == 0) { panic("bad pte va %lx pte %lx", pv->pv_va, tpte); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } if (superpage) pa = tpte & PG_PS_FRAME; else pa = tpte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (superpage) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; if (superpage) { pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if ((mt->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: pte page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); } } else { pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((m->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } if (lock != NULL) rw_wunlock(lock); pmap_invalidate_all(pmap); pmap_pkru_deassign_all(pmap); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask; pt_entry_t PG_A, PG_M, PG_RW, PG_V; pmap_t pmap; int md_gen, pvh_gen; boolean_t rv; rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va); mask = 0; if (modified) { PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); mask |= PG_RW | PG_M; } if (accessed) { PG_A = pmap_accessed_bit(pmap); PG_V = pmap_valid_bit(pmap); mask |= PG_V | PG_A; } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pde(pmap, pv->pv_va); mask = 0; if (modified) { PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); mask |= PG_RW | PG_M; } if (accessed) { PG_A = pmap_accessed_bit(pmap); PG_V = pmap_valid_bit(pmap); mask |= PG_V | PG_A; } rv = (*pte & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte, PG_V; boolean_t rv; PG_V = pmap_valid_bit(pmap); rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != NULL && (*pde & (PG_PS | PG_V)) == PG_V) { pte = pmap_pde_to_pte(pde, addr); rv = (*pte & PG_V) == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pd_entry_t *pde; pt_entry_t oldpte, *pte, PG_M, PG_RW; vm_offset_t va; int pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde_locked(pmap, pde, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); retry: oldpte = *pte; if (oldpte & PG_RW) { if (!atomic_cmpset_long(pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_wait(m); } static __inline boolean_t safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte) { if (!pmap_emulate_ad_bits(pmap)) return (TRUE); KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type)); /* * XWR = 010 or 110 will cause an unconditional EPT misconfiguration * so we don't let the referenced (aka EPT_PG_READ) bit to be cleared * if the EPT_PG_WRITE bit is set. */ if ((pte & EPT_PG_WRITE) != 0) return (FALSE); /* * XWR = 100 is allowed only if the PMAP_SUPPORTS_EXEC_ONLY is set. */ if ((pte & EPT_PG_EXECUTE) == 0 || ((pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) != 0)) return (TRUE); else return (FALSE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). * * A DI block is not needed within this function, because * invalidations are performed before the PV list lock is * released. */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW; vm_offset_t va; vm_paddr_t pa; int cleared, md_gen, not_cleared, pvh_gen; struct spglist free; boolean_t demoted; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, pv->pv_va); oldpde = *pde; if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "oldpde" is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((oldpde & PG_A) != 0) { /* * Since this reference bit is shared by 512 4KB * pages, it should not be cleared every time it is * tested. Apply a simple "hash" function on the * physical page number, the virtual superpage number, * and the pmap address to select one 4KB page out of * the 512 on which testing the reference bit will * result in clearing that reference bit. This * function is designed to avoid the selection of the * same 4KB page for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (oldpde & PG_W) == 0) { if (safe_to_clear_referenced(pmap, oldpde)) { atomic_clear_long(pde, PG_A); pmap_invalidate_page(pmap, pv->pv_va); demoted = FALSE; } else if (pmap_demote_pde_locked(pmap, pde, pv->pv_va, &lock)) { /* * Remove the mapping to a single page * so that a subsequent access may * repromote. Since the underlying * page table page is fully populated, * this removal never frees a page * table page. */ demoted = TRUE; va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); pmap_remove_pte(pmap, pte, va, *pde, NULL, &lock); pmap_invalidate_page(pmap, va); } else demoted = TRUE; if (demoted) { /* * The superpage mapping was removed * entirely and therefore 'pv' is no * longer valid. */ if (pvf == pv) pvf = NULL; pv = NULL; } cleared++; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { if (safe_to_clear_referenced(pmap, *pte)) { atomic_clear_long(pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else if ((*pte & PG_W) == 0) { /* * Wired pages cannot be paged out so * doing accessed bit emulation for * them is wasted effort. We do the * hard work for unwired pages only. */ pmap_remove_pte(pmap, pte, pv->pv_va, *pde, &free, &lock); pmap_invalidate_page(pmap, pv->pv_va); cleared++; if (pvf == pv) pvf = NULL; pv = NULL; KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); vm_page_free_pages_toq(&free, true); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { struct rwlock *lock; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V; vm_offset_t va, va_next; vm_page_t m; boolean_t anychanged; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; /* * A/D bit emulation requires an alternate code path when clearing * the modified and accessed bits below. Since this function is * advisory in nature we skip it entirely for pmaps that require * A/D bit emulation. */ if (pmap_emulate_ad_bits(pmap)) return; PG_A = pmap_accessed_bit(pmap); PG_G = pmap_global_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); anychanged = FALSE; pmap_delayed_invl_start(); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; if (va_next < sva) va_next = eva; continue; } va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; lock = NULL; if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { if (lock != NULL) rw_wunlock(lock); /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Since the underlying page * table page is fully populated, this removal never * frees a page table page. */ if ((oldpde & PG_W) == 0) { pte = pmap_pde_to_pte(pde, sva); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, sva, *pde, NULL, &lock); anychanged = TRUE; } if (lock != NULL) rw_wunlock(lock); } if (va_next > eva) va_next = eva; va = va_next; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_long(pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_long(pte, PG_A); else goto maybe_invlrng; if ((*pte & PG_G) != 0) { if (va == va_next) va = sva; } else anychanged = TRUE; continue; maybe_invlrng: if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); pmap_delayed_invl_finish(); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t oldpde, *pde; - pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V; + pt_entry_t *pte, PG_M, PG_RW; struct rwlock *lock; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } PG_M = pmap_modified_bit(pmap); - PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); oldpde = *pde; - if ((oldpde & PG_RW) != 0) { - if (pmap_demote_pde_locked(pmap, pde, va, &lock)) { - if ((oldpde & PG_W) == 0) { - /* - * Write protect the mapping to a - * single page so that a subsequent - * write access may repromote. - */ - va += VM_PAGE_TO_PHYS(m) - (oldpde & - PG_PS_FRAME); - pte = pmap_pde_to_pte(pde, va); - oldpte = *pte; - if ((oldpte & PG_V) != 0) { - while (!atomic_cmpset_long(pte, - oldpte, - oldpte & ~(PG_M | PG_RW))) - oldpte = *pte; - vm_page_dirty(m); - pmap_invalidate_page(pmap, va); - } - } - } + /* If oldpde has PG_RW set, then it also has PG_M set. */ + if ((oldpde & PG_RW) != 0 && + pmap_demote_pde_locked(pmap, pde, va, &lock) && + (oldpde & PG_W) == 0) { + /* + * Write protect the mapping to a single page so that + * a subsequent write access may repromote. + */ + va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); + pte = pmap_pde_to_pte(pde, va); + atomic_clear_long(pte, PG_M | PG_RW); + vm_page_dirty(m); + pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } PG_M = pmap_modified_bit(pmap); PG_RW = pmap_rw_bit(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 2mpage in page %p's pv list", m)); pte = pmap_pde_to_pte(pde, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { atomic_clear_long(pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask) { u_int opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~mask; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2MB page mapped via a PDE. */ static __inline void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask) { u_int opde, npde; /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~mask; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ static void * pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, bool noflush) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = trunc_page(pa); if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && ppim->mode == mode) return ((void *)(ppim->va + offset)); } /* * If the specified range of physical addresses fits within * the direct map window, use the direct map. */ if (pa < dmaplimit && pa + size <= dmaplimit) { va = PHYS_TO_DMAP(pa); PMAP_LOCK(kernel_pmap); i = pmap_change_attr_locked(va, size, mode, noflush); PMAP_UNLOCK(kernel_pmap); if (!i) return ((void *)(va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); if (!noflush) pmap_invalidate_cache_range(va, va + tmpsize); return ((void *)(va + offset)); } void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { return (pmap_mapdev_internal(pa, size, mode, false)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, false)); } void * pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, true)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, false)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; /* If we gave a direct map region in pmap_mapdev, do nothing */ if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) kva_free(va, size); } /* * Tries to demote a 1GB page mapping. */ static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va) { pdp_entry_t newpdpe, oldpdpe; pd_entry_t *firstpde, newpde, *pde; pt_entry_t PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pdpgpa; vm_page_t pdpg; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpdpe = *pdpe; KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V")); if ((pdpg = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } pdpgpa = VM_PAGE_TO_PHYS(pdpg); firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa); newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V; KASSERT((oldpdpe & PG_A) != 0, ("pmap_demote_pdpe: oldpdpe is missing PG_A")); KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pdpe: oldpdpe is missing PG_M")); newpde = oldpdpe; /* * Initialize the page directory page. */ for (pde = firstpde; pde < firstpde + NPDEPG; pde++) { *pde = newpde; newpde += NBPDR; } /* * Demote the mapping. */ *pdpe = newpdpe; /* * Invalidate a stale recursive mapping of the page directory page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va)); pmap_pdpe_demotions++; CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.pat_mode)) panic("memory attribute change on the direct map failed"); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the direct map or the kernel map. If * the virtual address range is contained within the kernel map, then the * memory type for each of the corresponding ranges of the direct map is also * changed. (The corresponding ranges of the direct map are those ranges that * map the same physical pages as the specified virtual address range.) These * changes to the direct map are necessary because Intel describes the * behavior of their processors as "undefined" if two or more mappings to the * same physical page have different memory types. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. In the * latter case, the memory type may have been changed on some part of the * virtual address range or the direct map. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_attr_locked(va, size, mode, false); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool noflush) { vm_offset_t base, offset, tmpva; vm_paddr_t pa_start, pa_end, pa_end1; pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte; int cache_bits_pte, cache_bits_pde, error; boolean_t changed; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses, including the direct * map but excluding the recursive map. */ if (base < DMAP_MIN_ADDRESS) return (EINVAL); cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down 2MB pages * into 4KB pages if required. */ for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (pdpe == NULL || *pdpe == 0) return (EINVAL); if (*pdpe & PG_PS) { /* * If the current 1GB page already has the required * memory type, then we need not demote this page. Just * increment tmpva to the next 1GB page frame. */ if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } /* * If the current offset aligns with a 1GB page frame * and there is at least 1GB left within the range, then * we need not break down this page into 2MB pages. */ if ((tmpva & PDPMASK) == 0 && tmpva + PDPMASK < base + size) { tmpva += NBPDP; continue; } if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva)) return (ENOMEM); } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde == 0) return (EINVAL); if (*pde & PG_PS) { /* * If the current 2MB page already has the required * memory type, then we need not demote this page. Just * increment tmpva to the next 2MB page frame. */ if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_2mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2MB page frame * and there is at least 2MB left within the range, then * we need not break down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) return (ENOMEM); } pte = pmap_pde_to_pte(pde, tmpva); if (*pte == 0) return (EINVAL); tmpva += PAGE_SIZE; } error = 0; /* * Ok, all the pages exist, so run through them updating their * cache mode if required. */ pa_start = pa_end = 0; for (tmpva = base; tmpva < base + size; ) { pdpe = pmap_pdpe(kernel_pmap, tmpva); if (*pdpe & PG_PS) { if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pdpe, cache_bits_pde, X86_PG_PDE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pdpe & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pdpe & PG_PS_FRAME; pa_end = pa_start + NBPDP; } else if (pa_end == (*pdpe & PG_PS_FRAME)) pa_end += NBPDP; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode, noflush); if (error != 0) break; /* Start physical address run. */ pa_start = *pdpe & PG_PS_FRAME; pa_end = pa_start + NBPDP; } } tmpva = trunc_1gpage(tmpva) + NBPDP; continue; } pde = pmap_pdpe_to_pde(pdpe, tmpva); if (*pde & PG_PS) { if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde, X86_PG_PDE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pde & PG_PS_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pde & PG_PS_FRAME; pa_end = pa_start + NBPDR; } else if (pa_end == (*pde & PG_PS_FRAME)) pa_end += NBPDR; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode, noflush); if (error != 0) break; /* Start physical address run. */ pa_start = *pde & PG_PS_FRAME; pa_end = pa_start + NBPDR; } } tmpva = trunc_2mpage(tmpva) + NBPDR; } else { pte = pmap_pde_to_pte(pde, tmpva); if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte, X86_PG_PTE_CACHE); changed = TRUE; } if (tmpva >= VM_MIN_KERNEL_ADDRESS && (*pte & PG_FRAME) < dmaplimit) { if (pa_start == pa_end) { /* Start physical address run. */ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } else if (pa_end == (*pte & PG_FRAME)) pa_end += PAGE_SIZE; else { /* Run ended, update direct map. */ error = pmap_change_attr_locked( PHYS_TO_DMAP(pa_start), pa_end - pa_start, mode, noflush); if (error != 0) break; /* Start physical address run. */ pa_start = *pte & PG_FRAME; pa_end = pa_start + PAGE_SIZE; } } tmpva += PAGE_SIZE; } } if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) { pa_end1 = MIN(pa_end, dmaplimit); if (pa_start != pa_end1) error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start), pa_end1 - pa_start, mode, noflush); } /* * Flush CPU caches if required to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); if (!noflush) pmap_invalidate_cache_range(base, tmpva); } return (error); } /* * Demotes any mapping within the direct map region that covers more than the * specified range of physical addresses. This range's size must be a power * of two and its starting address must be a multiple of its size. Since the * demotion does not change any attributes of the mapping, a TLB invalidation * is not mandatory. The caller may, however, request a TLB invalidation. */ void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate) { pdp_entry_t *pdpe; pd_entry_t *pde; vm_offset_t va; boolean_t changed; if (len == 0) return; KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2")); KASSERT((base & (len - 1)) == 0, ("pmap_demote_DMAP: base is not a multiple of len")); if (len < NBPDP && base < dmaplimit) { va = PHYS_TO_DMAP(base); changed = FALSE; PMAP_LOCK(kernel_pmap); pdpe = pmap_pdpe(kernel_pmap, va); if ((*pdpe & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDPE"); if ((*pdpe & PG_PS) != 0) { if (!pmap_demote_pdpe(kernel_pmap, pdpe, va)) panic("pmap_demote_DMAP: PDPE failed"); changed = TRUE; } if (len < NBPDR) { pde = pmap_pdpe_to_pde(pdpe, va); if ((*pde & X86_PG_V) == 0) panic("pmap_demote_DMAP: invalid PDE"); if ((*pde & PG_PS) != 0) { if (!pmap_demote_pde(kernel_pmap, pde, va)) panic("pmap_demote_DMAP: PDE failed"); changed = TRUE; } } if (changed && invalidate) pmap_invalidate_page(kernel_pmap, va); PMAP_UNLOCK(kernel_pmap); } } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *pdep; pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V; vm_paddr_t pa; int val; PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, addr); if (pdep != NULL && (*pdep & PG_V)) { if (*pdep & PG_PS) { pte = *pdep; /* Compute the physical address of the 4KB page. */ pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_SUPER; } else { pte = *pmap_pde_to_pte(pdep, addr); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } static uint64_t pmap_pcid_alloc(pmap_t pmap, u_int cpuid) { uint32_t gen, new_gen, pcid_next; CRITICAL_ASSERT(curthread); gen = PCPU_GET(pcid_gen); if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN) return (pti ? 0 : CR3_PCID_SAVE); if (pmap->pm_pcids[cpuid].pm_gen == gen) return (CR3_PCID_SAVE); pcid_next = PCPU_GET(pcid_next); KASSERT((!pti && pcid_next <= PMAP_PCID_OVERMAX) || (pti && pcid_next <= PMAP_PCID_OVERMAX_KERN), ("cpu %d pcid_next %#x", cpuid, pcid_next)); if ((!pti && pcid_next == PMAP_PCID_OVERMAX) || (pti && pcid_next == PMAP_PCID_OVERMAX_KERN)) { new_gen = gen + 1; if (new_gen == 0) new_gen = 1; PCPU_SET(pcid_gen, new_gen); pcid_next = PMAP_PCID_KERN + 1; } else { new_gen = gen; } pmap->pm_pcids[cpuid].pm_pcid = pcid_next; pmap->pm_pcids[cpuid].pm_gen = new_gen; PCPU_SET(pcid_next, pcid_next + 1); return (0); } static uint64_t pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid) { uint64_t cached; cached = pmap_pcid_alloc(pmap, cpuid); KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX, ("pmap %p cpu %d pcid %#x", pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN || pmap == kernel_pmap, ("non-kernel pmap pmap %p cpu %d pcid %#x", pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid)); return (cached); } static void pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap) { PCPU_GET(tssp)->tss_rsp0 = pmap->pm_ucr3 != PMAP_NO_CR3 ? PCPU_GET(pti_rsp0) : (uintptr_t)td->td_pcb; } static void inline pmap_activate_sw_pcid_pti(pmap_t pmap, u_int cpuid, const bool invpcid_works1) { struct invpcid_descr d; uint64_t cached, cr3, kcr3, ucr3; cached = pmap_pcid_alloc_checked(pmap, cpuid); cr3 = rcr3(); if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid); PCPU_SET(curpmap, pmap); kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid; ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid | PMAP_PCID_USER_PT; if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) { /* * Explicitly invalidate translations cached from the * user page table. They are not automatically * flushed by reload of cr3 with the kernel page table * pointer above. * * Note that the if() condition is resolved statically * by using the function argument instead of * runtime-evaluated invpcid_works value. */ if (invpcid_works1) { d.pcid = PMAP_PCID_USER_PT | pmap->pm_pcids[cpuid].pm_pcid; d.pad = 0; d.addr = 0; invpcid(&d, INVPCID_CTX); } else { pmap_pti_pcid_invalidate(ucr3, kcr3); } } PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE); PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE); if (cached) PCPU_INC(pm_save_cnt); } static void pmap_activate_sw_pcid_invpcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) { pmap_activate_sw_pcid_pti(pmap, cpuid, true); pmap_activate_sw_pti_post(td, pmap); } static void pmap_activate_sw_pcid_noinvpcid_pti(struct thread *td, pmap_t pmap, u_int cpuid) { register_t rflags; /* * If the INVPCID instruction is not available, * invltlb_pcid_handler() is used to handle an invalidate_all * IPI, which checks for curpmap == smp_tlb_pmap. The below * sequence of operations has a window where %CR3 is loaded * with the new pmap's PML4 address, but the curpmap value has * not yet been updated. This causes the invltlb IPI handler, * which is called between the updates, to execute as a NOP, * which leaves stale TLB entries. * * Note that the most typical use of pmap_activate_sw(), from * the context switch, is immune to this race, because * interrupts are disabled (while the thread lock is owned), * and the IPI happens after curpmap is updated. Protect * other callers in a similar way, by disabling interrupts * around the %cr3 register reload and curpmap assignment. */ rflags = intr_disable(); pmap_activate_sw_pcid_pti(pmap, cpuid, false); intr_restore(rflags); pmap_activate_sw_pti_post(td, pmap); } static void pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap, u_int cpuid) { uint64_t cached, cr3; cached = pmap_pcid_alloc_checked(pmap, cpuid); cr3 = rcr3(); if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid | cached); PCPU_SET(curpmap, pmap); if (cached) PCPU_INC(pm_save_cnt); } static void pmap_activate_sw_pcid_noinvpcid_nopti(struct thread *td __unused, pmap_t pmap, u_int cpuid) { register_t rflags; rflags = intr_disable(); pmap_activate_sw_pcid_nopti(td, pmap, cpuid); intr_restore(rflags); } static void pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap, u_int cpuid __unused) { load_cr3(pmap->pm_cr3); PCPU_SET(curpmap, pmap); } static void pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap, u_int cpuid __unused) { pmap_activate_sw_nopcid_nopti(td, pmap, cpuid); PCPU_SET(kcr3, pmap->pm_cr3); PCPU_SET(ucr3, pmap->pm_ucr3); pmap_activate_sw_pti_post(td, pmap); } DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t, u_int)) { if (pmap_pcid_enabled && pti && invpcid_works) return (pmap_activate_sw_pcid_invpcid_pti); else if (pmap_pcid_enabled && pti && !invpcid_works) return (pmap_activate_sw_pcid_noinvpcid_pti); else if (pmap_pcid_enabled && !pti && invpcid_works) return (pmap_activate_sw_pcid_nopti); else if (pmap_pcid_enabled && !pti && !invpcid_works) return (pmap_activate_sw_pcid_noinvpcid_nopti); else if (!pmap_pcid_enabled && pti) return (pmap_activate_sw_nopcid_pti); else /* if (!pmap_pcid_enabled && !pti) */ return (pmap_activate_sw_nopcid_nopti); } void pmap_activate_sw(struct thread *td) { pmap_t oldpmap, pmap; u_int cpuid; oldpmap = PCPU_GET(curpmap); pmap = vmspace_pmap(td->td_proc->p_vmspace); if (oldpmap == pmap) return; cpuid = PCPU_GET(cpuid); #ifdef SMP CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_SET(cpuid, &pmap->pm_active); #endif pmap_activate_sw_mode(td, pmap, cpuid); #ifdef SMP CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); #endif } void pmap_activate(struct thread *td) { critical_enter(); pmap_activate_sw(td); critical_exit(); } void pmap_activate_boot(pmap_t pmap) { uint64_t kcr3; u_int cpuid; /* * kernel_pmap must be never deactivated, and we ensure that * by never activating it at all. */ MPASS(pmap != kernel_pmap); cpuid = PCPU_GET(cpuid); #ifdef SMP CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_SET(cpuid, &pmap->pm_active); #endif PCPU_SET(curpmap, pmap); if (pti) { kcr3 = pmap->pm_cr3; if (pmap_pcid_enabled) kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE; } else { kcr3 = PMAP_NO_CR3; } PCPU_SET(kcr3, kcr3); PCPU_SET(ucr3, PMAP_NO_CR3); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } #ifdef INVARIANTS static unsigned long num_dirty_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW, &num_dirty_emulations, 0, NULL); static unsigned long num_accessed_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW, &num_accessed_emulations, 0, NULL); static unsigned long num_superpage_accessed_emulations; SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW, &num_superpage_accessed_emulations, 0, NULL); static unsigned long ad_emulation_superpage_promotions; SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW, &ad_emulation_superpage_promotions, 0, NULL); #endif /* INVARIANTS */ int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) { int rv; struct rwlock *lock; #if VM_NRESERVLEVEL > 0 vm_page_t m, mpte; #endif pd_entry_t *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V; KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE, ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype)); if (!pmap_emulate_ad_bits(pmap)) return (-1); PG_A = pmap_accessed_bit(pmap); PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); rv = -1; lock = NULL; PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) goto done; if ((*pde & PG_PS) != 0) { if (ftype == VM_PROT_READ) { #ifdef INVARIANTS atomic_add_long(&num_superpage_accessed_emulations, 1); #endif *pde |= PG_A; rv = 0; } goto done; } pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) goto done; if (ftype == VM_PROT_WRITE) { if ((*pte & PG_RW) == 0) goto done; /* * Set the modified and accessed bits simultaneously. * * Intel EPT PTEs that do software emulation of A/D bits map * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively. * An EPT misconfiguration is triggered if the PTE is writable * but not readable (WR=10). This is avoided by setting PG_A * and PG_M simultaneously. */ *pte |= PG_M | PG_A; } else { *pte |= PG_A; } #if VM_NRESERVLEVEL > 0 /* try to promote the mapping */ if (va < VM_MAXUSER_ADDRESS) mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); else mpte = NULL; m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); if ((mpte == NULL || mpte->wire_count == NPTEPG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { pmap_promote_pde(pmap, pde, va, &lock); #ifdef INVARIANTS atomic_add_long(&ad_emulation_superpage_promotions, 1); #endif } #endif #ifdef INVARIANTS if (ftype == VM_PROT_WRITE) atomic_add_long(&num_dirty_emulations, 1); else atomic_add_long(&num_accessed_emulations, 1); #endif rv = 0; /* success */ done: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num) { pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; pt_entry_t *pte, PG_V; int idx; idx = 0; PG_V = pmap_valid_bit(pmap); PMAP_LOCK(pmap); pml4 = pmap_pml4e(pmap, va); ptr[idx++] = *pml4; if ((*pml4 & PG_V) == 0) goto done; pdp = pmap_pml4e_to_pdpe(pml4, va); ptr[idx++] = *pdp; if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) goto done; pde = pmap_pdpe_to_pde(pdp, va); ptr[idx++] = *pde; if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) goto done; pte = pmap_pde_to_pte(pde, va); ptr[idx++] = *pte; done: PMAP_UNLOCK(pmap); *num = idx; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; pt_entry_t *pte; int cache_bits, error __unused, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(paddr >= dmaplimit)) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); /* * NB: The sequence of updating a page table followed by accesses * to the corresponding pages used in the !DMAP case is subject to * the situation described in the "AMD64 Architecture Programmer's * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special * Coherency Considerations". Therefore, issuing the INVLPG right * after modifying the PTE bits is crucial. */ if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= dmaplimit) { if (can_fault) { /* * Slow path, since we can get page faults * while mappings are active don't pin the * thread to the CPU and instead add a global * mapping visible to all CPUs. */ pmap_qenter(vaddr[i], &page[i], 1); } else { pte = vtopte(vaddr[i]); cache_bits = pmap_cache_bits(kernel_pmap, page[i]->md.pat_mode, 0); pte_store(pte, paddr | X86_PG_RW | X86_PG_V | cache_bits); invlpg(vaddr[i]); } } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= dmaplimit) { if (can_fault) pmap_qremove(vaddr[i], 1); vmem_free(kernel_arena, vaddr[i], PAGE_SIZE); } } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { vm_paddr_t paddr; paddr = VM_PAGE_TO_PHYS(m); if (paddr < dmaplimit) return (PHYS_TO_DMAP(paddr)); mtx_lock_spin(&qframe_mtx); KASSERT(*vtopte(qframe) == 0, ("qframe busy")); pte_store(vtopte(qframe), paddr | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0)); return (qframe); } void pmap_quick_remove_page(vm_offset_t addr) { if (addr != qframe) return; pte_store(vtopte(qframe), 0); invlpg(qframe); mtx_unlock_spin(&qframe_mtx); } /* * Pdp pages from the large map are managed differently from either * kernel or user page table pages. They are permanently allocated at * initialization time, and their wire count is permanently set to * zero. The pml4 entries pointing to those pages are copied into * each allocated pmap. * * In contrast, pd and pt pages are managed like user page table * pages. They are dynamically allocated, and their wire count * represents the number of valid entries within the page. */ static vm_page_t pmap_large_map_getptp_unlocked(void) { vm_page_t m; m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_ZERO); if (m != NULL && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } static vm_page_t pmap_large_map_getptp(void) { vm_page_t m; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); m = pmap_large_map_getptp_unlocked(); if (m == NULL) { PMAP_UNLOCK(kernel_pmap); vm_wait(NULL); PMAP_LOCK(kernel_pmap); /* Callers retry. */ } return (m); } static pdp_entry_t * pmap_large_map_pdpe(vm_offset_t va) { vm_pindex_t pml4_idx; vm_paddr_t mphys; pml4_idx = pmap_pml4e_index(va); KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents, ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I " "%#jx lm_ents %d", (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); KASSERT((kernel_pmap->pm_pml4[pml4_idx] & X86_PG_V) != 0, ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx " "LMSPML4I %#jx lm_ents %d", (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents)); mphys = kernel_pmap->pm_pml4[pml4_idx] & PG_FRAME; return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va)); } static pd_entry_t * pmap_large_map_pde(vm_offset_t va) { pdp_entry_t *pdpe; vm_page_t m; vm_paddr_t mphys; retry: pdpe = pmap_large_map_pdpe(va); if (*pdpe == 0) { m = pmap_large_map_getptp(); if (m == NULL) goto retry; mphys = VM_PAGE_TO_PHYS(m); *pdpe = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; } else { MPASS((*pdpe & X86_PG_PS) == 0); mphys = *pdpe & PG_FRAME; } return ((pd_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pde_index(va)); } static pt_entry_t * pmap_large_map_pte(vm_offset_t va) { pd_entry_t *pde; vm_page_t m; vm_paddr_t mphys; retry: pde = pmap_large_map_pde(va); if (*pde == 0) { m = pmap_large_map_getptp(); if (m == NULL) goto retry; mphys = VM_PAGE_TO_PHYS(m); *pde = mphys | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx; PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->wire_count++; } else { MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & PG_FRAME; } return ((pt_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pte_index(va)); } static vm_paddr_t pmap_large_map_kextract(vm_offset_t va) { pdp_entry_t *pdpe, pdp; pd_entry_t *pde, pd; pt_entry_t *pte, pt; KASSERT(LARGEMAP_MIN_ADDRESS <= va && va < PMAP_LARGEMAP_MAX_ADDRESS(), ("not largemap range %#lx", (u_long)va)); pdpe = pmap_large_map_pdpe(va); pdp = *pdpe; KASSERT((pdp & X86_PG_V) != 0, ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); if ((pdp & X86_PG_PS) != 0) { KASSERT((amd_feature & AMDID_PAGE1GB) != 0, ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK)); } pde = pmap_pdpe_to_pde(pdpe, va); pd = *pde; KASSERT((pd & X86_PG_V) != 0, ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); if ((pd & X86_PG_PS) != 0) return ((pd & PG_PS_FRAME) | (va & PDRMASK)); pte = pmap_pde_to_pte(pde, va); pt = *pte; KASSERT((pt & X86_PG_V) != 0, ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt)); return ((pt & PG_FRAME) | (va & PAGE_MASK)); } static int pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase, vmem_addr_t *vmem_res) { /* * Large mappings are all but static. Consequently, there * is no point in waiting for an earlier allocation to be * freed. */ return (vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res)); } int pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr, vm_memattr_t mattr) { pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte; vm_offset_t va, inc; vmem_addr_t vmem_res; vm_paddr_t pa; int error; if (len == 0 || spa + len < spa) return (EINVAL); /* See if DMAP can serve. */ if (spa + len <= dmaplimit) { va = PHYS_TO_DMAP(spa); *addr = (void *)va; return (pmap_change_attr(va, len, mattr)); } /* * No, allocate KVA. Fit the address with best possible * alignment for superpages. Fall back to worse align if * failed. */ error = ENOMEM; if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len, NBPDP) >= roundup2(spa, NBPDP) + NBPDP) error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK, &vmem_res); if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa, NBPDR) + NBPDR) error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK, &vmem_res); if (error != 0) error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res); if (error != 0) return (error); /* * Fill pagetable. PG_M is not pre-set, we scan modified bits * in the pagetable to minimize flushing. No need to * invalidate TLB, since we only update invalid entries. */ PMAP_LOCK(kernel_pmap); for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc, len -= inc) { if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP && (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) { pdpe = pmap_large_map_pdpe(va); MPASS(*pdpe == 0); *pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, TRUE); inc = NBPDP; } else if (len >= NBPDR && (pa & PDRMASK) == 0 && (va & PDRMASK) == 0) { pde = pmap_large_map_pde(va); MPASS(*pde == 0); *pde = pa | pg_g | X86_PG_PS | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, TRUE); PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))-> wire_count++; inc = NBPDR; } else { pte = pmap_large_map_pte(va); MPASS(*pte == 0); *pte = pa | pg_g | X86_PG_RW | X86_PG_V | X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap, mattr, FALSE); PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))-> wire_count++; inc = PAGE_SIZE; } } PMAP_UNLOCK(kernel_pmap); MPASS(len == 0); *addr = (void *)vmem_res; return (0); } void pmap_large_unmap(void *svaa, vm_size_t len) { vm_offset_t sva, va; vm_size_t inc; pdp_entry_t *pdpe, pdp; pd_entry_t *pde, pd; pt_entry_t *pte; vm_page_t m; struct spglist spgf; sva = (vm_offset_t)svaa; if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS && sva + len <= DMAP_MIN_ADDRESS + dmaplimit)) return; SLIST_INIT(&spgf); KASSERT(LARGEMAP_MIN_ADDRESS <= sva && sva + len <= PMAP_LARGEMAP_MAX_ADDRESS(), ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len)); PMAP_LOCK(kernel_pmap); for (va = sva; va < sva + len; va += inc) { pdpe = pmap_large_map_pdpe(va); pdp = *pdpe; KASSERT((pdp & X86_PG_V) != 0, ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); if ((pdp & X86_PG_PS) != 0) { KASSERT((amd_feature & AMDID_PAGE1GB) != 0, ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); KASSERT((va & PDPMASK) == 0, ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va, (u_long)pdpe, pdp)); KASSERT(va + NBPDP <= sva + len, ("unmap covers partial 1GB page, sva %#lx va %#lx " "pdpe %#lx pdp %#lx len %#lx", sva, va, (u_long)pdpe, pdp, len)); *pdpe = 0; inc = NBPDP; continue; } pde = pmap_pdpe_to_pde(pdpe, va); pd = *pde; KASSERT((pd & X86_PG_V) != 0, ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); if ((pd & X86_PG_PS) != 0) { KASSERT((va & PDRMASK) == 0, ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd)); KASSERT(va + NBPDR <= sva + len, ("unmap covers partial 2MB page, sva %#lx va %#lx " "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde, pd, len)); pde_store(pde, 0); inc = NBPDR; m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); m->wire_count--; if (m->wire_count == 0) { *pdpe = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); } continue; } pte = pmap_pde_to_pte(pde, va); KASSERT((*pte & X86_PG_V) != 0, ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, *pte)); pte_clear(pte); inc = PAGE_SIZE; m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte)); m->wire_count--; if (m->wire_count == 0) { *pde = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde)); m->wire_count--; if (m->wire_count == 0) { *pdpe = 0; SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss); } } } pmap_invalidate_range(kernel_pmap, sva, sva + len); PMAP_UNLOCK(kernel_pmap); vm_page_free_pages_toq(&spgf, false); vmem_free(large_vmem, sva, len); } static void pmap_large_map_wb_fence_mfence(void) { mfence(); } static void pmap_large_map_wb_fence_sfence(void) { sfence(); } static void pmap_large_map_wb_fence_nop(void) { } DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void)) { if (cpu_vendor_id != CPU_VENDOR_INTEL) return (pmap_large_map_wb_fence_mfence); else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB | CPUID_STDEXT_CLFLUSHOPT)) == 0) return (pmap_large_map_wb_fence_sfence); else /* clflush is strongly enough ordered */ return (pmap_large_map_wb_fence_nop); } static void pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len) { for (; len > 0; len -= cpu_clflush_line_size, va += cpu_clflush_line_size) clwb(va); } static void pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len) { for (; len > 0; len -= cpu_clflush_line_size, va += cpu_clflush_line_size) clflushopt(va); } static void pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len) { for (; len > 0; len -= cpu_clflush_line_size, va += cpu_clflush_line_size) clflush(va); } static void pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused) { } DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t)) { if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0) return (pmap_large_map_flush_range_clwb); else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0) return (pmap_large_map_flush_range_clflushopt); else if ((cpu_feature & CPUID_CLFSH) != 0) return (pmap_large_map_flush_range_clflush); else return (pmap_large_map_flush_range_nop); } static void pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva) { volatile u_long *pe; u_long p; vm_offset_t va; vm_size_t inc; bool seen_other; for (va = sva; va < eva; va += inc) { inc = 0; if ((amd_feature & AMDID_PAGE1GB) != 0) { pe = (volatile u_long *)pmap_large_map_pdpe(va); p = *pe; if ((p & X86_PG_PS) != 0) inc = NBPDP; } if (inc == 0) { pe = (volatile u_long *)pmap_large_map_pde(va); p = *pe; if ((p & X86_PG_PS) != 0) inc = NBPDR; } if (inc == 0) { pe = (volatile u_long *)pmap_large_map_pte(va); p = *pe; inc = PAGE_SIZE; } seen_other = false; for (;;) { if ((p & X86_PG_AVAIL1) != 0) { /* * Spin-wait for the end of a parallel * write-back. */ cpu_spinwait(); p = *pe; /* * If we saw other write-back * occuring, we cannot rely on PG_M to * indicate state of the cache. The * PG_M bit is cleared before the * flush to avoid ignoring new writes, * and writes which are relevant for * us might happen after. */ seen_other = true; continue; } if ((p & X86_PG_M) != 0 || seen_other) { if (!atomic_fcmpset_long(pe, &p, (p & ~X86_PG_M) | X86_PG_AVAIL1)) /* * If we saw PG_M without * PG_AVAIL1, and then on the * next attempt we do not * observe either PG_M or * PG_AVAIL1, the other * write-back started after us * and finished before us. We * can rely on it doing our * work. */ continue; pmap_large_map_flush_range(va, inc); atomic_clear_long(pe, X86_PG_AVAIL1); } break; } maybe_yield(); } } /* * Write-back cache lines for the given address range. * * Must be called only on the range or sub-range returned from * pmap_large_map(). Must not be called on the coalesced ranges. * * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH * instructions support. */ void pmap_large_map_wb(void *svap, vm_size_t len) { vm_offset_t eva, sva; sva = (vm_offset_t)svap; eva = sva + len; pmap_large_map_wb_fence(); if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) { pmap_large_map_flush_range(sva, len); } else { KASSERT(sva >= LARGEMAP_MIN_ADDRESS && eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4, ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len)); pmap_large_map_wb_large(sva, eva); } pmap_large_map_wb_fence(); } static vm_page_t pmap_pti_alloc_page(void) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = vm_page_grab(pti_obj, pti_pg_idx++, VM_ALLOC_NOBUSY | VM_ALLOC_WIRED | VM_ALLOC_ZERO); return (m); } static bool pmap_pti_free_page(vm_page_t m) { KASSERT(m->wire_count > 0, ("page %p not wired", m)); if (!vm_page_unwire_noq(m)) return (false); vm_page_free_zero(m); return (true); } static void pmap_pti_init(void) { vm_page_t pml4_pg; pdp_entry_t *pdpe; vm_offset_t va; int i; if (!pti) return; pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL); VM_OBJECT_WLOCK(pti_obj); pml4_pg = pmap_pti_alloc_page(); pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg)); for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS && va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) { pdpe = pmap_pti_pdpe(va); pmap_pti_wire_pte(pdpe); } pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0], (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false); pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt + sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false); pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt + sizeof(struct gate_descriptor) * NIDT, false); pmap_pti_add_kva_locked((vm_offset_t)common_tss, (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false); CPU_FOREACH(i) { /* Doublefault stack IST 1 */ va = common_tss[i].tss_ist1; pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); /* NMI stack IST 2 */ va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); /* MC# stack IST 3 */ va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); /* DB# stack IST 4 */ va = common_tss[i].tss_ist4 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false); } pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE, (vm_offset_t)etext, true); pti_finalized = true; VM_OBJECT_WUNLOCK(pti_obj); } SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL); static pdp_entry_t * pmap_pti_pdpe(vm_offset_t va) { pml4_entry_t *pml4e; pdp_entry_t *pdpe; vm_page_t m; vm_pindex_t pml4_idx; vm_paddr_t mphys; VM_OBJECT_ASSERT_WLOCKED(pti_obj); pml4_idx = pmap_pml4e_index(va); pml4e = &pti_pml4[pml4_idx]; m = NULL; if (*pml4e == 0) { if (pti_finalized) panic("pml4 alloc after finalization\n"); m = pmap_pti_alloc_page(); if (*pml4e != 0) { pmap_pti_free_page(m); mphys = *pml4e & ~PAGE_MASK; } else { mphys = VM_PAGE_TO_PHYS(m); *pml4e = mphys | X86_PG_RW | X86_PG_V; } } else { mphys = *pml4e & ~PAGE_MASK; } pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va); return (pdpe); } static void pmap_pti_wire_pte(void *pte) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); m->wire_count++; } static void pmap_pti_unwire_pde(void *pde, bool only_ref) { vm_page_t m; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde)); MPASS(m->wire_count > 0); MPASS(only_ref || m->wire_count > 1); pmap_pti_free_page(m); } static void pmap_pti_unwire_pte(void *pte, vm_offset_t va) { vm_page_t m; pd_entry_t *pde; VM_OBJECT_ASSERT_WLOCKED(pti_obj); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte)); MPASS(m->wire_count > 0); if (pmap_pti_free_page(m)) { pde = pmap_pti_pde(va); MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V); *pde = 0; pmap_pti_unwire_pde(pde, false); } } static pd_entry_t * pmap_pti_pde(vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; vm_page_t m; vm_pindex_t pd_idx; vm_paddr_t mphys; VM_OBJECT_ASSERT_WLOCKED(pti_obj); pdpe = pmap_pti_pdpe(va); if (*pdpe == 0) { m = pmap_pti_alloc_page(); if (*pdpe != 0) { pmap_pti_free_page(m); MPASS((*pdpe & X86_PG_PS) == 0); mphys = *pdpe & ~PAGE_MASK; } else { mphys = VM_PAGE_TO_PHYS(m); *pdpe = mphys | X86_PG_RW | X86_PG_V; } } else { MPASS((*pdpe & X86_PG_PS) == 0); mphys = *pdpe & ~PAGE_MASK; } pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); pd_idx = pmap_pde_index(va); pde += pd_idx; return (pde); } static pt_entry_t * pmap_pti_pte(vm_offset_t va, bool *unwire_pde) { pd_entry_t *pde; pt_entry_t *pte; vm_page_t m; vm_paddr_t mphys; VM_OBJECT_ASSERT_WLOCKED(pti_obj); pde = pmap_pti_pde(va); if (unwire_pde != NULL) { *unwire_pde = true; pmap_pti_wire_pte(pde); } if (*pde == 0) { m = pmap_pti_alloc_page(); if (*pde != 0) { pmap_pti_free_page(m); MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & ~(PAGE_MASK | pg_nx); } else { mphys = VM_PAGE_TO_PHYS(m); *pde = mphys | X86_PG_RW | X86_PG_V; if (unwire_pde != NULL) *unwire_pde = false; } } else { MPASS((*pde & X86_PG_PS) == 0); mphys = *pde & ~(PAGE_MASK | pg_nx); } pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); pte += pmap_pte_index(va); return (pte); } static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec) { vm_paddr_t pa; pd_entry_t *pde; pt_entry_t *pte, ptev; bool unwire_pde; VM_OBJECT_ASSERT_WLOCKED(pti_obj); sva = trunc_page(sva); MPASS(sva > VM_MAXUSER_ADDRESS); eva = round_page(eva); MPASS(sva < eva); for (; sva < eva; sva += PAGE_SIZE) { pte = pmap_pti_pte(sva, &unwire_pde); pa = pmap_kextract(sva); ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G | (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); if (*pte == 0) { pte_store(pte, ptev); pmap_pti_wire_pte(pte); } else { KASSERT(!pti_finalized, ("pti overlap after fin %#lx %#lx %#lx", sva, *pte, ptev)); KASSERT(*pte == ptev, ("pti non-identical pte after fin %#lx %#lx %#lx", sva, *pte, ptev)); } if (unwire_pde) { pde = pmap_pti_pde(sva); pmap_pti_unwire_pde(pde, true); } } } void pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec) { if (!pti) return; VM_OBJECT_WLOCK(pti_obj); pmap_pti_add_kva_locked(sva, eva, exec); VM_OBJECT_WUNLOCK(pti_obj); } void pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva) { pt_entry_t *pte; vm_offset_t va; if (!pti) return; sva = rounddown2(sva, PAGE_SIZE); MPASS(sva > VM_MAXUSER_ADDRESS); eva = roundup2(eva, PAGE_SIZE); MPASS(sva < eva); VM_OBJECT_WLOCK(pti_obj); for (va = sva; va < eva; va += PAGE_SIZE) { pte = pmap_pti_pte(va, NULL); KASSERT((*pte & X86_PG_V) != 0, ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, *pte)); pte_clear(pte); pmap_pti_unwire_pte(pte, va); } pmap_invalidate_range(kernel_pmap, sva, eva); VM_OBJECT_WUNLOCK(pti_obj); } static void * pkru_dup_range(void *ctx __unused, void *data) { struct pmap_pkru_range *node, *new_node; new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); if (new_node == NULL) return (NULL); node = data; memcpy(new_node, node, sizeof(*node)); return (new_node); } static void pkru_free_range(void *ctx __unused, void *node) { uma_zfree(pmap_pkru_ranges_zone, node); } static int pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags) { struct pmap_pkru_range *ppr; int error; PMAP_LOCK_ASSERT(pmap, MA_OWNED); MPASS(pmap->pm_type == PT_X86); MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); if ((flags & AMD64_PKRU_EXCL) != 0 && !rangeset_check_empty(&pmap->pm_pkru, sva, eva)) return (EBUSY); ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT); if (ppr == NULL) return (ENOMEM); ppr->pkru_keyidx = keyidx; ppr->pkru_flags = flags & AMD64_PKRU_PERSIST; error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr); if (error != 0) uma_zfree(pmap_pkru_ranges_zone, ppr); return (error); } static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); MPASS(pmap->pm_type == PT_X86); MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); return (rangeset_remove(&pmap->pm_pkru, sva, eva)); } static void pmap_pkru_deassign_all(pmap_t pmap) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type == PT_X86 && (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) rangeset_remove_all(&pmap->pm_pkru); } static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct pmap_pkru_range *ppr, *prev_ppr; vm_offset_t va; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type != PT_X86 || (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || sva >= VM_MAXUSER_ADDRESS) return (true); MPASS(eva <= VM_MAXUSER_ADDRESS); for (va = sva, prev_ppr = NULL; va < eva;) { ppr = rangeset_lookup(&pmap->pm_pkru, va); if ((ppr == NULL) ^ (prev_ppr == NULL)) return (false); if (ppr == NULL) { va += PAGE_SIZE; continue; } if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx) return (false); va = ppr->pkru_rs_el.re_end; } return (true); } static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va) { struct pmap_pkru_range *ppr; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type != PT_X86 || (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 || va >= VM_MAXUSER_ADDRESS) return (0); ppr = rangeset_lookup(&pmap->pm_pkru, va); if (ppr != NULL) return (X86_PG_PKU(ppr->pkru_keyidx)); return (0); } static bool pred_pkru_on_remove(void *ctx __unused, void *r) { struct pmap_pkru_range *ppr; ppr = r; return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0); } static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pmap->pm_type == PT_X86 && (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { rangeset_remove_pred(&pmap->pm_pkru, sva, eva, pred_pkru_on_remove); } } static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap) { PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED); PMAP_LOCK_ASSERT(src_pmap, MA_OWNED); MPASS(dst_pmap->pm_type == PT_X86); MPASS(src_pmap->pm_type == PT_X86); MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0); if (src_pmap->pm_pkru.rs_data_ctx == NULL) return (0); return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru)); } static void pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx) { pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t newpde, ptpaddr, *pde; pt_entry_t newpte, *ptep, pte; vm_offset_t va, va_next; bool changed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); MPASS(pmap->pm_type == PT_X86); MPASS(keyidx <= PMAP_MAX_PKRU_IDX); for (changed = false, va = sva; va < eva; va = va_next) { pml4e = pmap_pml4e(pmap, va); if ((*pml4e & X86_PG_V) == 0) { va_next = (va + NBPML4) & ~PML4MASK; if (va_next < va) va_next = eva; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, va); if ((*pdpe & X86_PG_V) == 0) { va_next = (va + NBPDP) & ~PDPMASK; if (va_next < va) va_next = eva; continue; } va_next = (va + NBPDR) & ~PDRMASK; if (va_next < va) va_next = eva; pde = pmap_pdpe_to_pde(pdpe, va); ptpaddr = *pde; if (ptpaddr == 0) continue; MPASS((ptpaddr & X86_PG_V) != 0); if ((ptpaddr & PG_PS) != 0) { if (va + NBPDR == va_next && eva >= va_next) { newpde = (ptpaddr & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); if (newpde != ptpaddr) { *pde = newpde; changed = true; } continue; } else if (!pmap_demote_pde(pmap, pde, va)) { continue; } } if (va_next > eva) va_next = eva; for (ptep = pmap_pde_to_pte(pde, va); va != va_next; ptep++, va += PAGE_SIZE) { pte = *ptep; if ((pte & X86_PG_V) == 0) continue; newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx); if (newpte != pte) { *ptep = newpte; changed = true; } } } if (changed) pmap_invalidate_range(pmap, sva, eva); } static int pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags) { if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX || (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0) return (EINVAL); if (eva <= sva || eva > VM_MAXUSER_ADDRESS) return (EFAULT); if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0) return (ENOTSUP); return (0); } int pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx, int flags) { int error; sva = trunc_page(sva); eva = round_page(eva); error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags); if (error != 0) return (error); for (;;) { PMAP_LOCK(pmap); error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags); if (error == 0) pmap_pkru_update_range(pmap, sva, eva, keyidx); PMAP_UNLOCK(pmap); if (error != ENOMEM) break; vm_wait(NULL); } return (error); } int pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { int error; sva = trunc_page(sva); eva = round_page(eva); error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0); if (error != 0) return (error); for (;;) { PMAP_LOCK(pmap); error = pmap_pkru_deassign(pmap, sva, eva); if (error == 0) pmap_pkru_update_range(pmap, sva, eva, 0); PMAP_UNLOCK(pmap); if (error != ENOMEM) break; vm_wait(NULL); } return (error); } #ifdef DDB DB_SHOW_COMMAND(pte, pmap_print_pte) { pmap_t pmap; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pde; pt_entry_t *pte, PG_V; vm_offset_t va; if (!have_addr) { db_printf("show pte addr\n"); return; } va = (vm_offset_t)addr; if (kdb_thread != NULL) pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace); else pmap = PCPU_GET(curpmap); PG_V = pmap_valid_bit(pmap); pml4 = pmap_pml4e(pmap, va); db_printf("VA %#016lx pml4e %#016lx", va, *pml4); if ((*pml4 & PG_V) == 0) { db_printf("\n"); return; } pdp = pmap_pml4e_to_pdpe(pml4, va); db_printf(" pdpe %#016lx", *pdp); if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) { db_printf("\n"); return; } pde = pmap_pdpe_to_pde(pdp, va); db_printf(" pde %#016lx", *pde); if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) { db_printf("\n"); return; } pte = pmap_pde_to_pte(pde, va); db_printf(" pte %#016lx\n", *pte); } DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap) { vm_paddr_t a; if (have_addr) { a = (vm_paddr_t)addr; db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a)); } else { db_printf("show phys2dmap addr\n"); } } #endif Index: projects/nfsv42/sys/arm64/arm64/pmap.c =================================================================== --- projects/nfsv42/sys/arm64/arm64/pmap.c (revision 350367) +++ projects/nfsv42/sys/arm64/arm64/pmap.c (revision 350368) @@ -1,5832 +1,5930 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * Copyright (c) 2014 Andrew Turner * All rights reserved. * Copyright (c) 2014-2016 The FreeBSD Foundation * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * This software was developed by Andrew Turner under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t))) #define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t))) #define NUL0E L0_ENTRIES #define NUL1E (NUL0E * NL1PG) #define NUL2E (NUL1E * NL2PG) #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif /* * These are configured by the mair_el1 register. This is set up in locore.S */ #define DEVICE_MEMORY 0 #define UNCACHED_MEMORY 1 #define CACHED_MEMORY 2 #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) #define pa_to_pvh(pa) (&pv_table[pmap_l2_pindex(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) /* * The presence of this flag indicates that the mapping is writeable. * If the ATTR_AP_RO bit is also set, then the mapping is clean, otherwise it is * dirty. This flag may only be set on managed mappings. */ static pt_entry_t ATTR_SW_DBM; struct pmap kernel_pmap_store; /* Used for mapping ACPI memory before VM is initialized */ #define PMAP_PREINIT_MAPPING_COUNT 32 #define PMAP_PREINIT_MAPPING_SIZE (PMAP_PREINIT_MAPPING_COUNT * L2_SIZE) static vm_offset_t preinit_map_va; /* Start VA of pre-init mapping space */ static int vm_initialized = 0; /* No need to use pre-init maps when set */ /* * Reserve a few L2 blocks starting from 'preinit_map_va' pointer. * Always map entire L2 block for simplicity. * VA of L2 block = preinit_map_va + i * L2_SIZE */ static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t size; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ vm_offset_t kernel_vm_end = 0; /* * Data for the pv entry allocation mechanism. */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; vm_paddr_t dmap_phys_base; /* The start of the dmap region */ vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ /* This code assumes all L1 DMAP entries will be used */ CTASSERT((DMAP_MIN_ADDRESS & ~L0_OFFSET) == DMAP_MIN_ADDRESS); CTASSERT((DMAP_MAX_ADDRESS & ~L0_OFFSET) == DMAP_MAX_ADDRESS); #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) extern pt_entry_t pagetable_dmap[]; #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) static vm_paddr_t physmap[PHYSMAP_SIZE]; static u_int physmap_idx; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int superpages_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0, "Are large page mappings enabled?"); /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode); static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode); static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va); static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, struct rwlock **lockp); static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp); static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp); static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); /* * These load the old table data and store the new value. * They need to be atomic as the System MMU may write to the table at * the same time as the CPU. */ #define pmap_clear(table) atomic_store_64(table, 0) #define pmap_clear_bits(table, bits) atomic_clear_64(table, bits) #define pmap_load(table) (*table) #define pmap_load_clear(table) atomic_swap_64(table, 0) #define pmap_load_store(table, entry) atomic_swap_64(table, entry) #define pmap_set_bits(table, bits) atomic_set_64(table, bits) #define pmap_store(table, entry) atomic_store_64(table, entry) /********************/ /* Inline functions */ /********************/ static __inline void pagecopy(void *s, void *d) { memcpy(d, s, PAGE_SIZE); } static __inline pd_entry_t * pmap_l0(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_l0[pmap_l0_index(va)]); } static __inline pd_entry_t * pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va) { pd_entry_t *l1; l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); return (&l1[pmap_l1_index(va)]); } static __inline pd_entry_t * pmap_l1(pmap_t pmap, vm_offset_t va) { pd_entry_t *l0; l0 = pmap_l0(pmap, va); if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE) return (NULL); return (pmap_l0_to_l1(l0, va)); } static __inline pd_entry_t * pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) { pd_entry_t *l2; l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); return (&l2[pmap_l2_index(va)]); } static __inline pd_entry_t * pmap_l2(pmap_t pmap, vm_offset_t va) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE) return (NULL); return (pmap_l1_to_l2(l1, va)); } static __inline pt_entry_t * pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) { pt_entry_t *l3; l3 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK); return (&l3[pmap_l3_index(va)]); } /* * Returns the lowest valid pde for a given virtual address. * The next level may or may not point to a valid page or block. */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l0, *l1, *l2, desc; l0 = pmap_l0(pmap, va); desc = pmap_load(l0) & ATTR_DESCR_MASK; if (desc != L0_TABLE) { *level = -1; return (NULL); } l1 = pmap_l0_to_l1(l0, va); desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc != L1_TABLE) { *level = 0; return (l0); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc != L2_TABLE) { *level = 1; return (l1); } *level = 2; return (l2); } /* * Returns the lowest valid pte block or table entry for a given virtual * address. If there are no valid entries return NULL and set the level to * the first invalid level. */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va, int *level) { pd_entry_t *l1, *l2, desc; pt_entry_t *l3; l1 = pmap_l1(pmap, va); if (l1 == NULL) { *level = 0; return (NULL); } desc = pmap_load(l1) & ATTR_DESCR_MASK; if (desc == L1_BLOCK) { *level = 1; return (l1); } if (desc != L1_TABLE) { *level = 1; return (NULL); } l2 = pmap_l1_to_l2(l1, va); desc = pmap_load(l2) & ATTR_DESCR_MASK; if (desc == L2_BLOCK) { *level = 2; return (l2); } if (desc != L2_TABLE) { *level = 2; return (NULL); } *level = 3; l3 = pmap_l2_to_l3(l2, va); if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE) return (NULL); return (l3); } bool pmap_ps_enabled(pmap_t pmap __unused) { return (superpages_enabled != 0); } bool pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1, pd_entry_t **l2, pt_entry_t **l3) { pd_entry_t *l0p, *l1p, *l2p; if (pmap->pm_l0 == NULL) return (false); l0p = pmap_l0(pmap, va); *l0 = l0p; if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE) return (false); l1p = pmap_l0_to_l1(l0p, va); *l1 = l1p; if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) { *l2 = NULL; *l3 = NULL; return (true); } if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE) return (false); l2p = pmap_l1_to_l2(l1p, va); *l2 = l2p; if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) { *l3 = NULL; return (true); } if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE) return (false); *l3 = pmap_l2_to_l3(l2p, va); return (true); } static __inline int pmap_l3_valid(pt_entry_t l3) { return ((l3 & ATTR_DESCR_MASK) == L3_PAGE); } CTASSERT(L1_BLOCK == L2_BLOCK); /* * Checks if the PTE is dirty. */ static inline int pmap_pte_dirty(pt_entry_t pte) { KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte)); KASSERT((pte & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) != 0, ("pte %#lx is writeable and missing ATTR_SW_DBM", pte)); return ((pte & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_AP(ATTR_AP_RW) | ATTR_SW_DBM)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } static pt_entry_t * pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, u_int *l2_slot) { pt_entry_t *l2; pd_entry_t *l1; l1 = (pd_entry_t *)l1pt; *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; /* Check locore has used a table L1 map */ KASSERT((l1[*l1_slot] & ATTR_DESCR_MASK) == L1_TABLE, ("Invalid bootstrap L1 table")); /* Find the address of the L2 table */ l2 = (pt_entry_t *)init_pt_va; *l2_slot = pmap_l2_index(va); return (l2); } static vm_paddr_t pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) { u_int l1_slot, l2_slot; pt_entry_t *l2; l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); return ((l2[l2_slot] & ~ATTR_MASK) + (va & L2_OFFSET)); } static vm_offset_t pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_offset_t freemempos) { pt_entry_t *l2; vm_offset_t va; vm_paddr_t l2_pa, pa; u_int l1_slot, l2_slot, prev_l1_slot; int i; dmap_phys_base = min_pa & ~L1_OFFSET; dmap_phys_max = 0; dmap_max_addr = 0; l2 = NULL; prev_l1_slot = -1; #define DMAP_TABLES ((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT) memset(pagetable_dmap, 0, PAGE_SIZE * DMAP_TABLES); for (i = 0; i < (physmap_idx * 2); i += 2) { pa = physmap[i] & ~L2_OFFSET; va = pa - dmap_phys_base + DMAP_MIN_ADDRESS; /* Create L2 mappings at the start of the region */ if ((pa & L1_OFFSET) != 0) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); if (l1_slot != prev_l1_slot) { prev_l1_slot = l1_slot; l2 = (pt_entry_t *)freemempos; l2_pa = pmap_early_vtophys(kern_l1, (vm_offset_t)l2); freemempos += PAGE_SIZE; pmap_store(&pagetable_dmap[l1_slot], (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE); memset(l2, 0, PAGE_SIZE); } KASSERT(l2 != NULL, ("pmap_bootstrap_dmap: NULL l2 map")); for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1]; pa += L2_SIZE, va += L2_SIZE) { /* * We are on a boundary, stop to * create a level 1 block */ if ((pa & L1_OFFSET) == 0) break; l2_slot = pmap_l2_index(va); KASSERT(l2_slot != 0, ("...")); pmap_store(&l2[l2_slot], (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); } KASSERT(va == (pa - dmap_phys_base + DMAP_MIN_ADDRESS), ("...")); } for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1] && (physmap[i + 1] - pa) >= L1_SIZE; pa += L1_SIZE, va += L1_SIZE) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); pmap_store(&pagetable_dmap[l1_slot], (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L1_BLOCK); } /* Create L2 mappings at the end of the region */ if (pa < physmap[i + 1]) { l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT); if (l1_slot != prev_l1_slot) { prev_l1_slot = l1_slot; l2 = (pt_entry_t *)freemempos; l2_pa = pmap_early_vtophys(kern_l1, (vm_offset_t)l2); freemempos += PAGE_SIZE; pmap_store(&pagetable_dmap[l1_slot], (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE); memset(l2, 0, PAGE_SIZE); } KASSERT(l2 != NULL, ("pmap_bootstrap_dmap: NULL l2 map")); for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1]; pa += L2_SIZE, va += L2_SIZE) { l2_slot = pmap_l2_index(va); pmap_store(&l2[l2_slot], (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); } } if (pa > dmap_phys_max) { dmap_phys_max = pa; dmap_max_addr = va; } } cpu_tlb_flushID(); return (freemempos); } static vm_offset_t pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start) { vm_offset_t l2pt; vm_paddr_t pa; pd_entry_t *l1; u_int l1_slot; KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address")); l1 = (pd_entry_t *)l1pt; l1_slot = pmap_l1_index(va); l2pt = l2_start; for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) { KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); pa = pmap_early_vtophys(l1pt, l2pt); pmap_store(&l1[l1_slot], (pa & ~Ln_TABLE_MASK) | L1_TABLE); l2pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l2_start, 0, l2pt - l2_start); return l2pt; } static vm_offset_t pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) { vm_offset_t l3pt; vm_paddr_t pa; pd_entry_t *l2; u_int l2_slot; KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); l2 = pmap_l2(kernel_pmap, va); l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE); l2_slot = pmap_l2_index(va); l3pt = l3_start; for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); pa = pmap_early_vtophys(l1pt, l3pt); pmap_store(&l2[l2_slot], (pa & ~Ln_TABLE_MASK) | L2_TABLE); l3pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l3_start, 0, l3pt - l3_start); return l3pt; } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) { u_int l1_slot, l2_slot; pt_entry_t *l2; vm_offset_t va, freemempos; vm_offset_t dpcpu, msgbufpv; vm_paddr_t start_pa, pa, min_pa; uint64_t kern_delta; int i; #ifdef notyet /* Determine whether the hardware implements DBM management. */ uint64_t reg = READ_SPECIALREG(ID_AA64MMFR1_EL1); ATTR_SW_DBM = ID_AA64MMFR1_HAFDBS(reg) == ID_AA64MMFR1_HAFDBS_AF_DBS ? ATTR_DBM : _ATTR_SW_DBM; #else ATTR_SW_DBM = _ATTR_SW_DBM; #endif kern_delta = KERNBASE - kernstart; printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); printf("%lx\n", l1pt); printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK); /* Set this early so we can use the pagetable walking functions */ kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt; PMAP_LOCK_INIT(kernel_pmap); /* Assume the address we were loaded to is a valid physical address */ min_pa = KERNBASE - kern_delta; physmap_idx = arm_physmem_avail(physmap, nitems(physmap)); physmap_idx /= 2; /* * Find the minimum physical address. physmap is sorted, * but may contain empty ranges. */ for (i = 0; i < (physmap_idx * 2); i += 2) { if (physmap[i] == physmap[i + 1]) continue; if (physmap[i] <= min_pa) min_pa = physmap[i]; } freemempos = KERNBASE + kernlen; freemempos = roundup2(freemempos, PAGE_SIZE); /* Create a direct map region early so we can use it for pa -> va */ freemempos = pmap_bootstrap_dmap(l1pt, min_pa, freemempos); va = KERNBASE; start_pa = pa = KERNBASE - kern_delta; /* * Read the page table to find out what is already mapped. * This assumes we have mapped a block of memory from KERNBASE * using a single L1 entry. */ l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); /* Sanity check the index, KERNBASE should be the first VA */ KASSERT(l2_slot == 0, ("The L2 index is non-zero")); /* Find how many pages we have mapped */ for (; l2_slot < Ln_ENTRIES; l2_slot++) { if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0) break; /* Check locore used L2 blocks */ KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK, ("Invalid bootstrap L2 table")); KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa, ("Incorrect PA in L2 table")); va += L2_SIZE; pa += L2_SIZE; } va = roundup2(va, L1_SIZE); /* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */ freemempos = pmap_bootstrap_l2(l1pt, va, freemempos); /* And the l3 tables for the early devmap */ freemempos = pmap_bootstrap_l3(l1pt, VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE), freemempos); cpu_tlb_flushID(); #define alloc_pages(var, np) \ (var) = freemempos; \ freemempos += (np * PAGE_SIZE); \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); /* Allocate dynamic per-cpu area. */ alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); dpcpu_init((void *)dpcpu, 0); /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); msgbufp = (void *)msgbufpv; /* Reserve some VA space for early BIOS/ACPI mapping */ preinit_map_va = roundup2(freemempos, L2_SIZE); virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE; virtual_avail = roundup2(virtual_avail, L1_SIZE); virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE); kernel_vm_end = virtual_avail; pa = pmap_early_vtophys(l1pt, freemempos); arm_physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC); cpu_tlb_flushID(); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { vm_size_t s; int i, pv_npg; /* * Are large page mappings enabled? */ TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled); if (superpages_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = L2_SIZE; } /* * Initialize the pv chunk list mutex. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); vm_initialized = 1; } static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0, "2MB page mapping counters"); static u_long pmap_l2_demotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, &pmap_l2_demotions, 0, "2MB page demotions"); static u_long pmap_l2_mappings; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, &pmap_l2_mappings, 0, "2MB page mappings"); static u_long pmap_l2_p_failures; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_l2_p_failures, 0, "2MB page promotion failures"); static u_long pmap_l2_promotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, &pmap_l2_promotions, 0, "2MB page promotions"); /* * Invalidate a single TLB entry. */ static __inline void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { sched_pin(); __asm __volatile( "dsb ishst \n" "tlbi vaae1is, %0 \n" "dsb ish \n" "isb \n" : : "r"(va >> PAGE_SHIFT)); sched_unpin(); } static __inline void pmap_invalidate_range_nopin(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; dsb(ishst); for (addr = sva; addr < eva; addr += PAGE_SIZE) { __asm __volatile( "tlbi vaae1is, %0" : : "r"(addr >> PAGE_SHIFT)); } __asm __volatile( "dsb ish \n" "isb \n"); } static __inline void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { sched_pin(); pmap_invalidate_range_nopin(pmap, sva, eva); sched_unpin(); } static __inline void pmap_invalidate_all(pmap_t pmap) { sched_pin(); __asm __volatile( "dsb ishst \n" "tlbi vmalle1is \n" "dsb ish \n" "isb \n"); sched_unpin(); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte, tpte; vm_paddr_t pa; int lvl; pa = 0; PMAP_LOCK(pmap); /* * Find the block or page map for this virtual address. pmap_pte * will return either a valid block/page entry, or NULL. */ pte = pmap_pte(pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); pa = tpte & ~ATTR_MASK; switch(lvl) { case 1: KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_extract: Invalid L1 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L1_OFFSET); break; case 2: KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_extract: Invalid L2 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L2_OFFSET); break; case 3: KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("pmap_extract: Invalid L3 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L3_OFFSET); break; } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t *pte, tpte; vm_offset_t off; vm_paddr_t pa; vm_page_t m; int lvl; pa = 0; m = NULL; PMAP_LOCK(pmap); retry: pte = pmap_pte(pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); KASSERT(lvl > 0 && lvl <= 3, ("pmap_extract_and_hold: Invalid level %d", lvl)); CTASSERT(L1_BLOCK == L2_BLOCK); KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) || (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK), ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl, tpte & ATTR_DESCR_MASK)); if (((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) || ((prot & VM_PROT_WRITE) == 0)) { switch(lvl) { case 1: off = va & L1_OFFSET; break; case 2: off = va & L2_OFFSET; break; case 3: default: off = 0; } if (vm_page_pa_tryrelock(pmap, (tpte & ~ATTR_MASK) | off, &pa)) goto retry; m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off); vm_page_wire(m); } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pt_entry_t *pte, tpte; vm_paddr_t pa; int lvl; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { pa = 0; pte = pmap_pte(kernel_pmap, va, &lvl); if (pte != NULL) { tpte = pmap_load(pte); pa = tpte & ~ATTR_MASK; switch(lvl) { case 1: KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_kextract: Invalid L1 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L1_OFFSET); break; case 2: KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_kextract: Invalid L2 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L2_OFFSET); break; case 3: KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("pmap_kextract: Invalid L3 pte found: %lx", tpte & ATTR_DESCR_MASK)); pa |= (va & L3_OFFSET); break; } } } return (pa); } /*************************************************** * Low level mapping routines..... ***************************************************/ void pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode) { pd_entry_t *pde; pt_entry_t *pte, attr; vm_offset_t va; int lvl; KASSERT((pa & L3_OFFSET) == 0, ("pmap_kenter: Invalid physical address")); KASSERT((sva & L3_OFFSET) == 0, ("pmap_kenter: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kenter: Mapping is not page-sized")); attr = ATTR_DEFAULT | ATTR_IDX(mode) | L3_PAGE; if (mode == DEVICE_MEMORY) attr |= ATTR_XN; va = sva; while (size != 0) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_kenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl)); pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, (pa & ~L3_OFFSET) | attr); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } void pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) { pmap_kenter(sva, size, pa, DEVICE_MEMORY); } /* * Remove a page from the kernel pagetables. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; int lvl; pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("pmap_kremove: Invalid address")); KASSERT(lvl == 3, ("pmap_kremove: Invalid pte level %d", lvl)); pmap_clear(pte); pmap_invalidate_page(kernel_pmap, va); } void pmap_kremove_device(vm_offset_t sva, vm_size_t size) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT((sva & L3_OFFSET) == 0, ("pmap_kremove_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kremove_device: Mapping is not page-sized")); va = sva; while (size != 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(pte != NULL, ("Invalid page table, va: 0x%lx", va)); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); pmap_clear(pte); va += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pd_entry_t *pde; pt_entry_t *pte, pa; vm_offset_t va; vm_page_t m; int i, lvl; va = sva; for (i = 0; i < count; i++) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_qenter: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_qenter: Invalid level %d", lvl)); m = ma[i]; pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_AP(ATTR_AP_RW) | ATTR_IDX(m->md.pv_memattr) | L3_PAGE; if (m->md.pv_memattr == DEVICE_MEMORY) pa |= ATTR_XN; pte = pmap_l2_to_l3(pde, va); pmap_load_store(pte, pa); va += L3_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t sva, int count) { pt_entry_t *pte; vm_offset_t va; int lvl; KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); va = sva; while (count-- > 0) { pte = pmap_pte(kernel_pmap, va, &lvl); KASSERT(lvl == 3, ("Invalid device pagetable level: %d != 3", lvl)); if (pte != NULL) { pmap_clear(pte); } va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_l3(pmap, va, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * unmap the page table page */ if (m->pindex >= (NUL2E + NUL1E)) { /* l1 page */ pd_entry_t *l0; l0 = pmap_l0(pmap, va); pmap_clear(l0); } else if (m->pindex >= NUL2E) { /* l2 page */ pd_entry_t *l1; l1 = pmap_l1(pmap, va); pmap_clear(l1); } else { /* l3 page */ pd_entry_t *l2; l2 = pmap_l2(pmap, va); pmap_clear(l2); } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUL2E) { /* We just released an l3, unhold the matching l2 */ pd_entry_t *l1, tl1; vm_page_t l2pg; l1 = pmap_l1(pmap, va); tl1 = pmap_load(l1); l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l2pg, free); } else if (m->pindex < (NUL2E + NUL1E)) { /* We just released an l2, unhold the matching l1 */ pd_entry_t *l0, tl0; vm_page_t l1pg; l0 = pmap_l0(pmap, va); tl0 = pmap_load(l0); l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); pmap_unwire_l3(pmap, va, l1pg, free); } pmap_invalidate_page(pmap, va); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK); return (pmap_unwire_l3(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_l0 = kernel_pmap->pm_l0; pmap->pm_root.rt_root = 0; } int pmap_pinit(pmap_t pmap) { vm_paddr_t l0phys; vm_page_t l0pt; /* * allocate the l0 page */ while ((l0pt = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) vm_wait(NULL); l0phys = VM_PAGE_TO_PHYS(l0pt); pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(l0phys); if ((l0pt->flags & PG_ZERO) == 0) pagezero(pmap->pm_l0); pmap->pm_root.rt_root = 0; bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); return (1); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, l1pg, l2pg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); vm_wait(NULL); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= (NUL2E + NUL1E)) { pd_entry_t *l0; vm_pindex_t l0index; l0index = ptepindex - (NUL2E + NUL1E); l0 = &pmap->pm_l0[l0index]; pmap_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE); } else if (ptepindex >= NUL2E) { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1; pd_entry_t tl0; l1index = ptepindex - NUL2E; l0index = l1index >> L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK); l1pg->wire_count++; } l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK); l1 = &l1[ptepindex & Ln_ADDR_MASK]; pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE); } else { vm_pindex_t l0index, l1index; pd_entry_t *l0, *l1, *l2; pd_entry_t tl0, tl1; l1index = ptepindex >> Ln_ENTRIES_SHIFT; l0index = l1index >> L0_ENTRIES_SHIFT; l0 = &pmap->pm_l0[l0index]; tl0 = pmap_load(l0); if (tl0 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } tl0 = pmap_load(l0); l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; } else { l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK); l1 = &l1[l1index & Ln_ADDR_MASK]; tl1 = pmap_load(l1); if (tl1 == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL2E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK); l2pg->wire_count++; } } l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK); l2 = &l2[ptepindex & Ln_ADDR_MASK]; pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE); } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { pd_entry_t *l1; vm_page_t l2pg; vm_pindex_t l2pindex; retry: l1 = pmap_l1(pmap, va); if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) { /* Add a reference to the L2 page. */ l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK); l2pg->wire_count++; } else { /* Allocate a L2 page. */ l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); if (l2pg == NULL && lockp != NULL) goto retry; } return (l2pg); } static vm_page_t pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *pde, tpde; #ifdef INVARIANTS pt_entry_t *pte; #endif vm_page_t m; int lvl; /* * Calculate pagetable page index */ ptepindex = pmap_l2_pindex(va); retry: /* * Get the page directory entry */ pde = pmap_pde(pmap, va, &lvl); /* * If the page table page is mapped, we just increment the hold count, * and activate it. If we get a level 2 pde it will point to a level 3 * table. */ switch (lvl) { case -1: break; case 0: #ifdef INVARIANTS pte = pmap_l0_to_l1(pde, va); KASSERT(pmap_load(pte) == 0, ("pmap_alloc_l3: TODO: l0 superpages")); #endif break; case 1: #ifdef INVARIANTS pte = pmap_l1_to_l2(pde, va); KASSERT(pmap_load(pte) == 0, ("pmap_alloc_l3: TODO: l1 superpages")); #endif break; case 2: tpde = pmap_load(pde); if (tpde != 0) { m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK); m->wire_count++; return (m); } break; default: panic("pmap_alloc_l3: Invalid level %d", lvl); } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ m = _pmap_alloc_l3(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0)); vm_page_unwire_noq(m); vm_page_free_zero(m); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *l0, *l1, *l2; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, L2_SIZE); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { l0 = pmap_l0(kernel_pmap, kernel_vm_end); KASSERT(pmap_load(l0) != 0, ("pmap_growkernel: No level 0 kernel entry")); l1 = pmap_l0_to_l1(l0, kernel_vm_end); if (pmap_load(l1) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); pmap_store(l1, paddr | L1_TABLE); continue; /* try again */ } l2 = pmap_l1_to_l2(l1, kernel_vm_end); if ((pmap_load(l2) & ATTR_AF) != 0) { kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); pmap_load_store(l2, paddr | L2_TABLE); pmap_invalidate_page(kernel_pmap, kernel_vm_end); kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #if 0 #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif #endif /* 0 */ /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { struct pv_chunk *pc, *pc_marker, *pc_marker_end; struct pv_chunk_header pc_marker_b, pc_marker_end_b; struct md_page *pvh; pd_entry_t *pde; pmap_t next_pmap, pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint64_t inuse; int bit, field, freed, lvl; static int active_reclaims = 0; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); bzero(&pc_marker_b, sizeof(pc_marker_b)); bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); pc_marker = (struct pv_chunk *)&pc_marker_b; pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; mtx_lock(&pv_chunks_mutex); active_reclaims++; TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru); while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && SLIST_EMPTY(&free)) { next_pmap = pc->pc_pmap; if (next_pmap == NULL) { /* * The next chunk is a marker. However, it is * not our marker, so active_reclaims must be * > 1. Consequently, the next_chunk code * will not rotate the pv_chunks list. */ goto next_chunk; } mtx_unlock(&pv_chunks_mutex); /* * A pv_chunk can only be removed from the pc_lru list * when both pv_chunks_mutex is owned and the * corresponding pmap is locked. */ if (pmap != next_pmap) { if (pmap != NULL && pmap != locked_pmap) PMAP_UNLOCK(pmap); pmap = next_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) { RELEASE_PV_LIST_LOCK(lockp); PMAP_LOCK(pmap); mtx_lock(&pv_chunks_mutex); continue; } else if (pmap != locked_pmap) { if (PMAP_TRYLOCK(pmap)) { mtx_lock(&pv_chunks_mutex); continue; } else { pmap = NULL; /* pmap is not locked */ mtx_lock(&pv_chunks_mutex); pc = TAILQ_NEXT(pc_marker, pc_lru); if (pc == NULL || pc->pc_pmap != next_pmap) continue; goto next_chunk; } } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = ffsl(inuse) - 1; pv = &pc->pc_pventry[field * 64 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va, &lvl); if (lvl != 2) continue; pte = pmap_l2_to_l3(pde, va); tpte = pmap_load(pte); if ((tpte & ATTR_SW_WIRED) != 0) continue; tpte = pmap_load_clear(pte); pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK); if (pmap_pte_dirty(tpte)) vm_page_dirty(m); if ((tpte & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, pmap_load(pde), &free); freed++; } } if (freed == 0) { mtx_lock(&pv_chunks_mutex); goto next_chunk; } /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && pc->pc_map[2] == PC_FREE2) { PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m_pc->phys_addr); mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); break; } TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); mtx_lock(&pv_chunks_mutex); /* One freed pv entry in locked_pmap is sufficient. */ if (pmap == locked_pmap) break; next_chunk: TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru); if (active_reclaims == 1 && pmap != NULL) { /* * Rotate the pv chunks list so that we do not * scan the same pv chunks that could not be * freed (because they contained a wired * and/or superpage mapping) on every * invocation of reclaim_pv_chunk(). */ while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) { MPASS(pc->pc_pmap != NULL); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); } } } TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru); TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru); active_reclaims--; mtx_unlock(&pv_chunks_mutex); if (pmap != NULL && pmap != locked_pmap) PMAP_UNLOCK(pmap); if (m_pc == NULL && !SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; vm_page_t m; int avail, free; bool reclaimed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((va & L2_OFFSET) == 0, ("pmap_pv_demote_l2: va is not 2mpage aligned")); KASSERT((pa & L2_OFFSET) == 0, ("pmap_pv_demote_l2: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */ PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1)); va_last = va + L2_SIZE - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_l2: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1)); } /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. */ static bool pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = l2e & ~ATTR_MASK; CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); } static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { pt_entry_t newl2, oldl2; vm_page_t ml3; vm_paddr_t ml3pa; KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); ml3 = pmap_remove_pt_page(pmap, va); if (ml3 == NULL) panic("pmap_remove_kernel_l2: Missing pt page"); ml3pa = VM_PAGE_TO_PHYS(ml3); newl2 = ml3pa | L2_TABLE; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (ml3->valid != 0) pagezero((void *)PHYS_TO_DMAP(ml3pa)); /* * Demote the mapping. The caller must have already invalidated the * mapping (i.e., the "break" in break-before-make). */ oldl2 = pmap_load_store(l2, newl2); KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", __func__, l2, oldl2)); } /* * pmap_remove_l2: Do the things to unmap a level 2 superpage. */ static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l2; vm_offset_t eva, va; vm_page_t m, ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); old_l2 = pmap_load_clear(l2); KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2)); /* * Since a promotion must break the 4KB page mappings before making * the 2MB page mapping, a pmap_invalidate_page() suffices. */ pmap_invalidate_page(pmap, sva); if (old_l2 & ATTR_SW_WIRED) pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); if (old_l2 & ATTR_SW_MANAGED) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK); pvh = pa_to_pvh(old_l2 & ~ATTR_MASK); pmap_pvh_free(pvh, pmap, sva); eva = sva + L2_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); va < eva; va += PAGE_SIZE, m++) { if (pmap_pte_dirty(old_l2)) vm_page_dirty(m); if (old_l2 & ATTR_AF) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_l2(pmap, l2, sva); } else { ml3 = pmap_remove_pt_page(pmap, sva); if (ml3 != NULL) { KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_l2: l3 page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(ml3->wire_count == NL3PG, ("pmap_remove_l2: l3 page wire count error")); ml3->wire_count = 0; pmap_add_delayed_free_list(ml3, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, l1e, free)); } /* * pmap_remove_l3: do the things to unmap a page in a process */ -static int __unused +static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t old_l3; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); old_l3 = pmap_load_clear(l3); pmap_invalidate_page(pmap, va); if (old_l3 & ATTR_SW_WIRED) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (old_l3 & ATTR_SW_MANAGED) { m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); if (pmap_pte_dirty(old_l3)) vm_page_dirty(m); if (old_l3 & ATTR_AF) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } return (pmap_unuse_pt(pmap, va, l2e, free)); } /* * Remove the specified range of addresses from the L3 page table that is * identified by the given L2 entry. */ static void pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva, vm_offset_t eva, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; struct rwlock *new_lock; pt_entry_t *l3, old_l3; vm_offset_t va; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE), ("pmap_remove_l3_range: range crosses an L3 page table boundary")); va = eva; for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) { if (!pmap_l3_valid(pmap_load(l3))) { if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } continue; } old_l3 = pmap_load_clear(l3); if ((old_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count--; pmap_resident_count_dec(pmap, 1); if ((old_l3 & ATTR_SW_MANAGED) != 0) { m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK); if (pmap_pte_dirty(old_l3)) vm_page_dirty(m); if ((old_l3 & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)); if (new_lock != *lockp) { if (*lockp != NULL) { /* * Pending TLB invalidations must be * performed before the PV list lock is * released. Otherwise, a concurrent * pmap_remove_all() on a physical page * could return while a stale TLB entry * still provides access to that page. */ if (va != eva) { pmap_invalidate_range(pmap, va, sva); va = eva; } rw_wunlock(*lockp); } *lockp = new_lock; rw_wlock(*lockp); } pmap_pvh_free(&m->md, pmap, sva); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (va == eva) va = sva; if (pmap_unuse_pt(pmap, sva, l2e, free)) { sva += L3_SIZE; break; } } if (va != eva) pmap_invalidate_range(pmap, va, sva); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct rwlock *lock; vm_offset_t va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t l3_paddr; struct spglist free; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; SLIST_INIT(&free); PMAP_LOCK(pmap); lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL) continue; l3_paddr = pmap_load(l2); if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) { if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_remove_l2(pmap, l2, sva, pmap_load(l1), &free, &lock); continue; } else if (pmap_demote_l2_locked(pmap, l2, sva, &lock) == NULL) continue; l3_paddr = pmap_load(l2); } /* * Weed out invalid mappings. */ if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE) continue; /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free, &lock); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; struct rwlock *lock; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; vm_offset_t va; struct spglist free; int lvl, pvh_gen, md_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry: rw_wlock(lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pte = pmap_pte(pmap, va, &lvl); KASSERT(pte != NULL, ("pmap_remove_all: no page table entry found")); KASSERT(lvl == 2, ("pmap_remove_all: invalid pte level %d", lvl)); pmap_demote_l2_locked(pmap, pte, va, &lock); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { rw_wunlock(lock); PMAP_UNLOCK(pmap); goto retry; } } pmap_resident_count_dec(pmap, 1); pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_remove_all: no page directory entry found")); KASSERT(lvl == 2, ("pmap_remove_all: invalid pde level %d", lvl)); tpde = pmap_load(pde); pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load_clear(pte); pmap_invalidate_page(pmap, pv->pv_va); if (tpte & ATTR_SW_WIRED) pmap->pm_stats.wired_count--; if ((tpte & ATTR_AF) != 0) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (pmap_pte_dirty(tpte)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, tpde, &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(lock); vm_page_free_pages_toq(&free, true); } /* * pmap_protect_l2: do the things to protect a 2MB page in a pmap */ static void pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask, pt_entry_t nbits) { pd_entry_t old_l2; vm_page_t m, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L2_OFFSET) == 0, ("pmap_protect_l2: sva is not 2mpage aligned")); old_l2 = pmap_load(l2); KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2)); /* * Return if the L2 entry already has the desired access restrictions * in place. */ retry: if ((old_l2 & mask) == nbits) return; /* * When a dirty read/write superpage mapping is write protected, * update the dirty field of each of the superpage's constituent 4KB * pages. */ if ((old_l2 & ATTR_SW_MANAGED) != 0 && (nbits & ATTR_AP(ATTR_AP_RO)) != 0 && pmap_pte_dirty(old_l2)) { m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK); for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); } if (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits)) goto retry; /* * Since a promotion must break the 4KB page mappings before making * the 2MB page mapping, a pmap_invalidate_page() suffices. */ pmap_invalidate_page(pmap, sva); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va, va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3p, l3, mask, nbits; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } mask = nbits = 0; if ((prot & VM_PROT_WRITE) == 0) { mask |= ATTR_AP_RW_BIT | ATTR_SW_DBM; nbits |= ATTR_AP(ATTR_AP_RO); } if ((prot & VM_PROT_EXECUTE) == 0) { mask |= ATTR_XN; nbits |= ATTR_XN; } if (mask == 0) return; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_protect_l2(pmap, l2, sva, mask, nbits); continue; } else if (pmap_demote_l2(pmap, l2, sva) == NULL) continue; } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_protect: Invalid L2 entry after demotion")); if (va_next > eva) va_next = eva; va = va_next; for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++, sva += L3_SIZE) { l3 = pmap_load(l3p); retry: /* * Go to the next L3 entry if the current one is * invalid or already has the desired access * restrictions in place. (The latter case occurs * frequently. For example, in a "buildworld" * workload, almost 1 out of 4 L3 entries already * have the desired restrictions.) */ if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } /* * When a dirty read/write mapping is write protected, * update the page's dirty field. */ if ((l3 & ATTR_SW_MANAGED) != 0 && (nbits & ATTR_AP(ATTR_AP_RO)) != 0 && pmap_pte_dirty(l3)) vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK)); if (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | nbits)) goto retry; if (va == va_next) va = sva; } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } PMAP_UNLOCK(pmap); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "mpte" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); } /* * Performs a break-before-make update of a pmap entry. This is needed when * either promoting or demoting pages to ensure the TLB doesn't get into an * inconsistent state. */ static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte, vm_offset_t va, vm_size_t size) { register_t intr; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Ensure we don't get switched out with the page table in an * inconsistent state. We also need to ensure no interrupts fire * as they may make use of an address we are about to invalidate. */ intr = intr_disable(); critical_enter(); /* Clear the old mapping */ pmap_clear(pte); pmap_invalidate_range_nopin(pmap, va, va + size); /* Create the new mapping */ pmap_store(pte, newpte); dsb(ishst); critical_exit(); intr_restore(intr); } #if VM_NRESERVLEVEL > 0 /* * After promotion from 512 4KB page mappings to a single 2MB page mapping, * replace the many pv entries for the 4KB page mappings by a single pv entry * for the 2MB page mapping. */ static void pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; KASSERT((pa & L2_OFFSET) == 0, ("pmap_pv_promote_l2: pa is not 2mpage aligned")); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's * pv list. Aside from avoiding the cost of a call to get_pv_entry(), * a transfer avoids the possibility that get_pv_entry() calls * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the * mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = va & ~L2_OFFSET; pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + L2_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } /* * Tries to promote the 512, contiguous 4KB page mappings that are within a * single level 2 table entry to a single 2MB page mapping. For promotion * to occur, two conditions must be met: (1) the 4KB page mappings must map * aligned, contiguous physical memory and (2) the 4KB page mappings must have * identical characteristics. */ static void pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *firstl3, *l3, newl2, oldl3, pa; vm_page_t mpte; vm_offset_t sva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); sva = va & ~L2_OFFSET; firstl3 = pmap_l2_to_l3(l2, sva); newl2 = pmap_load(firstl3); setl2: if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" " in pmap %p", va, pmap); return; } if ((newl2 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) { if (!atomic_fcmpset_64(l2, &newl2, newl2 & ~ATTR_SW_DBM)) goto setl2; newl2 &= ~ATTR_SW_DBM; } pa = newl2 + L2_SIZE - PAGE_SIZE; for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) { oldl3 = pmap_load(l3); setl3: if ((oldl3 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) { if (!atomic_fcmpset_64(l3, &oldl3, oldl3 & ~ATTR_SW_DBM)) goto setl3; oldl3 &= ~ATTR_SW_DBM; } if (oldl3 != pa) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the L2 * mapping the superpage is demoted by pmap_demote_l2() or * destroyed by pmap_remove_l3(). */ mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_l2: page table page is out of range")); KASSERT(mpte->pindex == pmap_l2_pindex(va), ("pmap_promote_l2: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte, true)) { atomic_add_long(&pmap_l2_p_failures, 1); CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx in pmap %p", va, pmap); return; } if ((newl2 & ATTR_SW_MANAGED) != 0) pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp); newl2 &= ~ATTR_DESCR_MASK; newl2 |= L2_BLOCK; pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE); atomic_add_long(&pmap_l2_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pd_entry_t *pde; pt_entry_t new_l3, orig_l3; pt_entry_t *l2, *l3; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; boolean_t nosleep; int lvl, rv; va = trunc_page(va); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); pa = VM_PAGE_TO_PHYS(m); new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | L3_PAGE); if ((prot & VM_PROT_WRITE) == 0) new_l3 |= ATTR_AP(ATTR_AP_RO); if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY) new_l3 |= ATTR_XN; if ((flags & PMAP_ENTER_WIRED) != 0) new_l3 |= ATTR_SW_WIRED; if (va < VM_MAXUSER_ADDRESS) new_l3 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN; if ((m->oflags & VPO_UNMANAGED) == 0) { new_l3 |= ATTR_SW_MANAGED; if ((prot & VM_PROT_WRITE) != 0) { new_l3 |= ATTR_SW_DBM; if ((flags & VM_PROT_WRITE) == 0) new_l3 |= ATTR_AP(ATTR_AP_RO); } } CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); lock = NULL; PMAP_LOCK(pmap); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK, flags, m, &lock); goto out; } mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ retry: pde = pmap_pde(pmap, va, &lvl); if (pde != NULL && lvl == 2) { l3 = pmap_l2_to_l3(pde, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); mpte->wire_count++; } goto havel3; } else if (pde != NULL && lvl == 1) { l2 = pmap_l1_to_l2(pde, va); if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK && (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) { l3 = &l3[pmap_l3_index(va)]; if (va < VM_MAXUSER_ADDRESS) { mpte = PHYS_TO_VM_PAGE( pmap_load(l2) & ~ATTR_MASK); mpte->wire_count++; } goto havel3; } /* We need to allocate an L3 table. */ } if (va < VM_MAXUSER_ADDRESS) { nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; /* * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order * to handle the possibility that a superpage mapping for "va" * was created while we slept. */ mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va), nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); rv = KERN_RESOURCE_SHORTAGE; goto out; } goto retry; } else panic("pmap_enter: missing L3 table for kernel va %#lx", va); havel3: orig_l3 = pmap_load(l3); opa = orig_l3 & ~ATTR_MASK; pv = NULL; /* * Is the specified virtual address already mapped? */ if (pmap_l3_valid(orig_l3)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((flags & PMAP_ENTER_WIRED) != 0 && (orig_l3 & ATTR_SW_WIRED) == 0) pmap->pm_stats.wired_count++; else if ((flags & PMAP_ENTER_WIRED) == 0 && (orig_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((orig_l3 & ATTR_SW_MANAGED) != 0 && (new_l3 & ATTR_SW_DBM) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. */ orig_l3 = pmap_load_clear(l3); KASSERT((orig_l3 & ~ATTR_MASK) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((orig_l3 & ATTR_SW_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if (pmap_pte_dirty(orig_l3)) vm_page_dirty(om); if ((orig_l3 & ATTR_AF) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); if ((m->oflags & VPO_UNMANAGED) != 0) free_pv_entry(pmap, pv); if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } pmap_invalidate_page(pmap, va); orig_l3 = 0; } else { /* * Increment the counters. */ if ((new_l3 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((new_l3 & ATTR_SW_DBM) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } validate: /* * Sync icache if exec permission and attribute VM_MEMATTR_WRITE_BACK * is set. Do it now, before the mapping is stored and made * valid for hardware table walk. If done later, then other can * access this page before caches are properly synced. * Don't do it for kernel memory which is mapped with exec * permission even if the memory isn't going to hold executable * code. The only time when icache sync is needed is after * kernel module is loaded and the relocation info is processed. * And it's done in elf_cpu_load_file(). */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK && (opa != pa || (orig_l3 & ATTR_XN))) cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); /* * Update the L3 entry */ if (pmap_l3_valid(orig_l3)) { KASSERT(opa == pa, ("pmap_enter: invalid update")); if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) { /* same PA, different attributes */ /* XXXMJ need to reload orig_l3 for hardware DBM. */ pmap_load_store(l3, new_l3); pmap_invalidate_page(pmap, va); if ((orig_l3 & ATTR_SW_MANAGED) != 0 && pmap_pte_dirty(orig_l3)) vm_page_dirty(m); } else { /* * orig_l3 == new_l3 * This can happens if multiple threads simultaneously * access not yet mapped page. This bad for performance * since this can cause full demotion-NOP-promotion * cycle. * Another possible reasons are: * - VM and pmap memory layout are diverged * - tlb flush is missing somewhere and CPU doesn't see * actual mapping. */ CTR4(KTR_PMAP, "%s: already mapped page - " "pmap %p va 0x%#lx pte 0x%lx", __func__, pmap, va, new_l3); } } else { /* New mapping */ pmap_store(l3, new_l3); dsb(ishst); } #if VM_NRESERVLEVEL > 0 if (pmap != pmap_kernel() && (mpte == NULL || mpte->wire_count == NL3PG) && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { pmap_promote_l2(pmap, pde, va, &lock); } #endif rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. Returns true * if successful. Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t new_l2; PMAP_LOCK_ASSERT(pmap, MA_OWNED); new_l2 = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | ATTR_AP(ATTR_AP_RO) | L2_BLOCK); if ((m->oflags & VPO_UNMANAGED) == 0) { new_l2 |= ATTR_SW_MANAGED; new_l2 &= ~ATTR_AF; } if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY) new_l2 |= ATTR_XN; if (va < VM_MAXUSER_ADDRESS) new_l2 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN; return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pd_entry_t *l2, old_l2; vm_page_t l2pg, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); l2 = &l2[pmap_l2_index(va)]; if ((old_l2 = pmap_load(l2)) != 0) { KASSERT(l2pg->wire_count > 1, ("pmap_enter_l2: l2pg's wire count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { l2pg->wire_count--; CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_FAILURE); } SLIST_INIT(&free); if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, lockp); else pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE, &free, lockp); vm_page_free_pages_toq(&free, true); if (va >= VM_MAXUSER_ADDRESS) { /* * Both pmap_remove_l2() and pmap_remove_l3_range() * will leave the kernel page table page zero filled. * Nonetheless, the TLB could have an intermediate * entry for the kernel page table page. */ mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_l2: trie insert failed"); pmap_clear(l2); pmap_invalidate_page(pmap, va); } else KASSERT(pmap_load(l2) == 0, ("pmap_enter_l2: non-zero L2 entry %p", l2)); } if ((new_l2 & ATTR_SW_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { SLIST_INIT(&free); if (pmap_unwire_l3(pmap, va, l2pg, &free)) { /* * Although "va" is not mapped, the TLB could * nonetheless have intermediate entries that * refer to the freed page table pages. * Invalidate those entries. * * XXX redundant invalidation (See * _pmap_unwire_l3().) */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((new_l2 & ATTR_SW_DBM) != 0) for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } /* * Increment counters. */ if ((new_l2 & ATTR_SW_WIRED) != 0) pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; /* * Map the superpage. */ pmap_store(l2, new_l2); dsb(ishst); atomic_add_long(&pmap_l2_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[L2_SIZE / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; pd_entry_t *pde; pt_entry_t *l2, *l3, l3_val; vm_paddr_t pa; int lvl; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t l2pindex; /* * Calculate pagetable page index */ l2pindex = pmap_l2_pindex(va); if (mpte && (mpte->pindex == l2pindex)) { mpte->wire_count++; } else { /* * Get the l2 entry */ pde = pmap_pde(pmap, va, &lvl); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (lvl == 1) { l2 = pmap_l1_to_l2(pde, va); if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) return (NULL); } if (lvl == 2 && pmap_load(pde) != 0) { mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK); mpte->wire_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); if (mpte == NULL) return (mpte); } } l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); l3 = &l3[pmap_l3_index(va)]; } else { mpte = NULL; pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_enter_quick_locked: Invalid level %d", lvl)); l3 = pmap_l2_to_l3(pde, va); } /* * Abort if a mapping already exists. */ if (pmap_load(l3) != 0) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_l3(pmap, va, mpte, &free)) { pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); pa = VM_PAGE_TO_PHYS(m); l3_val = pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) | ATTR_AP(ATTR_AP_RO) | L3_PAGE; if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY) l3_val |= ATTR_XN; else if (va < VM_MAXUSER_ADDRESS) l3_val |= ATTR_PXN; /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) == 0) { l3_val |= ATTR_SW_MANAGED; l3_val &= ~ATTR_AF; } /* Sync icache before the mapping is stored to PTE */ if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE); pmap_store(l3, l3_val); dsb(ishst); return (mpte); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pd_entry_t *l0, *l1, *l2; pt_entry_t *l3; PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l0 = pmap_l0(pmap, sva); if (pmap_load(l0) == 0) { va_next = (sva + L0_SIZE) & ~L0_OFFSET; if (va_next < sva) va_next = eva; continue; } l1 = pmap_l0_to_l1(l0, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (pmap_load(l2) == 0) continue; if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) { if ((pmap_load(l2) & ATTR_SW_WIRED) == 0) panic("pmap_unwire: l2 %#jx is missing " "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2)); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + L2_SIZE == va_next && eva >= va_next) { pmap_clear_bits(l2, ATTR_SW_WIRED); pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; continue; } else if (pmap_demote_l2(pmap, l2, sva) == NULL) panic("pmap_unwire: demotion failed"); } KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_unwire: Invalid l2 entry after demotion")); if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if (pmap_load(l3) == 0) continue; if ((pmap_load(l3) & ATTR_SW_WIRED) == 0) panic("pmap_unwire: l3 %#jx is missing " "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3)); /* * ATTR_SW_WIRED must be cleared atomically. Although * the pmap lock synchronizes access to ATTR_SW_WIRED, * the System MMU may write to the entry concurrently. */ pmap_clear_bits(l3, ATTR_SW_WIRED); pmap->pm_stats.wired_count--; } } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. * * Because the executable mappings created by this routine are copied, * it should not have to flush the instruction cache. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct rwlock *lock; struct spglist free; pd_entry_t *l0, *l1, *l2, srcptepaddr; pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte; vm_offset_t addr, end_addr, va_next; vm_page_t dst_l2pg, dstmpte, srcmpte; if (dst_addr != src_addr) return; end_addr = src_addr + len; lock = NULL; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } for (addr = src_addr; addr < end_addr; addr = va_next) { l0 = pmap_l0(src_pmap, addr); if (pmap_load(l0) == 0) { va_next = (addr + L0_SIZE) & ~L0_OFFSET; if (va_next < addr) va_next = end_addr; continue; } l1 = pmap_l0_to_l1(l0, addr); if (pmap_load(l1) == 0) { va_next = (addr + L1_SIZE) & ~L1_OFFSET; if (va_next < addr) va_next = end_addr; continue; } va_next = (addr + L2_SIZE) & ~L2_OFFSET; if (va_next < addr) va_next = end_addr; l2 = pmap_l1_to_l2(l1, addr); srcptepaddr = pmap_load(l2); if (srcptepaddr == 0) continue; if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) { if ((addr & L2_OFFSET) != 0 || addr + L2_SIZE > end_addr) continue; dst_l2pg = pmap_alloc_l2(dst_pmap, addr, NULL); if (dst_l2pg == NULL) break; l2 = (pd_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_l2pg)); l2 = &l2[pmap_l2_index(addr)]; if (pmap_load(l2) == 0 && ((srcptepaddr & ATTR_SW_MANAGED) == 0 || pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM, &lock))) { mask = ATTR_AF | ATTR_SW_WIRED; nbits = 0; if ((srcptepaddr & ATTR_SW_DBM) != 0) nbits |= ATTR_AP_RW_BIT; pmap_store(l2, (srcptepaddr & ~mask) | nbits); pmap_resident_count_inc(dst_pmap, L2_SIZE / PAGE_SIZE); atomic_add_long(&pmap_l2_mappings, 1); } else dst_l2pg->wire_count--; continue; } KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_copy: invalid L2 entry")); srcptepaddr &= ~ATTR_MASK; srcmpte = PHYS_TO_VM_PAGE(srcptepaddr); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (va_next > end_addr) va_next = end_addr; src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr); src_pte = &src_pte[pmap_l3_index(addr)]; dstmpte = NULL; for (; addr < va_next; addr += PAGE_SIZE, src_pte++) { ptetemp = pmap_load(src_pte); /* * We only virtual copy managed pages. */ if ((ptetemp & ATTR_SW_MANAGED) == 0) continue; if (dstmpte != NULL) { KASSERT(dstmpte->pindex == pmap_l2_pindex(addr), ("dstmpte pindex/addr mismatch")); dstmpte->wire_count++; } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr, NULL)) == NULL) goto out; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_l3_index(addr)]; if (pmap_load(dst_pte) == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) { /* * Clear the wired, modified, and accessed * (referenced) bits during the copy. */ mask = ATTR_AF | ATTR_SW_WIRED; nbits = 0; if ((ptetemp & ATTR_SW_DBM) != 0) nbits |= ATTR_AP_RW_BIT; pmap_store(dst_pte, (ptetemp & ~mask) | nbits); pmap_resident_count_inc(dst_pmap, 1); } else { SLIST_INIT(&free); if (pmap_unwire_l3(dst_pmap, addr, dstmpte, &free)) { /* * Although "addr" is not mapped, * the TLB could nonetheless have * intermediate entries that refer * to the freed page table pages. * Invalidate those entries. * * XXX redundant invalidation */ pmap_invalidate_page(dst_pmap, addr); vm_page_free_pages_toq(&free, true); } goto out; } /* Have we copied all of the valid mappings? */ if (dstmpte->wire_count >= srcmpte->wire_count) break; } } out: /* * XXX This barrier may not be needed because the destination pmap is * not active. */ dsb(ishst); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t m_a, m_b; vm_paddr_t p_a, p_b; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; m_a = ma[a_offset >> PAGE_SHIFT]; p_a = m_a->phys_addr; b_pg_offset = b_offset & PAGE_MASK; m_b = mb[b_offset >> PAGE_SHIFT]; p_b = m_b->phys_addr; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); if (__predict_false(!PHYS_IN_DMAP(p_a))) { panic("!DMAP a %lx", p_a); } else { a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; } if (__predict_false(!PHYS_IN_DMAP(p_b))) { panic("!DMAP b %lx", p_b); } else { b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; } bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); } void pmap_quick_remove_page(vm_offset_t addr) { } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; int count, lvl, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); return (count); } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. */ void pmap_remove_pages(pmap_t pmap) { pd_entry_t *pde; pt_entry_t *pte, tpte; struct spglist free; vm_page_t m, ml3, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx, lvl; vm_paddr_t pa; lock = NULL; SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("Attempting to remove an unmapped page")); switch(lvl) { case 1: pte = pmap_l1_to_l2(pde, pv->pv_va); tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK, ("Attempting to remove an invalid " "block: %lx", tpte)); tpte = pmap_load(pte); break; case 2: pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE, ("Attempting to remove an invalid " "page: %lx", tpte)); break; default: panic( "Invalid page directory level: %d", lvl); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & ATTR_SW_WIRED) { allfree = 0; continue; } pa = tpte & ~ATTR_MASK; m = PHYS_TO_VM_PAGE(pa); KASSERT(m->phys_addr == pa, ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad pte %#jx", (uintmax_t)tpte)); /* * Because this pmap is not active on other * processors, the dirty bit cannot have * changed state since we last loaded pte. */ pmap_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if (pmap_pte_dirty(tpte)) { switch (lvl) { case 1: for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_dirty(mt); break; case 2: vm_page_dirty(m); break; } } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; switch (lvl) { case 1: pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); pvh = pa_to_pvh(tpte & ~ATTR_MASK); TAILQ_REMOVE(&pvh->pv_list, pv,pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) if ((mt->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } ml3 = pmap_remove_pt_page(pmap, pv->pv_va); if (ml3 != NULL) { KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: l3 page not promoted")); pmap_resident_count_dec(pmap,1); KASSERT(ml3->wire_count == NL3PG, ("pmap_remove_pages: l3 page wire count error")); ml3->wire_count = 0; pmap_add_delayed_free_list(ml3, &free, FALSE); } break; case 2: pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((m->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh( VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } break; } pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde), &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } pmap_invalidate_all(pmap); if (lock != NULL) rw_wunlock(lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * This is used to check if a page has been accessed or modified. As we * don't have a bit to see if it has been modified we have to assume it * has been if the page is read/write. */ static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask, value; pmap_t pmap; int lvl, md_gen, pvh_gen; boolean_t rv; rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 3, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_AP_RW_BIT; value |= ATTR_AP(ATTR_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L3_PAGE; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); KASSERT(lvl == 2, ("pmap_page_test_mappings: Invalid level %d", lvl)); mask = 0; value = 0; if (modified) { mask |= ATTR_AP_RW_BIT; value |= ATTR_AP(ATTR_AP_RW); } if (accessed) { mask |= ATTR_AF | ATTR_DESCR_MASK; value |= ATTR_AF | L2_BLOCK; } rv = (pmap_load(pte) & mask) == value; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *pte; boolean_t rv; int lvl; rv = FALSE; PMAP_LOCK(pmap); pte = pmap_pte(pmap, addr, &lvl); if (pte != NULL && pmap_load(pte) != 0) { rv = TRUE; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pmap_t pmap; struct rwlock *lock; pv_entry_t next_pv, pv; pt_entry_t oldpte, *pte; vm_offset_t va; int lvl, md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } va = pv->pv_va; pte = pmap_pte(pmap, pv->pv_va, &lvl); if ((pmap_load(pte) & ATTR_SW_DBM) != 0) (void)pmap_demote_l2_locked(pmap, pte, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } pte = pmap_pte(pmap, pv->pv_va, &lvl); oldpte = pmap_load(pte); retry: if ((oldpte & ATTR_SW_DBM) != 0) { if (!atomic_fcmpset_long(pte, &oldpte, (oldpte | ATTR_AP_RW_BIT) & ~ATTR_SW_DBM)) goto retry; if ((oldpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; struct rwlock *lock; pd_entry_t *pde, tpde; pt_entry_t *pte, tpte; vm_offset_t va; vm_paddr_t pa; int cleared, lvl, md_gen, not_cleared, pvh_gen; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found")); KASSERT(lvl == 1, ("pmap_ts_referenced: invalid pde level %d", lvl)); tpde = pmap_load(pde); KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE, ("pmap_ts_referenced: found an invalid l1 table")); pte = pmap_l1_to_l2(pde, pv->pv_va); tpte = pmap_load(pte); if (pmap_pte_dirty(tpte)) { /* * Although "tpte" is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((tpte & ATTR_AF) != 0) { /* * Since this reference bit is shared by 512 4KB pages, * it should not be cleared every time it is tested. * Apply a simple "hash" function on the physical page * number, the virtual superpage number, and the pmap * address to select one 4KB page out of the 512 on * which testing the reference bit will result in * clearing that reference bit. This function is * designed to avoid the selection of the same 4KB page * for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && (tpte & ATTR_SW_WIRED) == 0) { pmap_clear_bits(pte, ATTR_AF); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } pde = pmap_pde(pmap, pv->pv_va, &lvl); KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found")); KASSERT(lvl == 2, ("pmap_ts_referenced: invalid pde level %d", lvl)); tpde = pmap_load(pde); KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE, ("pmap_ts_referenced: found an invalid l2 table")); pte = pmap_l2_to_l3(pde, pv->pv_va); tpte = pmap_load(pte); if (pmap_pte_dirty(tpte)) vm_page_dirty(m); if ((tpte & ATTR_AF) != 0) { if ((tpte & ATTR_SW_WIRED) == 0) { pmap_clear_bits(pte, ATTR_AF); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); vm_page_free_pages_toq(&free, true); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { + struct rwlock *lock; + vm_offset_t va, va_next; + vm_page_t m; + pd_entry_t *l0, *l1, *l2, oldl2; + pt_entry_t *l3, oldl3; + + if (advice != MADV_DONTNEED && advice != MADV_FREE) + return; + + PMAP_LOCK(pmap); + for (; sva < eva; sva = va_next) { + l0 = pmap_l0(pmap, sva); + if (pmap_load(l0) == 0) { + va_next = (sva + L0_SIZE) & ~L0_OFFSET; + if (va_next < sva) + va_next = eva; + continue; + } + l1 = pmap_l0_to_l1(l0, sva); + if (pmap_load(l1) == 0) { + va_next = (sva + L1_SIZE) & ~L1_OFFSET; + if (va_next < sva) + va_next = eva; + continue; + } + va_next = (sva + L2_SIZE) & ~L2_OFFSET; + if (va_next < sva) + va_next = eva; + l2 = pmap_l1_to_l2(l1, sva); + oldl2 = pmap_load(l2); + if (oldl2 == 0) + continue; + if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) { + if ((oldl2 & ATTR_SW_MANAGED) == 0) + continue; + lock = NULL; + if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { + if (lock != NULL) + rw_wunlock(lock); + + /* + * The 2MB page mapping was destroyed. + */ + continue; + } + + /* + * Unless the page mappings are wired, remove the + * mapping to a single page so that a subsequent + * access may repromote. Since the underlying page + * table page is fully populated, this removal never + * frees a page table page. + */ + if ((oldl2 & ATTR_SW_WIRED) == 0) { + l3 = pmap_l2_to_l3(l2, sva); + KASSERT(pmap_load(l3) != 0, + ("pmap_advise: invalid PTE")); + pmap_remove_l3(pmap, l3, sva, pmap_load(l2), + NULL, &lock); + } + if (lock != NULL) + rw_wunlock(lock); + } + KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE, + ("pmap_advise: invalid L2 entry after demotion")); + if (va_next > eva) + va_next = eva; + va = va_next; + for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, + sva += L3_SIZE) { + oldl3 = pmap_load(l3); + if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) != + (ATTR_SW_MANAGED | L3_PAGE)) + goto maybe_invlrng; + else if (pmap_pte_dirty(oldl3)) { + if (advice == MADV_DONTNEED) { + /* + * Future calls to pmap_is_modified() + * can be avoided by making the page + * dirty now. + */ + m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK); + vm_page_dirty(m); + } + while (!atomic_fcmpset_long(l3, &oldl3, + (oldl3 & ~ATTR_AF) | ATTR_AP(ATTR_AP_RO))) + cpu_spinwait(); + } else if ((oldl3 & ATTR_AF) != 0) + pmap_clear_bits(l3, ATTR_AF); + else + goto maybe_invlrng; + if (va == va_next) + va = sva; + continue; +maybe_invlrng: + if (va != va_next) { + pmap_invalidate_range(pmap, va, sva); + va = va_next; + } + } + if (va != va_next) + pmap_invalidate_range(pmap, va, sva); + } + PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t *l2, oldl2; pt_entry_t *l3, oldl3; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have ATTR_SW_DBM * set. If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); oldl2 = pmap_load(l2); - if ((oldl2 & ATTR_SW_DBM) != 0) { - if (pmap_demote_l2_locked(pmap, l2, va, &lock)) { - if ((oldl2 & ATTR_SW_WIRED) == 0) { - /* - * Write protect the mapping to a - * single page so that a subsequent - * write access may repromote. - */ - va += VM_PAGE_TO_PHYS(m) - - (oldl2 & ~ATTR_MASK); - l3 = pmap_l2_to_l3(l2, va); - oldl3 = pmap_load(l3); - if (pmap_l3_valid(oldl3)) { - while (!atomic_fcmpset_long(l3, - &oldl3, (oldl3 & ~ATTR_SW_DBM) | - ATTR_AP(ATTR_AP_RO))) - cpu_spinwait(); - vm_page_dirty(m); - pmap_invalidate_page(pmap, va); - } - } - } + /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */ + if ((oldl2 & ATTR_SW_DBM) != 0 && + pmap_demote_l2_locked(pmap, l2, va, &lock) && + (oldl2 & ATTR_SW_WIRED) == 0) { + /* + * Write protect the mapping to a single page so that + * a subsequent write access may repromote. + */ + va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK); + l3 = pmap_l2_to_l3(l2, va); + oldl3 = pmap_load(l3); + while (!atomic_fcmpset_long(l3, &oldl3, + (oldl3 & ~ATTR_SW_DBM) | ATTR_AP(ATTR_AP_RO))) + cpu_spinwait(); + vm_page_dirty(m); + pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); l3 = pmap_l2_to_l3(l2, pv->pv_va); oldl3 = pmap_load(l3); if (pmap_l3_valid(oldl3) && (oldl3 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) { pmap_set_bits(l3, ATTR_AP(ATTR_AP_RO)); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; pd_entry_t *pde; pt_entry_t *l2; int i, lvl, l2_blocks, free_l2_count, start_idx; if (!vm_initialized) { /* * No L3 ptables so map entire L2 blocks where start VA is: * preinit_map_va + start_idx * L2_SIZE * There may be duplicate mappings (multiple VA -> same PA) but * ARM64 dcache is always PIPT so that's acceptable. */ if (size == 0) return (NULL); /* Calculate how many L2 blocks are needed for the mapping */ l2_blocks = (roundup2(pa + size, L2_SIZE) - rounddown2(pa, L2_SIZE)) >> L2_SHIFT; offset = pa & L2_OFFSET; if (preinit_map_va == 0) return (NULL); /* Map 2MiB L2 blocks from reserved VA space */ free_l2_count = 0; start_idx = -1; /* Find enough free contiguous VA space */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (free_l2_count > 0 && ppim->pa != 0) { /* Not enough space here */ free_l2_count = 0; start_idx = -1; continue; } if (ppim->pa == 0) { /* Free L2 block */ if (start_idx == -1) start_idx = i; free_l2_count++; if (free_l2_count == l2_blocks) break; } } if (free_l2_count != l2_blocks) panic("%s: too many preinit mappings", __func__); va = preinit_map_va + (start_idx * L2_SIZE); for (i = start_idx; i < start_idx + l2_blocks; i++) { /* Mark entries as allocated */ ppim = pmap_preinit_mapping + i; ppim->pa = pa; ppim->va = va + offset; ppim->size = size; } /* Map L2 blocks */ pa = rounddown2(pa, L2_SIZE); for (i = 0; i < l2_blocks; i++) { pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_mapbios: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 1, ("pmap_mapbios: Invalid level %d", lvl)); /* Insert L2_BLOCK */ l2 = pmap_l1_to_l2(pde, va); pmap_load_store(l2, pa | ATTR_DEFAULT | ATTR_XN | ATTR_IDX(CACHED_MEMORY) | L2_BLOCK); va += L2_SIZE; pa += L2_SIZE; } pmap_invalidate_all(kernel_pmap); va = preinit_map_va + (start_idx * L2_SIZE); } else { /* kva_alloc may be used to map the pages */ offset = pa & PAGE_MASK; size = round_page(offset + size); va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl)); /* L3 table is linked */ va = trunc_page(va); pa = trunc_page(pa); pmap_kenter(va, size, pa, CACHED_MEMORY); } return ((void *)(va + offset)); } void pmap_unmapbios(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset, tmpsize, va_trunc; pd_entry_t *pde; pt_entry_t *l2; int i, lvl, l2_blocks, block; bool preinit_map; l2_blocks = (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT; KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size)); /* Remove preinit mapping */ preinit_map = false; block = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va) { KASSERT(ppim->size == size, ("pmap_unmapbios: size mismatch")); ppim->va = 0; ppim->pa = 0; ppim->size = 0; preinit_map = true; offset = block * L2_SIZE; va_trunc = rounddown2(va, L2_SIZE) + offset; /* Remove L2_BLOCK */ pde = pmap_pde(kernel_pmap, va_trunc, &lvl); KASSERT(pde != NULL, ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va_trunc)); l2 = pmap_l1_to_l2(pde, va_trunc); pmap_clear(l2); if (block == (l2_blocks - 1)) break; block++; } } if (preinit_map) { pmap_invalidate_all(kernel_pmap); return; } /* Unmap the pages reserved with kva_alloc. */ if (vm_initialized) { offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); pde = pmap_pde(kernel_pmap, va, &lvl); KASSERT(pde != NULL, ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va)); KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl)); /* Unmap and invalidate the pages */ for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kremove(va + tmpsize); kva_free(va, size); } } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pv_memattr = ma; /* * If "m" is a normal page, update its direct mapping. This update * can be relied upon to perform any cache operations that are * required for data coherence. */ if ((m->flags & PG_FICTITIOUS) == 0 && pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE, m->md.pv_memattr) != 0) panic("memory attribute change on the direct map failed"); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the direct map or the kernel map. If * the virtual address range is contained within the kernel map, then the * memory type for each of the corresponding ranges of the direct map is also * changed. (The corresponding ranges of the direct map are those ranges that * map the same physical pages as the specified virtual address range.) These * changes to the direct map are necessary because Intel describes the * behavior of their processors as "undefined" if two or more mappings to the * same physical page have different memory types. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. In the * latter case, the memory type may have been changed on some part of the * virtual address range or the direct map. */ static int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { int error; PMAP_LOCK(kernel_pmap); error = pmap_change_attr_locked(va, size, mode); PMAP_UNLOCK(kernel_pmap); return (error); } static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pt_entry_t l3, *pte, *newpte; int lvl; PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED); base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); if (!VIRT_IN_DMAP(base)) return (EINVAL); for (tmpva = base; tmpva < base + size; ) { pte = pmap_pte(kernel_pmap, tmpva, &lvl); if (pte == NULL) return (EINVAL); if ((pmap_load(pte) & ATTR_IDX_MASK) == ATTR_IDX(mode)) { /* * We already have the correct attribute, * ignore this entry. */ switch (lvl) { default: panic("Invalid DMAP table level: %d\n", lvl); case 1: tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE; break; case 2: tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE; break; case 3: tmpva += PAGE_SIZE; break; } } else { /* * Split the entry to an level 3 table, then * set the new attribute. */ switch (lvl) { default: panic("Invalid DMAP table level: %d\n", lvl); case 1: newpte = pmap_demote_l1(kernel_pmap, pte, tmpva & ~L1_OFFSET); if (newpte == NULL) return (EINVAL); pte = pmap_l1_to_l2(pte, tmpva); case 2: newpte = pmap_demote_l2(kernel_pmap, pte, tmpva); if (newpte == NULL) return (EINVAL); pte = pmap_l2_to_l3(pte, tmpva); case 3: /* Update the entry */ l3 = pmap_load(pte); l3 &= ~ATTR_IDX_MASK; l3 |= ATTR_IDX(mode); if (mode == DEVICE_MEMORY) l3 |= ATTR_XN; pmap_update_entry(kernel_pmap, pte, l3, tmpva, PAGE_SIZE); /* * If moving to a non-cacheable entry flush * the cache. */ if (mode == VM_MEMATTR_UNCACHEABLE) cpu_dcache_wbinv_range(tmpva, L3_SIZE); break; } tmpva += PAGE_SIZE; } } return (0); } /* * Create an L2 table to map all addresses within an L1 mapping. */ static pt_entry_t * pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va) { pt_entry_t *l2, newl2, oldl1; vm_offset_t tmpl1; vm_paddr_t l2phys, phys; vm_page_t ml2; int i; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldl1 = pmap_load(l1); KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK, ("pmap_demote_l1: Demoting a non-block entry")); KASSERT((va & L1_OFFSET) == 0, ("pmap_demote_l1: Invalid virtual address %#lx", va)); KASSERT((oldl1 & ATTR_SW_MANAGED) == 0, ("pmap_demote_l1: Level 1 table shouldn't be managed")); tmpl1 = 0; if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) { tmpl1 = kva_alloc(PAGE_SIZE); if (tmpl1 == 0) return (NULL); } if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx" " in pmap %p", va, pmap); return (NULL); } l2phys = VM_PAGE_TO_PHYS(ml2); l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys); /* Address the range points at */ phys = oldl1 & ~ATTR_MASK; /* The attributed from the old l1 table to be copied */ newl2 = oldl1 & ATTR_MASK; /* Create the new entries */ for (i = 0; i < Ln_ENTRIES; i++) { l2[i] = newl2 | phys; phys += L2_SIZE; } KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0], (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK)); if (tmpl1 != 0) { pmap_kenter(tmpl1, PAGE_SIZE, DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, CACHED_MEMORY); l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK)); } pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE); if (tmpl1 != 0) { pmap_kremove(tmpl1); kva_free(tmpl1, PAGE_SIZE); } return (l2); } static void pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3) { pt_entry_t *l3; for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) { *l3 = newl3; newl3 += L3_SIZE; } } static void pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2, struct rwlock **lockp) { struct spglist free; SLIST_INIT(&free); (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, lockp); vm_page_free_pages_toq(&free, true); } /* * Create an L3 table to map all addresses within an L2 mapping. */ static pt_entry_t * pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *l3, newl3, oldl2; vm_offset_t tmpl2; vm_paddr_t l3phys; vm_page_t ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); l3 = NULL; oldl2 = pmap_load(l2); KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK, ("pmap_demote_l2: Demoting a non-block entry")); va &= ~L2_OFFSET; tmpl2 = 0; if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) { tmpl2 = kva_alloc(PAGE_SIZE); if (tmpl2 == 0) return (NULL); } /* * Invalidate the 2MB page mapping and return "failure" if the * mapping was never accessed. */ if ((oldl2 & ATTR_AF) == 0) { KASSERT((oldl2 & ATTR_SW_WIRED) == 0, ("pmap_demote_l2: a wired mapping is missing ATTR_AF")); pmap_demote_l2_abort(pmap, va, l2, lockp); CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p", va, pmap); goto fail; } if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldl2 & ATTR_SW_WIRED) == 0, ("pmap_demote_l2: page table page for a wired mapping" " is missing")); /* * If the page table page is missing and the mapping * is for a kernel address, the mapping must belong to * the direct map. Page table pages are preallocated * for every other part of the kernel address space, * so the direct map region is the only part of the * kernel address space that must be handled here. */ KASSERT(va < VM_MAXUSER_ADDRESS || VIRT_IN_DMAP(va), ("pmap_demote_l2: No saved mpte for va %#lx", va)); /* * If the 2MB page mapping belongs to the direct map * region of the kernel's address space, then the page * allocation request specifies the highest possible * priority (VM_ALLOC_INTERRUPT). Otherwise, the * priority is normal. */ ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); /* * If the allocation of the new page table page fails, * invalidate the 2MB page mapping and return "failure". */ if (ml3 == NULL) { pmap_demote_l2_abort(pmap, va, l2, lockp); CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx" " in pmap %p", va, pmap); goto fail; } if (va < VM_MAXUSER_ADDRESS) { ml3->wire_count = NL3PG; pmap_resident_count_inc(pmap, 1); } } l3phys = VM_PAGE_TO_PHYS(ml3); l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys); newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE; KASSERT((oldl2 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) != (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM), ("pmap_demote_l2: L2 entry is writeable but not dirty")); /* * If the page table page is not leftover from an earlier promotion, * or the mapping attributes have changed, (re)initialize the L3 table. */ if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK)) pmap_fill_l3(l3, newl3); /* * Map the temporary page so we don't lose access to the l2 table. */ if (tmpl2 != 0) { pmap_kenter(tmpl2, PAGE_SIZE, DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, CACHED_MEMORY); l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK)); } /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the PDE. Otherwise, the state * of the L2 and the PV lists will be inconsistent, which can result * in reclaim_pv_chunk() attempting to remove a PV entry from the * wrong PV list and pmap_pv_demote_l2() failing to find the expected * PV entry for the 2MB page mapping that is being demoted. */ if ((oldl2 & ATTR_SW_MANAGED) != 0) reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); /* * Pass PAGE_SIZE so that a single TLB invalidation is performed on * the 2MB page mapping. */ pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE); /* * Demote the PV entry. */ if ((oldl2 & ATTR_SW_MANAGED) != 0) pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp); atomic_add_long(&pmap_l2_demotions, 1); CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx" " in pmap %p %lx", va, pmap, l3[0]); fail: if (tmpl2 != 0) { pmap_kremove(tmpl2); kva_free(tmpl2, PAGE_SIZE); } return (l3); } static pt_entry_t * pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { struct rwlock *lock; pt_entry_t *l3; lock = NULL; l3 = pmap_demote_l2_locked(pmap, l2, va, &lock); if (lock != NULL) rw_wunlock(lock); return (l3); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pt_entry_t *pte, tpte; vm_paddr_t mask, pa; int lvl, val; bool managed; PMAP_LOCK(pmap); retry: val = 0; pte = pmap_pte(pmap, addr, &lvl); if (pte != NULL) { tpte = pmap_load(pte); switch (lvl) { case 3: mask = L3_OFFSET; break; case 2: mask = L2_OFFSET; break; case 1: mask = L1_OFFSET; break; default: panic("pmap_mincore: invalid level %d", lvl); } managed = (tpte & ATTR_SW_MANAGED) != 0; val = MINCORE_INCORE; if (lvl != 3) val |= MINCORE_SUPER; if ((managed && pmap_pte_dirty(tpte)) || (!managed && (tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW))) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((tpte & ATTR_AF) == ATTR_AF) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; pa = (tpte & ~ATTR_MASK) | (addr & mask); } else managed = false; if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); td->td_proc->p_md.md_l0addr = vtophys(pmap->pm_l0); __asm __volatile( "msr ttbr0_el1, %0 \n" "isb \n" : : "r"(td->td_proc->p_md.md_l0addr)); pmap_invalidate_all(pmap); critical_exit(); } struct pcb * pmap_switch(struct thread *old, struct thread *new) { pcpu_bp_harden bp_harden; struct pcb *pcb; /* Store the new curthread */ PCPU_SET(curthread, new); /* And the new pcb */ pcb = new->td_pcb; PCPU_SET(curpcb, pcb); /* * TODO: We may need to flush the cache here if switching * to a user process. */ if (old == NULL || old->td_proc->p_md.md_l0addr != new->td_proc->p_md.md_l0addr) { __asm __volatile( /* Switch to the new pmap */ "msr ttbr0_el1, %0 \n" "isb \n" /* Invalidate the TLB */ "dsb ishst \n" "tlbi vmalle1is \n" "dsb ish \n" "isb \n" : : "r"(new->td_proc->p_md.md_l0addr)); /* * Stop userspace from training the branch predictor against * other processes. This will call into a CPU specific * function that clears the branch predictor state. */ bp_harden = PCPU_GET(bp_harden); if (bp_harden != NULL) bp_harden(); } return (pcb); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) { if (va >= VM_MIN_KERNEL_ADDRESS) { cpu_icache_sync_range(va, sz); } else { u_int len, offset; vm_paddr_t pa; /* Find the length of data in this page to flush */ offset = va & PAGE_MASK; len = imin(PAGE_SIZE - offset, sz); while (sz != 0) { /* Extract the physical address & find it in the DMAP */ pa = pmap_extract(pmap, va); if (pa != 0) cpu_icache_sync_range(PHYS_TO_DMAP(pa), len); /* Move to the next page */ sz -= len; va += len; /* Set the length for the next iteration */ len = imin(PAGE_SIZE, sz); } } } int pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far) { pt_entry_t *pte; register_t intr; uint64_t ec, par; int lvl, rv; rv = KERN_FAILURE; ec = ESR_ELx_EXCEPTION(esr); switch (ec) { case EXCP_INSN_ABORT_L: case EXCP_INSN_ABORT: case EXCP_DATA_ABORT_L: case EXCP_DATA_ABORT: break; default: return (rv); } /* Data and insn aborts use same encoding for FSC field. */ switch (esr & ISS_DATA_DFSC_MASK) { case ISS_DATA_DFSC_AFF_L1: case ISS_DATA_DFSC_AFF_L2: case ISS_DATA_DFSC_AFF_L3: PMAP_LOCK(pmap); pte = pmap_pte(pmap, far, &lvl); if (pte != NULL) { pmap_set_bits(pte, ATTR_AF); rv = KERN_SUCCESS; /* * XXXMJ as an optimization we could mark the entry * dirty if this is a write fault. */ } PMAP_UNLOCK(pmap); break; case ISS_DATA_DFSC_PF_L1: case ISS_DATA_DFSC_PF_L2: case ISS_DATA_DFSC_PF_L3: if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) || (esr & ISS_DATA_WnR) == 0) return (rv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, far, &lvl); if (pte != NULL && (pmap_load(pte) & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) { pmap_clear_bits(pte, ATTR_AP_RW_BIT); pmap_invalidate_page(pmap, trunc_page(far)); rv = KERN_SUCCESS; } PMAP_UNLOCK(pmap); break; case ISS_DATA_DFSC_TF_L0: case ISS_DATA_DFSC_TF_L1: case ISS_DATA_DFSC_TF_L2: case ISS_DATA_DFSC_TF_L3: PMAP_LOCK(pmap); /* Ask the MMU to check the address */ intr = intr_disable(); if (pmap == kernel_pmap) par = arm64_address_translate_s1e1r(far); else par = arm64_address_translate_s1e0r(far); intr_restore(intr); PMAP_UNLOCK(pmap); /* * If the translation was successful the address was invalid * due to a break-before-make sequence. We can unlock and * return success to the trap handler. */ if (PAR_SUCCESS(par)) rv = KERN_SUCCESS; break; } return (rv); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < L2_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & L2_OFFSET; if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || (*addr & L2_OFFSET) == superpage_offset) return; if ((*addr & L2_OFFSET) < superpage_offset) *addr = (*addr & ~L2_OFFSET) + superpage_offset; else *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; int error, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(!PHYS_IN_DMAP(paddr))) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic( "pmap_map_io_transient: TODO: Map out of DMAP data"); } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (!PHYS_IN_DMAP(paddr)) { panic("ARM64TODO: pmap_unmap_io_transient: Unmap data"); } } } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH); } Index: projects/nfsv42/sys/cam/ata/ata_all.c =================================================================== --- projects/nfsv42/sys/cam/ata/ata_all.c (revision 350367) +++ projects/nfsv42/sys/cam/ata/ata_all.c (revision 350368) @@ -1,1271 +1,1280 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #ifdef _KERNEL #include "opt_scsi.h" #include #include #include #include #else #include #include #include #include #ifndef min #define min(a,b) (((a)<(b))?(a):(b)) #endif #endif #include #include #include #include #include #include #include #include int ata_version(int ver) { int bit; if (ver == 0xffff) return 0; for (bit = 15; bit >= 0; bit--) if (ver & (1<control & 0x04) return ("SOFT_RESET"); switch (cmd->command) { case 0x00: switch (cmd->features) { case 0x00: return ("NOP FLUSHQUEUE"); case 0x01: return ("NOP AUTOPOLL"); } return ("NOP"); case 0x03: return ("CFA_REQUEST_EXTENDED_ERROR"); case 0x06: switch (cmd->features) { case 0x01: return ("DSM TRIM"); } return "DSM"; case 0x07: switch (cmd->features) { case 0x01: return ("DSM_XL TRIM"); } return "DSM_XL"; case 0x08: return ("DEVICE_RESET"); case 0x0b: return ("REQUEST_SENSE_DATA_EXT"); case 0x12: return ("GET_PHYSICAL_ELEMENT_STATUS"); case 0x20: return ("READ"); case 0x24: return ("READ48"); case 0x25: return ("READ_DMA48"); case 0x26: return ("READ_DMA_QUEUED48"); case 0x27: return ("READ_NATIVE_MAX_ADDRESS48"); case 0x29: return ("READ_MUL48"); case 0x2a: return ("READ_STREAM_DMA48"); case 0x2b: return ("READ_STREAM48"); case 0x2f: return ("READ_LOG_EXT"); case 0x30: return ("WRITE"); case 0x34: return ("WRITE48"); case 0x35: return ("WRITE_DMA48"); case 0x36: return ("WRITE_DMA_QUEUED48"); case 0x37: return ("SET_MAX_ADDRESS48"); case 0x39: return ("WRITE_MUL48"); case 0x3a: return ("WRITE_STREAM_DMA48"); case 0x3b: return ("WRITE_STREAM48"); case 0x3d: return ("WRITE_DMA_FUA48"); case 0x3e: return ("WRITE_DMA_QUEUED_FUA48"); case 0x3f: return ("WRITE_LOG_EXT"); case 0x40: return ("READ_VERIFY"); case 0x42: return ("READ_VERIFY48"); case 0x44: switch (cmd->features) { case 0x01: return ("ZERO_EXT TRIM"); } return "ZERO_EXT"; case 0x45: switch (cmd->features) { case 0x55: return ("WRITE_UNCORRECTABLE48 PSEUDO"); case 0xaa: return ("WRITE_UNCORRECTABLE48 FLAGGED"); } return "WRITE_UNCORRECTABLE48"; case 0x47: return ("READ_LOG_DMA_EXT"); case 0x4a: return ("ZAC_MANAGEMENT_IN"); case 0x51: return ("CONFIGURE_STREAM"); case 0x57: return ("WRITE_LOG_DMA_EXT"); case 0x5b: return ("TRUSTED_NON_DATA"); case 0x5c: return ("TRUSTED_RECEIVE"); case 0x5d: return ("TRUSTED_RECEIVE_DMA"); case 0x5e: return ("TRUSTED_SEND"); case 0x5f: return ("TRUSTED_SEND_DMA"); case 0x60: return ("READ_FPDMA_QUEUED"); case 0x61: return ("WRITE_FPDMA_QUEUED"); case 0x63: switch (cmd->features & 0xf) { case 0x00: return ("NCQ_NON_DATA ABORT NCQ QUEUE"); case 0x01: return ("NCQ_NON_DATA DEADLINE HANDLING"); case 0x02: return ("NCQ_NON_DATA HYBRID DEMOTE BY SIZE"); case 0x03: return ("NCQ_NON_DATA HYBRID CHANGE BY LBA RANGE"); case 0x04: return ("NCQ_NON_DATA HYBRID CONTROL"); case 0x05: return ("NCQ_NON_DATA SET FEATURES"); /* * XXX KDM need common decoding between NCQ and non-NCQ * versions of SET FEATURES. */ case 0x06: return ("NCQ_NON_DATA ZERO EXT"); case 0x07: return ("NCQ_NON_DATA ZAC MANAGEMENT OUT"); } return ("NCQ_NON_DATA"); case 0x64: switch (cmd->sector_count_exp & 0xf) { case 0x00: return ("SEND_FPDMA_QUEUED DATA SET MANAGEMENT"); case 0x01: return ("SEND_FPDMA_QUEUED HYBRID EVICT"); case 0x02: return ("SEND_FPDMA_QUEUED WRITE LOG DMA EXT"); case 0x03: return ("SEND_FPDMA_QUEUED ZAC MANAGEMENT OUT"); case 0x04: return ("SEND_FPDMA_QUEUED DATA SET MANAGEMENT XL"); } return ("SEND_FPDMA_QUEUED"); case 0x65: switch (cmd->sector_count_exp & 0xf) { case 0x01: return ("RECEIVE_FPDMA_QUEUED READ LOG DMA EXT"); case 0x02: return ("RECEIVE_FPDMA_QUEUED ZAC MANAGEMENT IN"); } return ("RECEIVE_FPDMA_QUEUED"); case 0x67: if (cmd->features == 0xec) return ("SEP_ATTN IDENTIFY"); switch (cmd->lba_low) { case 0x00: return ("SEP_ATTN READ BUFFER"); case 0x02: return ("SEP_ATTN RECEIVE DIAGNOSTIC RESULTS"); case 0x80: return ("SEP_ATTN WRITE BUFFER"); case 0x82: return ("SEP_ATTN SEND DIAGNOSTIC"); } return ("SEP_ATTN"); case 0x70: return ("SEEK"); case 0x77: return ("SET_DATE_TIME_EXT"); case 0x78: switch (cmd->features) { case 0x00: return ("GET_NATIVE_MAX_ADDRESS_EXT"); case 0x01: return ("SET_ACCESSIBLE_MAX_ADDRESS_EXT"); case 0x02: return ("FREEZE_ACCESSIBLE_MAX_ADDRESS_EXT"); } return ("ACCESSIBLE_MAX_ADDRESS_CONFIGURATION"); case 0x7C: return ("REMOVE_ELEMENT_AND_TRUNCATE"); case 0x87: return ("CFA_TRANSLATE_SECTOR"); case 0x90: return ("EXECUTE_DEVICE_DIAGNOSTIC"); case 0x92: return ("DOWNLOAD_MICROCODE"); case 0x93: return ("DOWNLOAD_MICROCODE_DMA"); case 0x9a: return ("ZAC_MANAGEMENT_OUT"); case 0xa0: return ("PACKET"); case 0xa1: return ("ATAPI_IDENTIFY"); case 0xa2: return ("SERVICE"); case 0xb0: switch(cmd->features) { case 0xd0: return ("SMART READ ATTR VALUES"); case 0xd1: return ("SMART READ ATTR THRESHOLDS"); case 0xd3: return ("SMART SAVE ATTR VALUES"); case 0xd4: return ("SMART EXECUTE OFFLINE IMMEDIATE"); case 0xd5: return ("SMART READ LOG"); case 0xd6: return ("SMART WRITE LOG"); case 0xd8: return ("SMART ENABLE OPERATION"); case 0xd9: return ("SMART DISABLE OPERATION"); case 0xda: return ("SMART RETURN STATUS"); } return ("SMART"); case 0xb1: return ("DEVICE CONFIGURATION"); case 0xb2: return ("SET_SECTOR_CONFIGURATION_EXT"); - case 0xb4: return ("SANITIZE_DEVICE"); + case 0xb4: + switch(cmd->features) { + case 0x00: return ("SANITIZE_STATUS_EXT"); + case 0x11: return ("CRYPTO_SCRAMBLE_EXT"); + case 0x12: return ("BLOCK_ERASE_EXT"); + case 0x14: return ("OVERWRITE_EXT"); + case 0x20: return ("SANITIZE_FREEZE_LOCK_EXT"); + case 0x40: return ("SANITIZE_ANTIFREEZE_LOCK_EXT"); + } + return ("SANITIZE_DEVICE"); case 0xc0: return ("CFA_ERASE"); case 0xc4: return ("READ_MUL"); case 0xc5: return ("WRITE_MUL"); case 0xc6: return ("SET_MULTI"); case 0xc7: return ("READ_DMA_QUEUED"); case 0xc8: return ("READ_DMA"); case 0xca: return ("WRITE_DMA"); case 0xcc: return ("WRITE_DMA_QUEUED"); case 0xcd: return ("CFA_WRITE_MULTIPLE_WITHOUT_ERASE"); case 0xce: return ("WRITE_MUL_FUA48"); case 0xd1: return ("CHECK_MEDIA_CARD_TYPE"); case 0xda: return ("GET_MEDIA_STATUS"); case 0xde: return ("MEDIA_LOCK"); case 0xdf: return ("MEDIA_UNLOCK"); case 0xe0: return ("STANDBY_IMMEDIATE"); case 0xe1: return ("IDLE_IMMEDIATE"); case 0xe2: return ("STANDBY"); case 0xe3: return ("IDLE"); case 0xe4: return ("READ_BUFFER/PM"); case 0xe5: return ("CHECK_POWER_MODE"); case 0xe6: return ("SLEEP"); case 0xe7: return ("FLUSHCACHE"); case 0xe8: return ("WRITE_BUFFER/PM"); case 0xe9: return ("READ_BUFFER_DMA"); case 0xea: return ("FLUSHCACHE48"); case 0xeb: return ("WRITE_BUFFER_DMA"); case 0xec: return ("ATA_IDENTIFY"); case 0xed: return ("MEDIA_EJECT"); case 0xef: /* * XXX KDM need common decoding between NCQ and non-NCQ * versions of SET FEATURES. */ switch (cmd->features) { case 0x02: return ("SETFEATURES ENABLE WCACHE"); case 0x03: return ("SETFEATURES SET TRANSFER MODE"); case 0x05: return ("SETFEATURES ENABLE APM"); case 0x06: return ("SETFEATURES ENABLE PUIS"); case 0x07: return ("SETFEATURES SPIN-UP"); case 0x0b: return ("SETFEATURES ENABLE WRITE READ VERIFY"); case 0x0c: return ("SETFEATURES ENABLE DEVICE LIFE CONTROL"); case 0x10: return ("SETFEATURES ENABLE SATA FEATURE"); case 0x41: return ("SETFEATURES ENABLE FREEFALL CONTROL"); case 0x43: return ("SETFEATURES SET MAX HOST INT SECT TIMES"); case 0x45: return ("SETFEATURES SET RATE BASIS"); case 0x4a: return ("SETFEATURES EXTENDED POWER CONDITIONS"); case 0x50: return ("SETFEATURES ADVANCED BACKGROUD OPERATION"); case 0x55: return ("SETFEATURES DISABLE RCACHE"); case 0x5d: return ("SETFEATURES ENABLE RELIRQ"); case 0x5e: return ("SETFEATURES ENABLE SRVIRQ"); case 0x62: return ("SETFEATURES LONG PHYS SECT ALIGN ERC"); case 0x63: return ("SETFEATURES DSN"); case 0x66: return ("SETFEATURES DISABLE DEFAULTS"); case 0x82: return ("SETFEATURES DISABLE WCACHE"); case 0x85: return ("SETFEATURES DISABLE APM"); case 0x86: return ("SETFEATURES DISABLE PUIS"); case 0x8b: return ("SETFEATURES DISABLE WRITE READ VERIFY"); case 0x8c: return ("SETFEATURES DISABLE DEVICE LIFE CONTROL"); case 0x90: return ("SETFEATURES DISABLE SATA FEATURE"); case 0xaa: return ("SETFEATURES ENABLE RCACHE"); case 0xC1: return ("SETFEATURES DISABLE FREEFALL CONTROL"); case 0xC3: return ("SETFEATURES SENSE DATA REPORTING"); case 0xC4: return ("SETFEATURES NCQ SENSE DATA RETURN"); case 0xCC: return ("SETFEATURES ENABLE DEFAULTS"); case 0xdd: return ("SETFEATURES DISABLE RELIRQ"); case 0xde: return ("SETFEATURES DISABLE SRVIRQ"); } return "SETFEATURES"; case 0xf1: return ("SECURITY_SET_PASSWORD"); case 0xf2: return ("SECURITY_UNLOCK"); case 0xf3: return ("SECURITY_ERASE_PREPARE"); case 0xf4: return ("SECURITY_ERASE_UNIT"); case 0xf5: return ("SECURITY_FREEZE_LOCK"); case 0xf6: return ("SECURITY_DISABLE_PASSWORD"); case 0xf8: return ("READ_NATIVE_MAX_ADDRESS"); case 0xf9: return ("SET_MAX_ADDRESS"); } return "UNKNOWN"; } char * ata_cmd_string(struct ata_cmd *cmd, char *cmd_string, size_t len) { struct sbuf sb; int error; if (len == 0) return (""); sbuf_new(&sb, cmd_string, len, SBUF_FIXEDLEN); ata_cmd_sbuf(cmd, &sb); error = sbuf_finish(&sb); if (error != 0 && error != ENOMEM) return (""); return(sbuf_data(&sb)); } void ata_cmd_sbuf(struct ata_cmd *cmd, struct sbuf *sb) { sbuf_printf(sb, "%02x %02x %02x %02x " "%02x %02x %02x %02x %02x %02x %02x %02x", cmd->command, cmd->features, cmd->lba_low, cmd->lba_mid, cmd->lba_high, cmd->device, cmd->lba_low_exp, cmd->lba_mid_exp, cmd->lba_high_exp, cmd->features_exp, cmd->sector_count, cmd->sector_count_exp); } char * ata_res_string(struct ata_res *res, char *res_string, size_t len) { struct sbuf sb; int error; if (len == 0) return (""); sbuf_new(&sb, res_string, len, SBUF_FIXEDLEN); ata_res_sbuf(res, &sb); error = sbuf_finish(&sb); if (error != 0 && error != ENOMEM) return (""); return(sbuf_data(&sb)); } int ata_res_sbuf(struct ata_res *res, struct sbuf *sb) { sbuf_printf(sb, "%02x %02x %02x %02x " "%02x %02x %02x %02x %02x %02x %02x", res->status, res->error, res->lba_low, res->lba_mid, res->lba_high, res->device, res->lba_low_exp, res->lba_mid_exp, res->lba_high_exp, res->sector_count, res->sector_count_exp); return (0); } /* * ata_command_sbuf() returns 0 for success and -1 for failure. */ int ata_command_sbuf(struct ccb_ataio *ataio, struct sbuf *sb) { sbuf_printf(sb, "%s. ACB: ", ata_op_string(&ataio->cmd)); ata_cmd_sbuf(&ataio->cmd, sb); return(0); } /* * ata_status_abuf() returns 0 for success and -1 for failure. */ int ata_status_sbuf(struct ccb_ataio *ataio, struct sbuf *sb) { sbuf_printf(sb, "ATA status: %02x (%s%s%s%s%s%s%s%s)", ataio->res.status, (ataio->res.status & 0x80) ? "BSY " : "", (ataio->res.status & 0x40) ? "DRDY " : "", (ataio->res.status & 0x20) ? "DF " : "", (ataio->res.status & 0x10) ? "SERV " : "", (ataio->res.status & 0x08) ? "DRQ " : "", (ataio->res.status & 0x04) ? "CORR " : "", (ataio->res.status & 0x02) ? "IDX " : "", (ataio->res.status & 0x01) ? "ERR" : ""); if (ataio->res.status & 1) { sbuf_printf(sb, ", error: %02x (%s%s%s%s%s%s%s%s)", ataio->res.error, (ataio->res.error & 0x80) ? "ICRC " : "", (ataio->res.error & 0x40) ? "UNC " : "", (ataio->res.error & 0x20) ? "MC " : "", (ataio->res.error & 0x10) ? "IDNF " : "", (ataio->res.error & 0x08) ? "MCR " : "", (ataio->res.error & 0x04) ? "ABRT " : "", (ataio->res.error & 0x02) ? "NM " : "", (ataio->res.error & 0x01) ? "ILI" : ""); } return(0); } void ata_print_ident(struct ata_params *ident_data) { const char *proto; char ata[12], sata[12]; ata_print_ident_short(ident_data); proto = (ident_data->config == ATA_PROTO_CFA) ? "CFA" : (ident_data->config & ATA_PROTO_ATAPI) ? "ATAPI" : "ATA"; if (ata_version(ident_data->version_major) == 0) { snprintf(ata, sizeof(ata), "%s", proto); } else if (ata_version(ident_data->version_major) <= 7) { snprintf(ata, sizeof(ata), "%s-%d", proto, ata_version(ident_data->version_major)); } else if (ata_version(ident_data->version_major) == 8) { snprintf(ata, sizeof(ata), "%s8-ACS", proto); } else { snprintf(ata, sizeof(ata), "ACS-%d %s", ata_version(ident_data->version_major) - 7, proto); } if (ident_data->satacapabilities && ident_data->satacapabilities != 0xffff) { if (ident_data->satacapabilities & ATA_SATA_GEN3) snprintf(sata, sizeof(sata), " SATA 3.x"); else if (ident_data->satacapabilities & ATA_SATA_GEN2) snprintf(sata, sizeof(sata), " SATA 2.x"); else if (ident_data->satacapabilities & ATA_SATA_GEN1) snprintf(sata, sizeof(sata), " SATA 1.x"); else snprintf(sata, sizeof(sata), " SATA"); } else sata[0] = 0; printf(" %s%s device\n", ata, sata); } void ata_print_ident_sbuf(struct ata_params *ident_data, struct sbuf *sb) { const char *proto, *sata; int version; ata_print_ident_short_sbuf(ident_data, sb); sbuf_printf(sb, " "); proto = (ident_data->config == ATA_PROTO_CFA) ? "CFA" : (ident_data->config & ATA_PROTO_ATAPI) ? "ATAPI" : "ATA"; version = ata_version(ident_data->version_major); switch (version) { case 0: sbuf_printf(sb, "%s", proto); break; case 1: case 2: case 3: case 4: case 5: case 6: case 7: sbuf_printf(sb, "%s-%d", proto, version); break; case 8: sbuf_printf(sb, "%s8-ACS", proto); break; default: sbuf_printf(sb, "ACS-%d %s", version - 7, proto); break; } if (ident_data->satacapabilities && ident_data->satacapabilities != 0xffff) { if (ident_data->satacapabilities & ATA_SATA_GEN3) sata = " SATA 3.x"; else if (ident_data->satacapabilities & ATA_SATA_GEN2) sata = " SATA 2.x"; else if (ident_data->satacapabilities & ATA_SATA_GEN1) sata = " SATA 1.x"; else sata = " SATA"; } else sata = ""; sbuf_printf(sb, "%s device\n", sata); } void ata_print_ident_short(struct ata_params *ident_data) { char product[48], revision[16]; cam_strvis(product, ident_data->model, sizeof(ident_data->model), sizeof(product)); cam_strvis(revision, ident_data->revision, sizeof(ident_data->revision), sizeof(revision)); printf("<%s %s>", product, revision); } void ata_print_ident_short_sbuf(struct ata_params *ident_data, struct sbuf *sb) { sbuf_printf(sb, "<"); cam_strvis_sbuf(sb, ident_data->model, sizeof(ident_data->model), 0); sbuf_printf(sb, " "); cam_strvis_sbuf(sb, ident_data->revision, sizeof(ident_data->revision), 0); sbuf_printf(sb, ">"); } void semb_print_ident(struct sep_identify_data *ident_data) { char in[7], ins[5]; semb_print_ident_short(ident_data); cam_strvis(in, ident_data->interface_id, 6, sizeof(in)); cam_strvis(ins, ident_data->interface_rev, 4, sizeof(ins)); printf(" SEMB %s %s device\n", in, ins); } void semb_print_ident_sbuf(struct sep_identify_data *ident_data, struct sbuf *sb) { semb_print_ident_short_sbuf(ident_data, sb); sbuf_printf(sb, " SEMB "); cam_strvis_sbuf(sb, ident_data->interface_id, 6, 0); sbuf_printf(sb, " "); cam_strvis_sbuf(sb, ident_data->interface_rev, 4, 0); sbuf_printf(sb, " device\n"); } void semb_print_ident_short(struct sep_identify_data *ident_data) { char vendor[9], product[17], revision[5], fw[5]; cam_strvis(vendor, ident_data->vendor_id, 8, sizeof(vendor)); cam_strvis(product, ident_data->product_id, 16, sizeof(product)); cam_strvis(revision, ident_data->product_rev, 4, sizeof(revision)); cam_strvis(fw, ident_data->firmware_rev, 4, sizeof(fw)); printf("<%s %s %s %s>", vendor, product, revision, fw); } void semb_print_ident_short_sbuf(struct sep_identify_data *ident_data, struct sbuf *sb) { sbuf_printf(sb, "<"); cam_strvis_sbuf(sb, ident_data->vendor_id, 8, 0); sbuf_printf(sb, " "); cam_strvis_sbuf(sb, ident_data->product_id, 16, 0); sbuf_printf(sb, " "); cam_strvis_sbuf(sb, ident_data->product_rev, 4, 0); sbuf_printf(sb, " "); cam_strvis_sbuf(sb, ident_data->firmware_rev, 4, 0); sbuf_printf(sb, ">"); } uint32_t ata_logical_sector_size(struct ata_params *ident_data) { if ((ident_data->pss & ATA_PSS_VALID_MASK) == ATA_PSS_VALID_VALUE && (ident_data->pss & ATA_PSS_LSSABOVE512)) { return (((u_int32_t)ident_data->lss_1 | ((u_int32_t)ident_data->lss_2 << 16)) * 2); } return (512); } uint64_t ata_physical_sector_size(struct ata_params *ident_data) { if ((ident_data->pss & ATA_PSS_VALID_MASK) == ATA_PSS_VALID_VALUE) { if (ident_data->pss & ATA_PSS_MULTLS) { return ((uint64_t)ata_logical_sector_size(ident_data) * (1 << (ident_data->pss & ATA_PSS_LSPPS))); } else { return (uint64_t)ata_logical_sector_size(ident_data); } } return (512); } uint64_t ata_logical_sector_offset(struct ata_params *ident_data) { if ((ident_data->lsalign & 0xc000) == 0x4000) { return ((uint64_t)ata_logical_sector_size(ident_data) * (ident_data->lsalign & 0x3fff)); } return (0); } void ata_28bit_cmd(struct ccb_ataio *ataio, uint8_t cmd, uint8_t features, uint32_t lba, uint8_t sector_count) { bzero(&ataio->cmd, sizeof(ataio->cmd)); ataio->cmd.flags = 0; if (cmd == ATA_READ_DMA || cmd == ATA_READ_DMA_QUEUED || cmd == ATA_WRITE_DMA || cmd == ATA_WRITE_DMA_QUEUED || cmd == ATA_TRUSTED_RECEIVE_DMA || cmd == ATA_TRUSTED_SEND_DMA || cmd == ATA_DOWNLOAD_MICROCODE_DMA || cmd == ATA_READ_BUFFER_DMA || cmd == ATA_WRITE_BUFFER_DMA) ataio->cmd.flags |= CAM_ATAIO_DMA; ataio->cmd.command = cmd; ataio->cmd.features = features; ataio->cmd.lba_low = lba; ataio->cmd.lba_mid = lba >> 8; ataio->cmd.lba_high = lba >> 16; ataio->cmd.device = ATA_DEV_LBA | ((lba >> 24) & 0x0f); ataio->cmd.sector_count = sector_count; } void ata_48bit_cmd(struct ccb_ataio *ataio, uint8_t cmd, uint16_t features, uint64_t lba, uint16_t sector_count) { ataio->cmd.flags = CAM_ATAIO_48BIT; if (cmd == ATA_READ_DMA48 || cmd == ATA_READ_DMA_QUEUED48 || cmd == ATA_READ_STREAM_DMA48 || cmd == ATA_WRITE_DMA48 || cmd == ATA_WRITE_DMA_FUA48 || cmd == ATA_WRITE_DMA_QUEUED48 || cmd == ATA_WRITE_DMA_QUEUED_FUA48 || cmd == ATA_WRITE_STREAM_DMA48 || cmd == ATA_DATA_SET_MANAGEMENT || cmd == ATA_READ_LOG_DMA_EXT || cmd == ATA_WRITE_LOG_DMA_EXT) ataio->cmd.flags |= CAM_ATAIO_DMA; ataio->cmd.command = cmd; ataio->cmd.features = features; ataio->cmd.lba_low = lba; ataio->cmd.lba_mid = lba >> 8; ataio->cmd.lba_high = lba >> 16; ataio->cmd.device = ATA_DEV_LBA; ataio->cmd.lba_low_exp = lba >> 24; ataio->cmd.lba_mid_exp = lba >> 32; ataio->cmd.lba_high_exp = lba >> 40; ataio->cmd.features_exp = features >> 8; ataio->cmd.sector_count = sector_count; ataio->cmd.sector_count_exp = sector_count >> 8; ataio->cmd.control = 0; } void ata_ncq_cmd(struct ccb_ataio *ataio, uint8_t cmd, uint64_t lba, uint16_t sector_count) { ataio->cmd.flags = CAM_ATAIO_48BIT | CAM_ATAIO_FPDMA; ataio->cmd.command = cmd; ataio->cmd.features = sector_count; ataio->cmd.lba_low = lba; ataio->cmd.lba_mid = lba >> 8; ataio->cmd.lba_high = lba >> 16; ataio->cmd.device = ATA_DEV_LBA; ataio->cmd.lba_low_exp = lba >> 24; ataio->cmd.lba_mid_exp = lba >> 32; ataio->cmd.lba_high_exp = lba >> 40; ataio->cmd.features_exp = sector_count >> 8; ataio->cmd.sector_count = 0; ataio->cmd.sector_count_exp = 0; ataio->cmd.control = 0; } void ata_reset_cmd(struct ccb_ataio *ataio) { bzero(&ataio->cmd, sizeof(ataio->cmd)); ataio->cmd.flags = CAM_ATAIO_CONTROL | CAM_ATAIO_NEEDRESULT; ataio->cmd.control = 0x04; } void ata_pm_read_cmd(struct ccb_ataio *ataio, int reg, int port) { bzero(&ataio->cmd, sizeof(ataio->cmd)); ataio->cmd.flags = CAM_ATAIO_NEEDRESULT; ataio->cmd.command = ATA_READ_PM; ataio->cmd.features = reg; ataio->cmd.device = port & 0x0f; } void ata_pm_write_cmd(struct ccb_ataio *ataio, int reg, int port, uint32_t val) { bzero(&ataio->cmd, sizeof(ataio->cmd)); ataio->cmd.flags = 0; ataio->cmd.command = ATA_WRITE_PM; ataio->cmd.features = reg; ataio->cmd.sector_count = val; ataio->cmd.lba_low = val >> 8; ataio->cmd.lba_mid = val >> 16; ataio->cmd.lba_high = val >> 24; ataio->cmd.device = port & 0x0f; } void ata_read_log(struct ccb_ataio *ataio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint32_t log_address, uint32_t page_number, uint16_t block_count, uint32_t protocol, uint8_t *data_ptr, uint32_t dxfer_len, uint32_t timeout) { uint64_t lba; cam_fill_ataio(ataio, /*retries*/ 1, /*cbfcnp*/ cbfcnp, /*flags*/ CAM_DIR_IN, /*tag_action*/ 0, /*data_ptr*/ data_ptr, /*dxfer_len*/ dxfer_len, /*timeout*/ timeout); lba = (((uint64_t)page_number & 0xff00) << 32) | ((page_number & 0x00ff) << 8) | (log_address & 0xff); ata_48bit_cmd(ataio, /*cmd*/ (protocol & CAM_ATAIO_DMA) ? ATA_READ_LOG_DMA_EXT : ATA_READ_LOG_EXT, /*features*/ 0, /*lba*/ lba, /*sector_count*/ block_count); } void ata_bswap(int8_t *buf, int len) { u_int16_t *ptr = (u_int16_t*)(buf + len); while (--ptr >= (u_int16_t*)buf) *ptr = be16toh(*ptr); } void ata_btrim(int8_t *buf, int len) { int8_t *ptr; for (ptr = buf; ptr < buf+len; ++ptr) if (!*ptr || *ptr == '_') *ptr = ' '; for (ptr = buf + len - 1; ptr >= buf && *ptr == ' '; --ptr) *ptr = 0; } void ata_bpack(int8_t *src, int8_t *dst, int len) { int i, j, blank; for (i = j = blank = 0 ; i < len; i++) { if (blank && src[i] == ' ') continue; if (blank && src[i] != ' ') { dst[j++] = src[i]; blank = 0; continue; } if (src[i] == ' ') { blank = 1; if (i == 0) continue; } dst[j++] = src[i]; } while (j < len) dst[j++] = 0x00; } int ata_max_pmode(struct ata_params *ap) { if (ap->atavalid & ATA_FLAG_64_70) { if (ap->apiomodes & 0x02) return ATA_PIO4; if (ap->apiomodes & 0x01) return ATA_PIO3; } if (ap->mwdmamodes & 0x04) return ATA_PIO4; if (ap->mwdmamodes & 0x02) return ATA_PIO3; if (ap->mwdmamodes & 0x01) return ATA_PIO2; if ((ap->retired_piomode & ATA_RETIRED_PIO_MASK) == 0x200) return ATA_PIO2; if ((ap->retired_piomode & ATA_RETIRED_PIO_MASK) == 0x100) return ATA_PIO1; if ((ap->retired_piomode & ATA_RETIRED_PIO_MASK) == 0x000) return ATA_PIO0; return ATA_PIO0; } int ata_max_wmode(struct ata_params *ap) { if (ap->mwdmamodes & 0x04) return ATA_WDMA2; if (ap->mwdmamodes & 0x02) return ATA_WDMA1; if (ap->mwdmamodes & 0x01) return ATA_WDMA0; return -1; } int ata_max_umode(struct ata_params *ap) { if (ap->atavalid & ATA_FLAG_88) { if (ap->udmamodes & 0x40) return ATA_UDMA6; if (ap->udmamodes & 0x20) return ATA_UDMA5; if (ap->udmamodes & 0x10) return ATA_UDMA4; if (ap->udmamodes & 0x08) return ATA_UDMA3; if (ap->udmamodes & 0x04) return ATA_UDMA2; if (ap->udmamodes & 0x02) return ATA_UDMA1; if (ap->udmamodes & 0x01) return ATA_UDMA0; } return -1; } int ata_max_mode(struct ata_params *ap, int maxmode) { if (maxmode == 0) maxmode = ATA_DMA_MAX; if (maxmode >= ATA_UDMA0 && ata_max_umode(ap) > 0) return (min(maxmode, ata_max_umode(ap))); if (maxmode >= ATA_WDMA0 && ata_max_wmode(ap) > 0) return (min(maxmode, ata_max_wmode(ap))); return (min(maxmode, ata_max_pmode(ap))); } char * ata_mode2string(int mode) { switch (mode) { case -1: return "UNSUPPORTED"; case 0: return "NONE"; case ATA_PIO0: return "PIO0"; case ATA_PIO1: return "PIO1"; case ATA_PIO2: return "PIO2"; case ATA_PIO3: return "PIO3"; case ATA_PIO4: return "PIO4"; case ATA_WDMA0: return "WDMA0"; case ATA_WDMA1: return "WDMA1"; case ATA_WDMA2: return "WDMA2"; case ATA_UDMA0: return "UDMA0"; case ATA_UDMA1: return "UDMA1"; case ATA_UDMA2: return "UDMA2"; case ATA_UDMA3: return "UDMA3"; case ATA_UDMA4: return "UDMA4"; case ATA_UDMA5: return "UDMA5"; case ATA_UDMA6: return "UDMA6"; default: if (mode & ATA_DMA_MASK) return "BIOSDMA"; else return "BIOSPIO"; } } int ata_string2mode(char *str) { if (!strcasecmp(str, "PIO0")) return (ATA_PIO0); if (!strcasecmp(str, "PIO1")) return (ATA_PIO1); if (!strcasecmp(str, "PIO2")) return (ATA_PIO2); if (!strcasecmp(str, "PIO3")) return (ATA_PIO3); if (!strcasecmp(str, "PIO4")) return (ATA_PIO4); if (!strcasecmp(str, "WDMA0")) return (ATA_WDMA0); if (!strcasecmp(str, "WDMA1")) return (ATA_WDMA1); if (!strcasecmp(str, "WDMA2")) return (ATA_WDMA2); if (!strcasecmp(str, "UDMA0")) return (ATA_UDMA0); if (!strcasecmp(str, "UDMA16")) return (ATA_UDMA0); if (!strcasecmp(str, "UDMA1")) return (ATA_UDMA1); if (!strcasecmp(str, "UDMA25")) return (ATA_UDMA1); if (!strcasecmp(str, "UDMA2")) return (ATA_UDMA2); if (!strcasecmp(str, "UDMA33")) return (ATA_UDMA2); if (!strcasecmp(str, "UDMA3")) return (ATA_UDMA3); if (!strcasecmp(str, "UDMA44")) return (ATA_UDMA3); if (!strcasecmp(str, "UDMA4")) return (ATA_UDMA4); if (!strcasecmp(str, "UDMA66")) return (ATA_UDMA4); if (!strcasecmp(str, "UDMA5")) return (ATA_UDMA5); if (!strcasecmp(str, "UDMA100")) return (ATA_UDMA5); if (!strcasecmp(str, "UDMA6")) return (ATA_UDMA6); if (!strcasecmp(str, "UDMA133")) return (ATA_UDMA6); return (-1); } u_int ata_mode2speed(int mode) { switch (mode) { case ATA_PIO0: default: return (3300); case ATA_PIO1: return (5200); case ATA_PIO2: return (8300); case ATA_PIO3: return (11100); case ATA_PIO4: return (16700); case ATA_WDMA0: return (4200); case ATA_WDMA1: return (13300); case ATA_WDMA2: return (16700); case ATA_UDMA0: return (16700); case ATA_UDMA1: return (25000); case ATA_UDMA2: return (33300); case ATA_UDMA3: return (44400); case ATA_UDMA4: return (66700); case ATA_UDMA5: return (100000); case ATA_UDMA6: return (133000); } } u_int ata_revision2speed(int revision) { switch (revision) { case 1: default: return (150000); case 2: return (300000); case 3: return (600000); } } int ata_speed2revision(u_int speed) { switch (speed) { case 0: return (0); case 150000: return (1); case 300000: return (2); case 600000: return (3); default: return (-1); } } int ata_identify_match(caddr_t identbuffer, caddr_t table_entry) { struct scsi_inquiry_pattern *entry; struct ata_params *ident; entry = (struct scsi_inquiry_pattern *)table_entry; ident = (struct ata_params *)identbuffer; if ((cam_strmatch(ident->model, entry->product, sizeof(ident->model)) == 0) && (cam_strmatch(ident->revision, entry->revision, sizeof(ident->revision)) == 0)) { return (0); } return (-1); } int ata_static_identify_match(caddr_t identbuffer, caddr_t table_entry) { struct scsi_static_inquiry_pattern *entry; struct ata_params *ident; entry = (struct scsi_static_inquiry_pattern *)table_entry; ident = (struct ata_params *)identbuffer; if ((cam_strmatch(ident->model, entry->product, sizeof(ident->model)) == 0) && (cam_strmatch(ident->revision, entry->revision, sizeof(ident->revision)) == 0)) { return (0); } return (-1); } void semb_receive_diagnostic_results(struct ccb_ataio *ataio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb*), uint8_t tag_action, int pcv, uint8_t page_code, uint8_t *data_ptr, uint16_t length, uint32_t timeout) { length = min(length, 1020); length = (length + 3) & ~3; cam_fill_ataio(ataio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, data_ptr, length, timeout); ata_28bit_cmd(ataio, ATA_SEP_ATTN, pcv ? page_code : 0, 0x02, length / 4); } void semb_send_diagnostic(struct ccb_ataio *ataio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint8_t *data_ptr, uint16_t length, uint32_t timeout) { length = min(length, 1020); length = (length + 3) & ~3; cam_fill_ataio(ataio, retries, cbfcnp, /*flags*/length ? CAM_DIR_OUT : CAM_DIR_NONE, tag_action, data_ptr, length, timeout); ata_28bit_cmd(ataio, ATA_SEP_ATTN, length > 0 ? data_ptr[0] : 0, 0x82, length / 4); } void semb_read_buffer(struct ccb_ataio *ataio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb*), uint8_t tag_action, uint8_t page_code, uint8_t *data_ptr, uint16_t length, uint32_t timeout) { length = min(length, 1020); length = (length + 3) & ~3; cam_fill_ataio(ataio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, data_ptr, length, timeout); ata_28bit_cmd(ataio, ATA_SEP_ATTN, page_code, 0x00, length / 4); } void semb_write_buffer(struct ccb_ataio *ataio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint8_t *data_ptr, uint16_t length, uint32_t timeout) { length = min(length, 1020); length = (length + 3) & ~3; cam_fill_ataio(ataio, retries, cbfcnp, /*flags*/length ? CAM_DIR_OUT : CAM_DIR_NONE, tag_action, data_ptr, length, timeout); ata_28bit_cmd(ataio, ATA_SEP_ATTN, length > 0 ? data_ptr[0] : 0, 0x80, length / 4); } void ata_zac_mgmt_out(struct ccb_ataio *ataio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), int use_ncq, uint8_t zm_action, uint64_t zone_id, uint8_t zone_flags, uint16_t sector_count, uint8_t *data_ptr, uint32_t dxfer_len, uint32_t timeout) { uint8_t command_out, ata_flags; uint16_t features_out, sectors_out; uint32_t auxiliary; if (use_ncq == 0) { command_out = ATA_ZAC_MANAGEMENT_OUT; features_out = (zm_action & 0xf) | (zone_flags << 8); if (dxfer_len == 0) { ata_flags = 0; sectors_out = 0; } else { ata_flags = CAM_ATAIO_DMA; /* XXX KDM use sector count? */ sectors_out = ((dxfer_len >> 9) & 0xffff); } auxiliary = 0; } else { if (dxfer_len == 0) { command_out = ATA_NCQ_NON_DATA; features_out = ATA_NCQ_ZAC_MGMT_OUT; sectors_out = 0; } else { command_out = ATA_SEND_FPDMA_QUEUED; /* Note that we're defaulting to normal priority */ sectors_out = ATA_SFPDMA_ZAC_MGMT_OUT << 8; /* * For SEND FPDMA QUEUED, the transfer length is * encoded in the FEATURE register, and 0 means * that 65536 512 byte blocks are to be tranferred. * In practice, it seems unlikely that we'll see * a transfer that large. */ if (dxfer_len == (65536 * 512)) { features_out = 0; } else { /* * Yes, the caller can theoretically send a * transfer larger than we can handle. * Anyone using this function needs enough * knowledge to avoid doing that. */ features_out = ((dxfer_len >> 9) & 0xffff); } } auxiliary = (zm_action & 0xf) | (zone_flags << 8); ata_flags = CAM_ATAIO_FPDMA; } cam_fill_ataio(ataio, /*retries*/ retries, /*cbfcnp*/ cbfcnp, /*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE, /*tag_action*/ 0, /*data_ptr*/ data_ptr, /*dxfer_len*/ dxfer_len, /*timeout*/ timeout); ata_48bit_cmd(ataio, /*cmd*/ command_out, /*features*/ features_out, /*lba*/ zone_id, /*sector_count*/ sectors_out); ataio->cmd.flags |= ata_flags; if (auxiliary != 0) { ataio->ata_flags |= ATA_FLAG_AUX; ataio->aux = auxiliary; } } void ata_zac_mgmt_in(struct ccb_ataio *ataio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), int use_ncq, uint8_t zm_action, uint64_t zone_id, uint8_t zone_flags, uint8_t *data_ptr, uint32_t dxfer_len, uint32_t timeout) { uint8_t command_out, ata_flags; uint16_t features_out, sectors_out; uint32_t auxiliary; if (use_ncq == 0) { command_out = ATA_ZAC_MANAGEMENT_IN; /* XXX KDM put a macro here */ features_out = (zm_action & 0xf) | (zone_flags << 8); ata_flags = CAM_ATAIO_DMA; sectors_out = ((dxfer_len >> 9) & 0xffff); auxiliary = 0; } else { command_out = ATA_RECV_FPDMA_QUEUED; sectors_out = ATA_RFPDMA_ZAC_MGMT_IN << 8; auxiliary = (zm_action & 0xf) | (zone_flags << 8); ata_flags = CAM_ATAIO_FPDMA; /* * For RECEIVE FPDMA QUEUED, the transfer length is * encoded in the FEATURE register, and 0 means * that 65536 512 byte blocks are to be tranferred. * In practice, it is unlikely we will see a transfer that * large. */ if (dxfer_len == (65536 * 512)) { features_out = 0; } else { /* * Yes, the caller can theoretically request a * transfer larger than we can handle. * Anyone using this function needs enough * knowledge to avoid doing that. */ features_out = ((dxfer_len >> 9) & 0xffff); } } cam_fill_ataio(ataio, /*retries*/ retries, /*cbfcnp*/ cbfcnp, /*flags*/ CAM_DIR_IN, /*tag_action*/ 0, /*data_ptr*/ data_ptr, /*dxfer_len*/ dxfer_len, /*timeout*/ timeout); ata_48bit_cmd(ataio, /*cmd*/ command_out, /*features*/ features_out, /*lba*/ zone_id, /*sector_count*/ sectors_out); ataio->cmd.flags |= ata_flags; if (auxiliary != 0) { ataio->ata_flags |= ATA_FLAG_AUX; ataio->aux = auxiliary; } } void ata_param_fixup(struct ata_params *ident_buf) { int16_t *ptr; for (ptr = (int16_t *)ident_buf; ptr < (int16_t *)ident_buf + sizeof(struct ata_params)/2; ptr++) { *ptr = le16toh(*ptr); } if (strncmp(ident_buf->model, "FX", 2) && strncmp(ident_buf->model, "NEC", 3) && strncmp(ident_buf->model, "Pioneer", 7) && strncmp(ident_buf->model, "SHARP", 5)) { ata_bswap(ident_buf->model, sizeof(ident_buf->model)); ata_bswap(ident_buf->revision, sizeof(ident_buf->revision)); ata_bswap(ident_buf->serial, sizeof(ident_buf->serial)); } ata_btrim(ident_buf->model, sizeof(ident_buf->model)); ata_bpack(ident_buf->model, ident_buf->model, sizeof(ident_buf->model)); ata_btrim(ident_buf->revision, sizeof(ident_buf->revision)); ata_bpack(ident_buf->revision, ident_buf->revision, sizeof(ident_buf->revision)); ata_btrim(ident_buf->serial, sizeof(ident_buf->serial)); ata_bpack(ident_buf->serial, ident_buf->serial, sizeof(ident_buf->serial)); } Index: projects/nfsv42/sys/cam/ctl/ctl.c =================================================================== --- projects/nfsv42/sys/cam/ctl/ctl.c (revision 350367) +++ projects/nfsv42/sys/cam/ctl/ctl.c (revision 350368) @@ -1,13448 +1,13564 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003-2009 Silicon Graphics International Corp. * Copyright (c) 2012 The FreeBSD Foundation * Copyright (c) 2014-2017 Alexander Motin * Copyright (c) 2017 Jakub Wojciech Klama * Copyright (c) 2018 Marcelo Araujo * All rights reserved. * * Portions of this software were developed by Edward Tomasz Napierala * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * $Id$ */ /* * CAM Target Layer, a SCSI device emulation subsystem. * * Author: Ken Merry */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct ctl_softc *control_softc = NULL; /* * Template mode pages. */ /* * Note that these are default values only. The actual values will be * filled in when the user does a mode sense. */ const static struct scsi_da_rw_recovery_page rw_er_page_default = { /*page_code*/SMS_RW_ERROR_RECOVERY_PAGE, /*page_length*/sizeof(struct scsi_da_rw_recovery_page) - 2, /*byte3*/SMS_RWER_AWRE|SMS_RWER_ARRE, /*read_retry_count*/0, /*correction_span*/0, /*head_offset_count*/0, /*data_strobe_offset_cnt*/0, /*byte8*/SMS_RWER_LBPERE, /*write_retry_count*/0, /*reserved2*/0, /*recovery_time_limit*/{0, 0}, }; const static struct scsi_da_rw_recovery_page rw_er_page_changeable = { /*page_code*/SMS_RW_ERROR_RECOVERY_PAGE, /*page_length*/sizeof(struct scsi_da_rw_recovery_page) - 2, /*byte3*/SMS_RWER_PER, /*read_retry_count*/0, /*correction_span*/0, /*head_offset_count*/0, /*data_strobe_offset_cnt*/0, /*byte8*/SMS_RWER_LBPERE, /*write_retry_count*/0, /*reserved2*/0, /*recovery_time_limit*/{0, 0}, }; const static struct scsi_format_page format_page_default = { /*page_code*/SMS_FORMAT_DEVICE_PAGE, /*page_length*/sizeof(struct scsi_format_page) - 2, /*tracks_per_zone*/ {0, 0}, /*alt_sectors_per_zone*/ {0, 0}, /*alt_tracks_per_zone*/ {0, 0}, /*alt_tracks_per_lun*/ {0, 0}, /*sectors_per_track*/ {(CTL_DEFAULT_SECTORS_PER_TRACK >> 8) & 0xff, CTL_DEFAULT_SECTORS_PER_TRACK & 0xff}, /*bytes_per_sector*/ {0, 0}, /*interleave*/ {0, 0}, /*track_skew*/ {0, 0}, /*cylinder_skew*/ {0, 0}, /*flags*/ SFP_HSEC, /*reserved*/ {0, 0, 0} }; const static struct scsi_format_page format_page_changeable = { /*page_code*/SMS_FORMAT_DEVICE_PAGE, /*page_length*/sizeof(struct scsi_format_page) - 2, /*tracks_per_zone*/ {0, 0}, /*alt_sectors_per_zone*/ {0, 0}, /*alt_tracks_per_zone*/ {0, 0}, /*alt_tracks_per_lun*/ {0, 0}, /*sectors_per_track*/ {0, 0}, /*bytes_per_sector*/ {0, 0}, /*interleave*/ {0, 0}, /*track_skew*/ {0, 0}, /*cylinder_skew*/ {0, 0}, /*flags*/ 0, /*reserved*/ {0, 0, 0} }; const static struct scsi_rigid_disk_page rigid_disk_page_default = { /*page_code*/SMS_RIGID_DISK_PAGE, /*page_length*/sizeof(struct scsi_rigid_disk_page) - 2, /*cylinders*/ {0, 0, 0}, /*heads*/ CTL_DEFAULT_HEADS, /*start_write_precomp*/ {0, 0, 0}, /*start_reduced_current*/ {0, 0, 0}, /*step_rate*/ {0, 0}, /*landing_zone_cylinder*/ {0, 0, 0}, /*rpl*/ SRDP_RPL_DISABLED, /*rotational_offset*/ 0, /*reserved1*/ 0, /*rotation_rate*/ {(CTL_DEFAULT_ROTATION_RATE >> 8) & 0xff, CTL_DEFAULT_ROTATION_RATE & 0xff}, /*reserved2*/ {0, 0} }; const static struct scsi_rigid_disk_page rigid_disk_page_changeable = { /*page_code*/SMS_RIGID_DISK_PAGE, /*page_length*/sizeof(struct scsi_rigid_disk_page) - 2, /*cylinders*/ {0, 0, 0}, /*heads*/ 0, /*start_write_precomp*/ {0, 0, 0}, /*start_reduced_current*/ {0, 0, 0}, /*step_rate*/ {0, 0}, /*landing_zone_cylinder*/ {0, 0, 0}, /*rpl*/ 0, /*rotational_offset*/ 0, /*reserved1*/ 0, /*rotation_rate*/ {0, 0}, /*reserved2*/ {0, 0} }; const static struct scsi_da_verify_recovery_page verify_er_page_default = { /*page_code*/SMS_VERIFY_ERROR_RECOVERY_PAGE, /*page_length*/sizeof(struct scsi_da_verify_recovery_page) - 2, /*byte3*/0, /*read_retry_count*/0, /*reserved*/{ 0, 0, 0, 0, 0, 0 }, /*recovery_time_limit*/{0, 0}, }; const static struct scsi_da_verify_recovery_page verify_er_page_changeable = { /*page_code*/SMS_VERIFY_ERROR_RECOVERY_PAGE, /*page_length*/sizeof(struct scsi_da_verify_recovery_page) - 2, /*byte3*/SMS_VER_PER, /*read_retry_count*/0, /*reserved*/{ 0, 0, 0, 0, 0, 0 }, /*recovery_time_limit*/{0, 0}, }; const static struct scsi_caching_page caching_page_default = { /*page_code*/SMS_CACHING_PAGE, /*page_length*/sizeof(struct scsi_caching_page) - 2, /*flags1*/ SCP_DISC | SCP_WCE, /*ret_priority*/ 0, /*disable_pf_transfer_len*/ {0xff, 0xff}, /*min_prefetch*/ {0, 0}, /*max_prefetch*/ {0xff, 0xff}, /*max_pf_ceiling*/ {0xff, 0xff}, /*flags2*/ 0, /*cache_segments*/ 0, /*cache_seg_size*/ {0, 0}, /*reserved*/ 0, /*non_cache_seg_size*/ {0, 0, 0} }; const static struct scsi_caching_page caching_page_changeable = { /*page_code*/SMS_CACHING_PAGE, /*page_length*/sizeof(struct scsi_caching_page) - 2, /*flags1*/ SCP_WCE | SCP_RCD, /*ret_priority*/ 0, /*disable_pf_transfer_len*/ {0, 0}, /*min_prefetch*/ {0, 0}, /*max_prefetch*/ {0, 0}, /*max_pf_ceiling*/ {0, 0}, /*flags2*/ 0, /*cache_segments*/ 0, /*cache_seg_size*/ {0, 0}, /*reserved*/ 0, /*non_cache_seg_size*/ {0, 0, 0} }; const static struct scsi_control_page control_page_default = { /*page_code*/SMS_CONTROL_MODE_PAGE, /*page_length*/sizeof(struct scsi_control_page) - 2, /*rlec*/0, /*queue_flags*/SCP_QUEUE_ALG_RESTRICTED, /*eca_and_aen*/0, /*flags4*/SCP_TAS, /*aen_holdoff_period*/{0, 0}, /*busy_timeout_period*/{0, 0}, /*extended_selftest_completion_time*/{0, 0} }; const static struct scsi_control_page control_page_changeable = { /*page_code*/SMS_CONTROL_MODE_PAGE, /*page_length*/sizeof(struct scsi_control_page) - 2, /*rlec*/SCP_DSENSE, /*queue_flags*/SCP_QUEUE_ALG_MASK | SCP_NUAR, /*eca_and_aen*/SCP_SWP, /*flags4*/0, /*aen_holdoff_period*/{0, 0}, /*busy_timeout_period*/{0, 0}, /*extended_selftest_completion_time*/{0, 0} }; #define CTL_CEM_LEN (sizeof(struct scsi_control_ext_page) - 4) const static struct scsi_control_ext_page control_ext_page_default = { /*page_code*/SMS_CONTROL_MODE_PAGE | SMPH_SPF, /*subpage_code*/0x01, /*page_length*/{CTL_CEM_LEN >> 8, CTL_CEM_LEN}, /*flags*/0, /*prio*/0, /*max_sense*/0 }; const static struct scsi_control_ext_page control_ext_page_changeable = { /*page_code*/SMS_CONTROL_MODE_PAGE | SMPH_SPF, /*subpage_code*/0x01, /*page_length*/{CTL_CEM_LEN >> 8, CTL_CEM_LEN}, /*flags*/0, /*prio*/0, /*max_sense*/0xff }; const static struct scsi_info_exceptions_page ie_page_default = { /*page_code*/SMS_INFO_EXCEPTIONS_PAGE, /*page_length*/sizeof(struct scsi_info_exceptions_page) - 2, /*info_flags*/SIEP_FLAGS_EWASC, /*mrie*/SIEP_MRIE_NO, /*interval_timer*/{0, 0, 0, 0}, /*report_count*/{0, 0, 0, 1} }; const static struct scsi_info_exceptions_page ie_page_changeable = { /*page_code*/SMS_INFO_EXCEPTIONS_PAGE, /*page_length*/sizeof(struct scsi_info_exceptions_page) - 2, /*info_flags*/SIEP_FLAGS_EWASC | SIEP_FLAGS_DEXCPT | SIEP_FLAGS_TEST | SIEP_FLAGS_LOGERR, /*mrie*/0x0f, /*interval_timer*/{0xff, 0xff, 0xff, 0xff}, /*report_count*/{0xff, 0xff, 0xff, 0xff} }; #define CTL_LBPM_LEN (sizeof(struct ctl_logical_block_provisioning_page) - 4) const static struct ctl_logical_block_provisioning_page lbp_page_default = {{ /*page_code*/SMS_INFO_EXCEPTIONS_PAGE | SMPH_SPF, /*subpage_code*/0x02, /*page_length*/{CTL_LBPM_LEN >> 8, CTL_LBPM_LEN}, /*flags*/0, /*reserved*/{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /*descr*/{}}, {{/*flags*/0, /*resource*/0x01, /*reserved*/{0, 0}, /*count*/{0, 0, 0, 0}}, {/*flags*/0, /*resource*/0x02, /*reserved*/{0, 0}, /*count*/{0, 0, 0, 0}}, {/*flags*/0, /*resource*/0xf1, /*reserved*/{0, 0}, /*count*/{0, 0, 0, 0}}, {/*flags*/0, /*resource*/0xf2, /*reserved*/{0, 0}, /*count*/{0, 0, 0, 0}} } }; const static struct ctl_logical_block_provisioning_page lbp_page_changeable = {{ /*page_code*/SMS_INFO_EXCEPTIONS_PAGE | SMPH_SPF, /*subpage_code*/0x02, /*page_length*/{CTL_LBPM_LEN >> 8, CTL_LBPM_LEN}, /*flags*/SLBPP_SITUA, /*reserved*/{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /*descr*/{}}, {{/*flags*/0, /*resource*/0, /*reserved*/{0, 0}, /*count*/{0, 0, 0, 0}}, {/*flags*/0, /*resource*/0, /*reserved*/{0, 0}, /*count*/{0, 0, 0, 0}}, {/*flags*/0, /*resource*/0, /*reserved*/{0, 0}, /*count*/{0, 0, 0, 0}}, {/*flags*/0, /*resource*/0, /*reserved*/{0, 0}, /*count*/{0, 0, 0, 0}} } }; const static struct scsi_cddvd_capabilities_page cddvd_page_default = { /*page_code*/SMS_CDDVD_CAPS_PAGE, /*page_length*/sizeof(struct scsi_cddvd_capabilities_page) - 2, /*caps1*/0x3f, /*caps2*/0x00, /*caps3*/0xf0, /*caps4*/0x00, /*caps5*/0x29, /*caps6*/0x00, /*obsolete*/{0, 0}, /*nvol_levels*/{0, 0}, /*buffer_size*/{8, 0}, /*obsolete2*/{0, 0}, /*reserved*/0, /*digital*/0, /*obsolete3*/0, /*copy_management*/0, /*reserved2*/0, /*rotation_control*/0, /*cur_write_speed*/0, /*num_speed_descr*/0, }; const static struct scsi_cddvd_capabilities_page cddvd_page_changeable = { /*page_code*/SMS_CDDVD_CAPS_PAGE, /*page_length*/sizeof(struct scsi_cddvd_capabilities_page) - 2, /*caps1*/0, /*caps2*/0, /*caps3*/0, /*caps4*/0, /*caps5*/0, /*caps6*/0, /*obsolete*/{0, 0}, /*nvol_levels*/{0, 0}, /*buffer_size*/{0, 0}, /*obsolete2*/{0, 0}, /*reserved*/0, /*digital*/0, /*obsolete3*/0, /*copy_management*/0, /*reserved2*/0, /*rotation_control*/0, /*cur_write_speed*/0, /*num_speed_descr*/0, }; SYSCTL_NODE(_kern_cam, OID_AUTO, ctl, CTLFLAG_RD, 0, "CAM Target Layer"); static int worker_threads = -1; SYSCTL_INT(_kern_cam_ctl, OID_AUTO, worker_threads, CTLFLAG_RDTUN, &worker_threads, 1, "Number of worker threads"); static int ctl_debug = CTL_DEBUG_NONE; SYSCTL_INT(_kern_cam_ctl, OID_AUTO, debug, CTLFLAG_RWTUN, &ctl_debug, 0, "Enabled debug flags"); static int ctl_lun_map_size = 1024; SYSCTL_INT(_kern_cam_ctl, OID_AUTO, lun_map_size, CTLFLAG_RWTUN, &ctl_lun_map_size, 0, "Size of per-port LUN map (max LUN + 1)"); #ifdef CTL_TIME_IO static int ctl_time_io_secs = CTL_TIME_IO_DEFAULT_SECS; SYSCTL_INT(_kern_cam_ctl, OID_AUTO, time_io_secs, CTLFLAG_RWTUN, &ctl_time_io_secs, 0, "Log requests taking more seconds"); #endif /* * Maximum number of LUNs we support. MUST be a power of 2. */ #define CTL_DEFAULT_MAX_LUNS 1024 static int ctl_max_luns = CTL_DEFAULT_MAX_LUNS; TUNABLE_INT("kern.cam.ctl.max_luns", &ctl_max_luns); SYSCTL_INT(_kern_cam_ctl, OID_AUTO, max_luns, CTLFLAG_RDTUN, &ctl_max_luns, CTL_DEFAULT_MAX_LUNS, "Maximum number of LUNs"); /* * Maximum number of ports registered at one time. */ #define CTL_DEFAULT_MAX_PORTS 256 static int ctl_max_ports = CTL_DEFAULT_MAX_PORTS; TUNABLE_INT("kern.cam.ctl.max_ports", &ctl_max_ports); SYSCTL_INT(_kern_cam_ctl, OID_AUTO, max_ports, CTLFLAG_RDTUN, &ctl_max_ports, CTL_DEFAULT_MAX_LUNS, "Maximum number of ports"); /* * Maximum number of initiators we support. */ #define CTL_MAX_INITIATORS (CTL_MAX_INIT_PER_PORT * ctl_max_ports) /* * Supported pages (0x00), Serial number (0x80), Device ID (0x83), * Extended INQUIRY Data (0x86), Mode Page Policy (0x87), - * SCSI Ports (0x88), Third-party Copy (0x8F), Block limits (0xB0), - * Block Device Characteristics (0xB1) and Logical Block Provisioning (0xB2) + * SCSI Ports (0x88), Third-party Copy (0x8F), SCSI Feature Sets (0x92), + * Block limits (0xB0), Block Device Characteristics (0xB1) and + * Logical Block Provisioning (0xB2) */ -#define SCSI_EVPD_NUM_SUPPORTED_PAGES 10 +#define SCSI_EVPD_NUM_SUPPORTED_PAGES 11 static void ctl_isc_event_handler(ctl_ha_channel chanel, ctl_ha_event event, int param); static void ctl_copy_sense_data(union ctl_ha_msg *src, union ctl_io *dest); static void ctl_copy_sense_data_back(union ctl_io *src, union ctl_ha_msg *dest); static int ctl_init(void); static int ctl_shutdown(void); static int ctl_open(struct cdev *dev, int flags, int fmt, struct thread *td); static int ctl_close(struct cdev *dev, int flags, int fmt, struct thread *td); static void ctl_serialize_other_sc_cmd(struct ctl_scsiio *ctsio); static void ctl_ioctl_fill_ooa(struct ctl_lun *lun, uint32_t *cur_fill_num, struct ctl_ooa *ooa_hdr, struct ctl_ooa_entry *kern_entries); static int ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td); static int ctl_alloc_lun(struct ctl_softc *ctl_softc, struct ctl_lun *lun, struct ctl_be_lun *be_lun); static int ctl_free_lun(struct ctl_lun *lun); static void ctl_create_lun(struct ctl_be_lun *be_lun); static int ctl_do_mode_select(union ctl_io *io); static int ctl_pro_preempt(struct ctl_softc *softc, struct ctl_lun *lun, uint64_t res_key, uint64_t sa_res_key, uint8_t type, uint32_t residx, struct ctl_scsiio *ctsio, struct scsi_per_res_out *cdb, struct scsi_per_res_out_parms* param); static void ctl_pro_preempt_other(struct ctl_lun *lun, union ctl_ha_msg *msg); static void ctl_hndl_per_res_out_on_other_sc(union ctl_io *io); static int ctl_inquiry_evpd_supported(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_serial(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_devid(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_eid(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_mpp(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_scsi_ports(struct ctl_scsiio *ctsio, int alloc_len); +static int ctl_inquiry_evpd_sfs(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_block_limits(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_bdc(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd_lbp(struct ctl_scsiio *ctsio, int alloc_len); static int ctl_inquiry_evpd(struct ctl_scsiio *ctsio); static int ctl_inquiry_std(struct ctl_scsiio *ctsio); static int ctl_get_lba_len(union ctl_io *io, uint64_t *lba, uint64_t *len); static ctl_action ctl_extent_check(union ctl_io *io1, union ctl_io *io2, bool seq); static ctl_action ctl_extent_check_seq(union ctl_io *io1, union ctl_io *io2); static ctl_action ctl_check_for_blockage(struct ctl_lun *lun, union ctl_io *pending_io, union ctl_io *ooa_io); static ctl_action ctl_check_ooa(struct ctl_lun *lun, union ctl_io *pending_io, union ctl_io **starting_io); static void ctl_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip); static void ctl_try_unblock_others(struct ctl_lun *lun, union ctl_io *io, bool skip); static int ctl_scsiio_lun_check(struct ctl_lun *lun, const struct ctl_cmd_entry *entry, struct ctl_scsiio *ctsio); static void ctl_failover_lun(union ctl_io *io); static int ctl_scsiio_precheck(struct ctl_softc *ctl_softc, struct ctl_scsiio *ctsio); static int ctl_scsiio(struct ctl_scsiio *ctsio); static int ctl_target_reset(union ctl_io *io); static void ctl_do_lun_reset(struct ctl_lun *lun, uint32_t initidx, ctl_ua_type ua_type); static int ctl_lun_reset(union ctl_io *io); static int ctl_abort_task(union ctl_io *io); static int ctl_abort_task_set(union ctl_io *io); static int ctl_query_task(union ctl_io *io, int task_set); static void ctl_i_t_nexus_loss(struct ctl_softc *softc, uint32_t initidx, ctl_ua_type ua_type); static int ctl_i_t_nexus_reset(union ctl_io *io); static int ctl_query_async_event(union ctl_io *io); static void ctl_run_task(union ctl_io *io); #ifdef CTL_IO_DELAY static void ctl_datamove_timer_wakeup(void *arg); static void ctl_done_timer_wakeup(void *arg); #endif /* CTL_IO_DELAY */ static void ctl_send_datamove_done(union ctl_io *io, int have_lock); static void ctl_datamove_remote_write_cb(struct ctl_ha_dt_req *rq); static int ctl_datamove_remote_dm_write_cb(union ctl_io *io); static void ctl_datamove_remote_write(union ctl_io *io); static int ctl_datamove_remote_dm_read_cb(union ctl_io *io); static void ctl_datamove_remote_read_cb(struct ctl_ha_dt_req *rq); static int ctl_datamove_remote_sgl_setup(union ctl_io *io); static int ctl_datamove_remote_xfer(union ctl_io *io, unsigned command, ctl_ha_dt_cb callback); static void ctl_datamove_remote_read(union ctl_io *io); static void ctl_datamove_remote(union ctl_io *io); static void ctl_process_done(union ctl_io *io); static void ctl_lun_thread(void *arg); static void ctl_thresh_thread(void *arg); static void ctl_work_thread(void *arg); static void ctl_enqueue_incoming(union ctl_io *io); static void ctl_enqueue_rtr(union ctl_io *io); static void ctl_enqueue_done(union ctl_io *io); static void ctl_enqueue_isc(union ctl_io *io); static const struct ctl_cmd_entry * ctl_get_cmd_entry(struct ctl_scsiio *ctsio, int *sa); static const struct ctl_cmd_entry * ctl_validate_command(struct ctl_scsiio *ctsio); static int ctl_cmd_applicable(uint8_t lun_type, const struct ctl_cmd_entry *entry); static int ctl_ha_init(void); static int ctl_ha_shutdown(void); static uint64_t ctl_get_prkey(struct ctl_lun *lun, uint32_t residx); static void ctl_clr_prkey(struct ctl_lun *lun, uint32_t residx); static void ctl_alloc_prkey(struct ctl_lun *lun, uint32_t residx); static void ctl_set_prkey(struct ctl_lun *lun, uint32_t residx, uint64_t key); /* * Load the serialization table. This isn't very pretty, but is probably * the easiest way to do it. */ #include "ctl_ser_table.c" /* * We only need to define open, close and ioctl routines for this driver. */ static struct cdevsw ctl_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_open = ctl_open, .d_close = ctl_close, .d_ioctl = ctl_ioctl, .d_name = "ctl", }; MALLOC_DEFINE(M_CTL, "ctlmem", "Memory used for CTL"); static int ctl_module_event_handler(module_t, int /*modeventtype_t*/, void *); static moduledata_t ctl_moduledata = { "ctl", ctl_module_event_handler, NULL }; DECLARE_MODULE(ctl, ctl_moduledata, SI_SUB_CONFIGURE, SI_ORDER_THIRD); MODULE_VERSION(ctl, 1); static struct ctl_frontend ha_frontend = { .name = "ha", .init = ctl_ha_init, .shutdown = ctl_ha_shutdown, }; static int ctl_ha_init(void) { struct ctl_softc *softc = control_softc; if (ctl_pool_create(softc, "othersc", CTL_POOL_ENTRIES_OTHER_SC, &softc->othersc_pool) != 0) return (ENOMEM); if (ctl_ha_msg_init(softc) != CTL_HA_STATUS_SUCCESS) { ctl_pool_free(softc->othersc_pool); return (EIO); } if (ctl_ha_msg_register(CTL_HA_CHAN_CTL, ctl_isc_event_handler) != CTL_HA_STATUS_SUCCESS) { ctl_ha_msg_destroy(softc); ctl_pool_free(softc->othersc_pool); return (EIO); } return (0); }; static int ctl_ha_shutdown(void) { struct ctl_softc *softc = control_softc; struct ctl_port *port; ctl_ha_msg_shutdown(softc); if (ctl_ha_msg_deregister(CTL_HA_CHAN_CTL) != CTL_HA_STATUS_SUCCESS) return (EIO); if (ctl_ha_msg_destroy(softc) != CTL_HA_STATUS_SUCCESS) return (EIO); ctl_pool_free(softc->othersc_pool); while ((port = STAILQ_FIRST(&ha_frontend.port_list)) != NULL) { ctl_port_deregister(port); free(port->port_name, M_CTL); free(port, M_CTL); } return (0); }; static void ctl_ha_datamove(union ctl_io *io) { struct ctl_lun *lun = CTL_LUN(io); struct ctl_sg_entry *sgl; union ctl_ha_msg msg; uint32_t sg_entries_sent; int do_sg_copy, i, j; memset(&msg.dt, 0, sizeof(msg.dt)); msg.hdr.msg_type = CTL_MSG_DATAMOVE; msg.hdr.original_sc = io->io_hdr.remote_io; msg.hdr.serializing_sc = io; msg.hdr.nexus = io->io_hdr.nexus; msg.hdr.status = io->io_hdr.status; msg.dt.flags = io->io_hdr.flags; /* * We convert everything into a S/G list here. We can't * pass by reference, only by value between controllers. * So we can't pass a pointer to the S/G list, only as many * S/G entries as we can fit in here. If it's possible for * us to get more than CTL_HA_MAX_SG_ENTRIES S/G entries, * then we need to break this up into multiple transfers. */ if (io->scsiio.kern_sg_entries == 0) { msg.dt.kern_sg_entries = 1; #if 0 if (io->io_hdr.flags & CTL_FLAG_BUS_ADDR) { msg.dt.sg_list[0].addr = io->scsiio.kern_data_ptr; } else { /* XXX KDM use busdma here! */ msg.dt.sg_list[0].addr = (void *)vtophys(io->scsiio.kern_data_ptr); } #else KASSERT((io->io_hdr.flags & CTL_FLAG_BUS_ADDR) == 0, ("HA does not support BUS_ADDR")); msg.dt.sg_list[0].addr = io->scsiio.kern_data_ptr; #endif msg.dt.sg_list[0].len = io->scsiio.kern_data_len; do_sg_copy = 0; } else { msg.dt.kern_sg_entries = io->scsiio.kern_sg_entries; do_sg_copy = 1; } msg.dt.kern_data_len = io->scsiio.kern_data_len; msg.dt.kern_total_len = io->scsiio.kern_total_len; msg.dt.kern_data_resid = io->scsiio.kern_data_resid; msg.dt.kern_rel_offset = io->scsiio.kern_rel_offset; msg.dt.sg_sequence = 0; /* * Loop until we've sent all of the S/G entries. On the * other end, we'll recompose these S/G entries into one * contiguous list before processing. */ for (sg_entries_sent = 0; sg_entries_sent < msg.dt.kern_sg_entries; msg.dt.sg_sequence++) { msg.dt.cur_sg_entries = MIN((sizeof(msg.dt.sg_list) / sizeof(msg.dt.sg_list[0])), msg.dt.kern_sg_entries - sg_entries_sent); if (do_sg_copy != 0) { sgl = (struct ctl_sg_entry *)io->scsiio.kern_data_ptr; for (i = sg_entries_sent, j = 0; i < msg.dt.cur_sg_entries; i++, j++) { #if 0 if (io->io_hdr.flags & CTL_FLAG_BUS_ADDR) { msg.dt.sg_list[j].addr = sgl[i].addr; } else { /* XXX KDM use busdma here! */ msg.dt.sg_list[j].addr = (void *)vtophys(sgl[i].addr); } #else KASSERT((io->io_hdr.flags & CTL_FLAG_BUS_ADDR) == 0, ("HA does not support BUS_ADDR")); msg.dt.sg_list[j].addr = sgl[i].addr; #endif msg.dt.sg_list[j].len = sgl[i].len; } } sg_entries_sent += msg.dt.cur_sg_entries; msg.dt.sg_last = (sg_entries_sent >= msg.dt.kern_sg_entries); if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg.dt) - sizeof(msg.dt.sg_list) + sizeof(struct ctl_sg_entry) * msg.dt.cur_sg_entries, M_WAITOK) > CTL_HA_STATUS_SUCCESS) { io->io_hdr.port_status = 31341; io->scsiio.be_move_done(io); return; } msg.dt.sent_sg_entries = sg_entries_sent; } /* * Officially handover the request from us to peer. * If failover has just happened, then we must return error. * If failover happen just after, then it is not our problem. */ if (lun) mtx_lock(&lun->lun_lock); if (io->io_hdr.flags & CTL_FLAG_FAILOVER) { if (lun) mtx_unlock(&lun->lun_lock); io->io_hdr.port_status = 31342; io->scsiio.be_move_done(io); return; } io->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE; io->io_hdr.flags |= CTL_FLAG_DMA_INPROG; if (lun) mtx_unlock(&lun->lun_lock); } static void ctl_ha_done(union ctl_io *io) { union ctl_ha_msg msg; if (io->io_hdr.io_type == CTL_IO_SCSI) { memset(&msg, 0, sizeof(msg)); msg.hdr.msg_type = CTL_MSG_FINISH_IO; msg.hdr.original_sc = io->io_hdr.remote_io; msg.hdr.nexus = io->io_hdr.nexus; msg.hdr.status = io->io_hdr.status; msg.scsi.scsi_status = io->scsiio.scsi_status; msg.scsi.tag_num = io->scsiio.tag_num; msg.scsi.tag_type = io->scsiio.tag_type; msg.scsi.sense_len = io->scsiio.sense_len; memcpy(&msg.scsi.sense_data, &io->scsiio.sense_data, io->scsiio.sense_len); ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg.scsi) - sizeof(msg.scsi.sense_data) + msg.scsi.sense_len, M_WAITOK); } ctl_free_io(io); } static void ctl_isc_handler_finish_xfer(struct ctl_softc *ctl_softc, union ctl_ha_msg *msg_info) { struct ctl_scsiio *ctsio; if (msg_info->hdr.original_sc == NULL) { printf("%s: original_sc == NULL!\n", __func__); /* XXX KDM now what? */ return; } ctsio = &msg_info->hdr.original_sc->scsiio; ctsio->io_hdr.flags |= CTL_FLAG_IO_ACTIVE; ctsio->io_hdr.msg_type = CTL_MSG_FINISH_IO; ctsio->io_hdr.status = msg_info->hdr.status; ctsio->scsi_status = msg_info->scsi.scsi_status; ctsio->sense_len = msg_info->scsi.sense_len; memcpy(&ctsio->sense_data, &msg_info->scsi.sense_data, msg_info->scsi.sense_len); ctl_enqueue_isc((union ctl_io *)ctsio); } static void ctl_isc_handler_finish_ser_only(struct ctl_softc *ctl_softc, union ctl_ha_msg *msg_info) { struct ctl_scsiio *ctsio; if (msg_info->hdr.serializing_sc == NULL) { printf("%s: serializing_sc == NULL!\n", __func__); /* XXX KDM now what? */ return; } ctsio = &msg_info->hdr.serializing_sc->scsiio; ctsio->io_hdr.msg_type = CTL_MSG_FINISH_IO; ctl_enqueue_isc((union ctl_io *)ctsio); } void ctl_isc_announce_lun(struct ctl_lun *lun) { struct ctl_softc *softc = lun->ctl_softc; union ctl_ha_msg *msg; struct ctl_ha_msg_lun_pr_key pr_key; int i, k; if (softc->ha_link != CTL_HA_LINK_ONLINE) return; mtx_lock(&lun->lun_lock); i = sizeof(msg->lun); if (lun->lun_devid) i += lun->lun_devid->len; i += sizeof(pr_key) * lun->pr_key_count; alloc: mtx_unlock(&lun->lun_lock); msg = malloc(i, M_CTL, M_WAITOK); mtx_lock(&lun->lun_lock); k = sizeof(msg->lun); if (lun->lun_devid) k += lun->lun_devid->len; k += sizeof(pr_key) * lun->pr_key_count; if (i < k) { free(msg, M_CTL); i = k; goto alloc; } bzero(&msg->lun, sizeof(msg->lun)); msg->hdr.msg_type = CTL_MSG_LUN_SYNC; msg->hdr.nexus.targ_lun = lun->lun; msg->hdr.nexus.targ_mapped_lun = lun->lun; msg->lun.flags = lun->flags; msg->lun.pr_generation = lun->pr_generation; msg->lun.pr_res_idx = lun->pr_res_idx; msg->lun.pr_res_type = lun->pr_res_type; msg->lun.pr_key_count = lun->pr_key_count; i = 0; if (lun->lun_devid) { msg->lun.lun_devid_len = lun->lun_devid->len; memcpy(&msg->lun.data[i], lun->lun_devid->data, msg->lun.lun_devid_len); i += msg->lun.lun_devid_len; } for (k = 0; k < CTL_MAX_INITIATORS; k++) { if ((pr_key.pr_key = ctl_get_prkey(lun, k)) == 0) continue; pr_key.pr_iid = k; memcpy(&msg->lun.data[i], &pr_key, sizeof(pr_key)); i += sizeof(pr_key); } mtx_unlock(&lun->lun_lock); ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg->port, sizeof(msg->port) + i, M_WAITOK); free(msg, M_CTL); if (lun->flags & CTL_LUN_PRIMARY_SC) { for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { ctl_isc_announce_mode(lun, -1, lun->mode_pages.index[i].page_code & SMPH_PC_MASK, lun->mode_pages.index[i].subpage); } } } void ctl_isc_announce_port(struct ctl_port *port) { struct ctl_softc *softc = port->ctl_softc; union ctl_ha_msg *msg; int i; if (port->targ_port < softc->port_min || port->targ_port >= softc->port_max || softc->ha_link != CTL_HA_LINK_ONLINE) return; i = sizeof(msg->port) + strlen(port->port_name) + 1; if (port->lun_map) i += port->lun_map_size * sizeof(uint32_t); if (port->port_devid) i += port->port_devid->len; if (port->target_devid) i += port->target_devid->len; if (port->init_devid) i += port->init_devid->len; msg = malloc(i, M_CTL, M_WAITOK); bzero(&msg->port, sizeof(msg->port)); msg->hdr.msg_type = CTL_MSG_PORT_SYNC; msg->hdr.nexus.targ_port = port->targ_port; msg->port.port_type = port->port_type; msg->port.physical_port = port->physical_port; msg->port.virtual_port = port->virtual_port; msg->port.status = port->status; i = 0; msg->port.name_len = sprintf(&msg->port.data[i], "%d:%s", softc->ha_id, port->port_name) + 1; i += msg->port.name_len; if (port->lun_map) { msg->port.lun_map_len = port->lun_map_size * sizeof(uint32_t); memcpy(&msg->port.data[i], port->lun_map, msg->port.lun_map_len); i += msg->port.lun_map_len; } if (port->port_devid) { msg->port.port_devid_len = port->port_devid->len; memcpy(&msg->port.data[i], port->port_devid->data, msg->port.port_devid_len); i += msg->port.port_devid_len; } if (port->target_devid) { msg->port.target_devid_len = port->target_devid->len; memcpy(&msg->port.data[i], port->target_devid->data, msg->port.target_devid_len); i += msg->port.target_devid_len; } if (port->init_devid) { msg->port.init_devid_len = port->init_devid->len; memcpy(&msg->port.data[i], port->init_devid->data, msg->port.init_devid_len); i += msg->port.init_devid_len; } ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg->port, sizeof(msg->port) + i, M_WAITOK); free(msg, M_CTL); } void ctl_isc_announce_iid(struct ctl_port *port, int iid) { struct ctl_softc *softc = port->ctl_softc; union ctl_ha_msg *msg; int i, l; if (port->targ_port < softc->port_min || port->targ_port >= softc->port_max || softc->ha_link != CTL_HA_LINK_ONLINE) return; mtx_lock(&softc->ctl_lock); i = sizeof(msg->iid); l = 0; if (port->wwpn_iid[iid].name) l = strlen(port->wwpn_iid[iid].name) + 1; i += l; msg = malloc(i, M_CTL, M_NOWAIT); if (msg == NULL) { mtx_unlock(&softc->ctl_lock); return; } bzero(&msg->iid, sizeof(msg->iid)); msg->hdr.msg_type = CTL_MSG_IID_SYNC; msg->hdr.nexus.targ_port = port->targ_port; msg->hdr.nexus.initid = iid; msg->iid.in_use = port->wwpn_iid[iid].in_use; msg->iid.name_len = l; msg->iid.wwpn = port->wwpn_iid[iid].wwpn; if (port->wwpn_iid[iid].name) strlcpy(msg->iid.data, port->wwpn_iid[iid].name, l); mtx_unlock(&softc->ctl_lock); ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg->iid, i, M_NOWAIT); free(msg, M_CTL); } void ctl_isc_announce_mode(struct ctl_lun *lun, uint32_t initidx, uint8_t page, uint8_t subpage) { struct ctl_softc *softc = lun->ctl_softc; union ctl_ha_msg msg; u_int i; if (softc->ha_link != CTL_HA_LINK_ONLINE) return; for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { if ((lun->mode_pages.index[i].page_code & SMPH_PC_MASK) == page && lun->mode_pages.index[i].subpage == subpage) break; } if (i == CTL_NUM_MODE_PAGES) return; /* Don't try to replicate pages not present on this device. */ if (lun->mode_pages.index[i].page_data == NULL) return; bzero(&msg.mode, sizeof(msg.mode)); msg.hdr.msg_type = CTL_MSG_MODE_SYNC; msg.hdr.nexus.targ_port = initidx / CTL_MAX_INIT_PER_PORT; msg.hdr.nexus.initid = initidx % CTL_MAX_INIT_PER_PORT; msg.hdr.nexus.targ_lun = lun->lun; msg.hdr.nexus.targ_mapped_lun = lun->lun; msg.mode.page_code = page; msg.mode.subpage = subpage; msg.mode.page_len = lun->mode_pages.index[i].page_len; memcpy(msg.mode.data, lun->mode_pages.index[i].page_data, msg.mode.page_len); ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg.mode, sizeof(msg.mode), M_WAITOK); } static void ctl_isc_ha_link_up(struct ctl_softc *softc) { struct ctl_port *port; struct ctl_lun *lun; union ctl_ha_msg msg; int i; /* Announce this node parameters to peer for validation. */ msg.login.msg_type = CTL_MSG_LOGIN; msg.login.version = CTL_HA_VERSION; msg.login.ha_mode = softc->ha_mode; msg.login.ha_id = softc->ha_id; msg.login.max_luns = ctl_max_luns; msg.login.max_ports = ctl_max_ports; msg.login.max_init_per_port = CTL_MAX_INIT_PER_PORT; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg.login, sizeof(msg.login), M_WAITOK); STAILQ_FOREACH(port, &softc->port_list, links) { ctl_isc_announce_port(port); for (i = 0; i < CTL_MAX_INIT_PER_PORT; i++) { if (port->wwpn_iid[i].in_use) ctl_isc_announce_iid(port, i); } } STAILQ_FOREACH(lun, &softc->lun_list, links) ctl_isc_announce_lun(lun); } static void ctl_isc_ha_link_down(struct ctl_softc *softc) { struct ctl_port *port; struct ctl_lun *lun; union ctl_io *io; int i; mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(lun, &softc->lun_list, links) { mtx_lock(&lun->lun_lock); if (lun->flags & CTL_LUN_PEER_SC_PRIMARY) { lun->flags &= ~CTL_LUN_PEER_SC_PRIMARY; ctl_est_ua_all(lun, -1, CTL_UA_ASYM_ACC_CHANGE); } mtx_unlock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); io = ctl_alloc_io(softc->othersc_pool); mtx_lock(&softc->ctl_lock); ctl_zero_io(io); io->io_hdr.msg_type = CTL_MSG_FAILOVER; io->io_hdr.nexus.targ_mapped_lun = lun->lun; ctl_enqueue_isc(io); } STAILQ_FOREACH(port, &softc->port_list, links) { if (port->targ_port >= softc->port_min && port->targ_port < softc->port_max) continue; port->status &= ~CTL_PORT_STATUS_ONLINE; for (i = 0; i < CTL_MAX_INIT_PER_PORT; i++) { port->wwpn_iid[i].in_use = 0; free(port->wwpn_iid[i].name, M_CTL); port->wwpn_iid[i].name = NULL; } } mtx_unlock(&softc->ctl_lock); } static void ctl_isc_ua(struct ctl_softc *softc, union ctl_ha_msg *msg, int len) { struct ctl_lun *lun; uint32_t iid = ctl_get_initindex(&msg->hdr.nexus); mtx_lock(&softc->ctl_lock); if (msg->hdr.nexus.targ_mapped_lun >= ctl_max_luns || (lun = softc->ctl_luns[msg->hdr.nexus.targ_mapped_lun]) == NULL) { mtx_unlock(&softc->ctl_lock); return; } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); if (msg->ua.ua_type == CTL_UA_THIN_PROV_THRES && msg->ua.ua_set) memcpy(lun->ua_tpt_info, msg->ua.ua_info, 8); if (msg->ua.ua_all) { if (msg->ua.ua_set) ctl_est_ua_all(lun, iid, msg->ua.ua_type); else ctl_clr_ua_all(lun, iid, msg->ua.ua_type); } else { if (msg->ua.ua_set) ctl_est_ua(lun, iid, msg->ua.ua_type); else ctl_clr_ua(lun, iid, msg->ua.ua_type); } mtx_unlock(&lun->lun_lock); } static void ctl_isc_lun_sync(struct ctl_softc *softc, union ctl_ha_msg *msg, int len) { struct ctl_lun *lun; struct ctl_ha_msg_lun_pr_key pr_key; int i, k; ctl_lun_flags oflags; uint32_t targ_lun; targ_lun = msg->hdr.nexus.targ_mapped_lun; mtx_lock(&softc->ctl_lock); if (targ_lun >= ctl_max_luns || (lun = softc->ctl_luns[targ_lun]) == NULL) { mtx_unlock(&softc->ctl_lock); return; } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); if (lun->flags & CTL_LUN_DISABLED) { mtx_unlock(&lun->lun_lock); return; } i = (lun->lun_devid != NULL) ? lun->lun_devid->len : 0; if (msg->lun.lun_devid_len != i || (i > 0 && memcmp(&msg->lun.data[0], lun->lun_devid->data, i) != 0)) { mtx_unlock(&lun->lun_lock); printf("%s: Received conflicting HA LUN %d\n", __func__, targ_lun); return; } else { /* Record whether peer is primary. */ oflags = lun->flags; if ((msg->lun.flags & CTL_LUN_PRIMARY_SC) && (msg->lun.flags & CTL_LUN_DISABLED) == 0) lun->flags |= CTL_LUN_PEER_SC_PRIMARY; else lun->flags &= ~CTL_LUN_PEER_SC_PRIMARY; if (oflags != lun->flags) ctl_est_ua_all(lun, -1, CTL_UA_ASYM_ACC_CHANGE); /* If peer is primary and we are not -- use data */ if ((lun->flags & CTL_LUN_PRIMARY_SC) == 0 && (lun->flags & CTL_LUN_PEER_SC_PRIMARY)) { lun->pr_generation = msg->lun.pr_generation; lun->pr_res_idx = msg->lun.pr_res_idx; lun->pr_res_type = msg->lun.pr_res_type; lun->pr_key_count = msg->lun.pr_key_count; for (k = 0; k < CTL_MAX_INITIATORS; k++) ctl_clr_prkey(lun, k); for (k = 0; k < msg->lun.pr_key_count; k++) { memcpy(&pr_key, &msg->lun.data[i], sizeof(pr_key)); ctl_alloc_prkey(lun, pr_key.pr_iid); ctl_set_prkey(lun, pr_key.pr_iid, pr_key.pr_key); i += sizeof(pr_key); } } mtx_unlock(&lun->lun_lock); CTL_DEBUG_PRINT(("%s: Known LUN %d, peer is %s\n", __func__, targ_lun, (msg->lun.flags & CTL_LUN_PRIMARY_SC) ? "primary" : "secondary")); /* If we are primary but peer doesn't know -- notify */ if ((lun->flags & CTL_LUN_PRIMARY_SC) && (msg->lun.flags & CTL_LUN_PEER_SC_PRIMARY) == 0) ctl_isc_announce_lun(lun); } } static void ctl_isc_port_sync(struct ctl_softc *softc, union ctl_ha_msg *msg, int len) { struct ctl_port *port; struct ctl_lun *lun; int i, new; port = softc->ctl_ports[msg->hdr.nexus.targ_port]; if (port == NULL) { CTL_DEBUG_PRINT(("%s: New port %d\n", __func__, msg->hdr.nexus.targ_port)); new = 1; port = malloc(sizeof(*port), M_CTL, M_WAITOK | M_ZERO); port->frontend = &ha_frontend; port->targ_port = msg->hdr.nexus.targ_port; port->fe_datamove = ctl_ha_datamove; port->fe_done = ctl_ha_done; } else if (port->frontend == &ha_frontend) { CTL_DEBUG_PRINT(("%s: Updated port %d\n", __func__, msg->hdr.nexus.targ_port)); new = 0; } else { printf("%s: Received conflicting HA port %d\n", __func__, msg->hdr.nexus.targ_port); return; } port->port_type = msg->port.port_type; port->physical_port = msg->port.physical_port; port->virtual_port = msg->port.virtual_port; port->status = msg->port.status; i = 0; free(port->port_name, M_CTL); port->port_name = strndup(&msg->port.data[i], msg->port.name_len, M_CTL); i += msg->port.name_len; if (msg->port.lun_map_len != 0) { if (port->lun_map == NULL || port->lun_map_size * sizeof(uint32_t) < msg->port.lun_map_len) { port->lun_map_size = 0; free(port->lun_map, M_CTL); port->lun_map = malloc(msg->port.lun_map_len, M_CTL, M_WAITOK); } memcpy(port->lun_map, &msg->port.data[i], msg->port.lun_map_len); port->lun_map_size = msg->port.lun_map_len / sizeof(uint32_t); i += msg->port.lun_map_len; } else { port->lun_map_size = 0; free(port->lun_map, M_CTL); port->lun_map = NULL; } if (msg->port.port_devid_len != 0) { if (port->port_devid == NULL || port->port_devid->len < msg->port.port_devid_len) { free(port->port_devid, M_CTL); port->port_devid = malloc(sizeof(struct ctl_devid) + msg->port.port_devid_len, M_CTL, M_WAITOK); } memcpy(port->port_devid->data, &msg->port.data[i], msg->port.port_devid_len); port->port_devid->len = msg->port.port_devid_len; i += msg->port.port_devid_len; } else { free(port->port_devid, M_CTL); port->port_devid = NULL; } if (msg->port.target_devid_len != 0) { if (port->target_devid == NULL || port->target_devid->len < msg->port.target_devid_len) { free(port->target_devid, M_CTL); port->target_devid = malloc(sizeof(struct ctl_devid) + msg->port.target_devid_len, M_CTL, M_WAITOK); } memcpy(port->target_devid->data, &msg->port.data[i], msg->port.target_devid_len); port->target_devid->len = msg->port.target_devid_len; i += msg->port.target_devid_len; } else { free(port->target_devid, M_CTL); port->target_devid = NULL; } if (msg->port.init_devid_len != 0) { if (port->init_devid == NULL || port->init_devid->len < msg->port.init_devid_len) { free(port->init_devid, M_CTL); port->init_devid = malloc(sizeof(struct ctl_devid) + msg->port.init_devid_len, M_CTL, M_WAITOK); } memcpy(port->init_devid->data, &msg->port.data[i], msg->port.init_devid_len); port->init_devid->len = msg->port.init_devid_len; i += msg->port.init_devid_len; } else { free(port->init_devid, M_CTL); port->init_devid = NULL; } if (new) { if (ctl_port_register(port) != 0) { printf("%s: ctl_port_register() failed with error\n", __func__); } } mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(lun, &softc->lun_list, links) { if (ctl_lun_map_to_port(port, lun->lun) == UINT32_MAX) continue; mtx_lock(&lun->lun_lock); ctl_est_ua_all(lun, -1, CTL_UA_INQ_CHANGE); mtx_unlock(&lun->lun_lock); } mtx_unlock(&softc->ctl_lock); } static void ctl_isc_iid_sync(struct ctl_softc *softc, union ctl_ha_msg *msg, int len) { struct ctl_port *port; int iid; port = softc->ctl_ports[msg->hdr.nexus.targ_port]; if (port == NULL) { printf("%s: Received IID for unknown port %d\n", __func__, msg->hdr.nexus.targ_port); return; } iid = msg->hdr.nexus.initid; if (port->wwpn_iid[iid].in_use != 0 && msg->iid.in_use == 0) ctl_i_t_nexus_loss(softc, iid, CTL_UA_POWERON); port->wwpn_iid[iid].in_use = msg->iid.in_use; port->wwpn_iid[iid].wwpn = msg->iid.wwpn; free(port->wwpn_iid[iid].name, M_CTL); if (msg->iid.name_len) { port->wwpn_iid[iid].name = strndup(&msg->iid.data[0], msg->iid.name_len, M_CTL); } else port->wwpn_iid[iid].name = NULL; } static void ctl_isc_login(struct ctl_softc *softc, union ctl_ha_msg *msg, int len) { if (msg->login.version != CTL_HA_VERSION) { printf("CTL HA peers have different versions %d != %d\n", msg->login.version, CTL_HA_VERSION); ctl_ha_msg_abort(CTL_HA_CHAN_CTL); return; } if (msg->login.ha_mode != softc->ha_mode) { printf("CTL HA peers have different ha_mode %d != %d\n", msg->login.ha_mode, softc->ha_mode); ctl_ha_msg_abort(CTL_HA_CHAN_CTL); return; } if (msg->login.ha_id == softc->ha_id) { printf("CTL HA peers have same ha_id %d\n", msg->login.ha_id); ctl_ha_msg_abort(CTL_HA_CHAN_CTL); return; } if (msg->login.max_luns != ctl_max_luns || msg->login.max_ports != ctl_max_ports || msg->login.max_init_per_port != CTL_MAX_INIT_PER_PORT) { printf("CTL HA peers have different limits\n"); ctl_ha_msg_abort(CTL_HA_CHAN_CTL); return; } } static void ctl_isc_mode_sync(struct ctl_softc *softc, union ctl_ha_msg *msg, int len) { struct ctl_lun *lun; u_int i; uint32_t initidx, targ_lun; targ_lun = msg->hdr.nexus.targ_mapped_lun; mtx_lock(&softc->ctl_lock); if (targ_lun >= ctl_max_luns || (lun = softc->ctl_luns[targ_lun]) == NULL) { mtx_unlock(&softc->ctl_lock); return; } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); if (lun->flags & CTL_LUN_DISABLED) { mtx_unlock(&lun->lun_lock); return; } for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { if ((lun->mode_pages.index[i].page_code & SMPH_PC_MASK) == msg->mode.page_code && lun->mode_pages.index[i].subpage == msg->mode.subpage) break; } if (i == CTL_NUM_MODE_PAGES) { mtx_unlock(&lun->lun_lock); return; } memcpy(lun->mode_pages.index[i].page_data, msg->mode.data, lun->mode_pages.index[i].page_len); initidx = ctl_get_initindex(&msg->hdr.nexus); if (initidx != -1) ctl_est_ua_all(lun, initidx, CTL_UA_MODE_CHANGE); mtx_unlock(&lun->lun_lock); } /* * ISC (Inter Shelf Communication) event handler. Events from the HA * subsystem come in here. */ static void ctl_isc_event_handler(ctl_ha_channel channel, ctl_ha_event event, int param) { struct ctl_softc *softc = control_softc; union ctl_io *io; struct ctl_prio *presio; ctl_ha_status isc_status; CTL_DEBUG_PRINT(("CTL: Isc Msg event %d\n", event)); if (event == CTL_HA_EVT_MSG_RECV) { union ctl_ha_msg *msg, msgbuf; if (param > sizeof(msgbuf)) msg = malloc(param, M_CTL, M_WAITOK); else msg = &msgbuf; isc_status = ctl_ha_msg_recv(CTL_HA_CHAN_CTL, msg, param, M_WAITOK); if (isc_status != CTL_HA_STATUS_SUCCESS) { printf("%s: Error receiving message: %d\n", __func__, isc_status); if (msg != &msgbuf) free(msg, M_CTL); return; } CTL_DEBUG_PRINT(("CTL: msg_type %d\n", msg->msg_type)); switch (msg->hdr.msg_type) { case CTL_MSG_SERIALIZE: io = ctl_alloc_io(softc->othersc_pool); ctl_zero_io(io); // populate ctsio from msg io->io_hdr.io_type = CTL_IO_SCSI; io->io_hdr.msg_type = CTL_MSG_SERIALIZE; io->io_hdr.remote_io = msg->hdr.original_sc; io->io_hdr.flags |= CTL_FLAG_FROM_OTHER_SC | CTL_FLAG_IO_ACTIVE; /* * If we're in serialization-only mode, we don't * want to go through full done processing. Thus * the COPY flag. * * XXX KDM add another flag that is more specific. */ if (softc->ha_mode != CTL_HA_MODE_XFER) io->io_hdr.flags |= CTL_FLAG_INT_COPY; io->io_hdr.nexus = msg->hdr.nexus; io->scsiio.tag_num = msg->scsi.tag_num; io->scsiio.tag_type = msg->scsi.tag_type; #ifdef CTL_TIME_IO io->io_hdr.start_time = time_uptime; getbinuptime(&io->io_hdr.start_bt); #endif /* CTL_TIME_IO */ io->scsiio.cdb_len = msg->scsi.cdb_len; memcpy(io->scsiio.cdb, msg->scsi.cdb, CTL_MAX_CDBLEN); if (softc->ha_mode == CTL_HA_MODE_XFER) { const struct ctl_cmd_entry *entry; entry = ctl_get_cmd_entry(&io->scsiio, NULL); io->io_hdr.flags &= ~CTL_FLAG_DATA_MASK; io->io_hdr.flags |= entry->flags & CTL_FLAG_DATA_MASK; } ctl_enqueue_isc(io); break; /* Performed on the Originating SC, XFER mode only */ case CTL_MSG_DATAMOVE: { struct ctl_sg_entry *sgl; int i, j; io = msg->hdr.original_sc; if (io == NULL) { printf("%s: original_sc == NULL!\n", __func__); /* XXX KDM do something here */ break; } io->io_hdr.msg_type = CTL_MSG_DATAMOVE; io->io_hdr.flags |= CTL_FLAG_IO_ACTIVE; /* * Keep track of this, we need to send it back over * when the datamove is complete. */ io->io_hdr.remote_io = msg->hdr.serializing_sc; if (msg->hdr.status == CTL_SUCCESS) io->io_hdr.status = msg->hdr.status; if (msg->dt.sg_sequence == 0) { #ifdef CTL_TIME_IO getbinuptime(&io->io_hdr.dma_start_bt); #endif i = msg->dt.kern_sg_entries + msg->dt.kern_data_len / CTL_HA_DATAMOVE_SEGMENT + 1; sgl = malloc(sizeof(*sgl) * i, M_CTL, M_WAITOK | M_ZERO); CTL_RSGL(io) = sgl; CTL_LSGL(io) = &sgl[msg->dt.kern_sg_entries]; io->scsiio.kern_data_ptr = (uint8_t *)sgl; io->scsiio.kern_sg_entries = msg->dt.kern_sg_entries; io->scsiio.rem_sg_entries = msg->dt.kern_sg_entries; io->scsiio.kern_data_len = msg->dt.kern_data_len; io->scsiio.kern_total_len = msg->dt.kern_total_len; io->scsiio.kern_data_resid = msg->dt.kern_data_resid; io->scsiio.kern_rel_offset = msg->dt.kern_rel_offset; io->io_hdr.flags &= ~CTL_FLAG_BUS_ADDR; io->io_hdr.flags |= msg->dt.flags & CTL_FLAG_BUS_ADDR; } else sgl = (struct ctl_sg_entry *) io->scsiio.kern_data_ptr; for (i = msg->dt.sent_sg_entries, j = 0; i < (msg->dt.sent_sg_entries + msg->dt.cur_sg_entries); i++, j++) { sgl[i].addr = msg->dt.sg_list[j].addr; sgl[i].len = msg->dt.sg_list[j].len; } /* * If this is the last piece of the I/O, we've got * the full S/G list. Queue processing in the thread. * Otherwise wait for the next piece. */ if (msg->dt.sg_last != 0) ctl_enqueue_isc(io); break; } /* Performed on the Serializing (primary) SC, XFER mode only */ case CTL_MSG_DATAMOVE_DONE: { if (msg->hdr.serializing_sc == NULL) { printf("%s: serializing_sc == NULL!\n", __func__); /* XXX KDM now what? */ break; } /* * We grab the sense information here in case * there was a failure, so we can return status * back to the initiator. */ io = msg->hdr.serializing_sc; io->io_hdr.msg_type = CTL_MSG_DATAMOVE_DONE; io->io_hdr.flags &= ~CTL_FLAG_DMA_INPROG; io->io_hdr.flags |= CTL_FLAG_IO_ACTIVE; io->io_hdr.port_status = msg->scsi.port_status; io->scsiio.kern_data_resid = msg->scsi.kern_data_resid; if (msg->hdr.status != CTL_STATUS_NONE) { io->io_hdr.status = msg->hdr.status; io->scsiio.scsi_status = msg->scsi.scsi_status; io->scsiio.sense_len = msg->scsi.sense_len; memcpy(&io->scsiio.sense_data, &msg->scsi.sense_data, msg->scsi.sense_len); if (msg->hdr.status == CTL_SUCCESS) io->io_hdr.flags |= CTL_FLAG_STATUS_SENT; } ctl_enqueue_isc(io); break; } /* Preformed on Originating SC, SER_ONLY mode */ case CTL_MSG_R2R: io = msg->hdr.original_sc; if (io == NULL) { printf("%s: original_sc == NULL!\n", __func__); break; } io->io_hdr.flags |= CTL_FLAG_IO_ACTIVE; io->io_hdr.msg_type = CTL_MSG_R2R; io->io_hdr.remote_io = msg->hdr.serializing_sc; ctl_enqueue_isc(io); break; /* * Performed on Serializing(i.e. primary SC) SC in SER_ONLY * mode. * Performed on the Originating (i.e. secondary) SC in XFER * mode */ case CTL_MSG_FINISH_IO: if (softc->ha_mode == CTL_HA_MODE_XFER) ctl_isc_handler_finish_xfer(softc, msg); else ctl_isc_handler_finish_ser_only(softc, msg); break; /* Preformed on Originating SC */ case CTL_MSG_BAD_JUJU: io = msg->hdr.original_sc; if (io == NULL) { printf("%s: Bad JUJU!, original_sc is NULL!\n", __func__); break; } ctl_copy_sense_data(msg, io); /* * IO should have already been cleaned up on other * SC so clear this flag so we won't send a message * back to finish the IO there. */ io->io_hdr.flags &= ~CTL_FLAG_SENT_2OTHER_SC; io->io_hdr.flags |= CTL_FLAG_IO_ACTIVE; /* io = msg->hdr.serializing_sc; */ io->io_hdr.msg_type = CTL_MSG_BAD_JUJU; ctl_enqueue_isc(io); break; /* Handle resets sent from the other side */ case CTL_MSG_MANAGE_TASKS: { struct ctl_taskio *taskio; taskio = (struct ctl_taskio *)ctl_alloc_io( softc->othersc_pool); ctl_zero_io((union ctl_io *)taskio); taskio->io_hdr.io_type = CTL_IO_TASK; taskio->io_hdr.flags |= CTL_FLAG_FROM_OTHER_SC; taskio->io_hdr.nexus = msg->hdr.nexus; taskio->task_action = msg->task.task_action; taskio->tag_num = msg->task.tag_num; taskio->tag_type = msg->task.tag_type; #ifdef CTL_TIME_IO taskio->io_hdr.start_time = time_uptime; getbinuptime(&taskio->io_hdr.start_bt); #endif /* CTL_TIME_IO */ ctl_run_task((union ctl_io *)taskio); break; } /* Persistent Reserve action which needs attention */ case CTL_MSG_PERS_ACTION: presio = (struct ctl_prio *)ctl_alloc_io( softc->othersc_pool); ctl_zero_io((union ctl_io *)presio); presio->io_hdr.msg_type = CTL_MSG_PERS_ACTION; presio->io_hdr.flags |= CTL_FLAG_FROM_OTHER_SC; presio->io_hdr.nexus = msg->hdr.nexus; presio->pr_msg = msg->pr; ctl_enqueue_isc((union ctl_io *)presio); break; case CTL_MSG_UA: ctl_isc_ua(softc, msg, param); break; case CTL_MSG_PORT_SYNC: ctl_isc_port_sync(softc, msg, param); break; case CTL_MSG_LUN_SYNC: ctl_isc_lun_sync(softc, msg, param); break; case CTL_MSG_IID_SYNC: ctl_isc_iid_sync(softc, msg, param); break; case CTL_MSG_LOGIN: ctl_isc_login(softc, msg, param); break; case CTL_MSG_MODE_SYNC: ctl_isc_mode_sync(softc, msg, param); break; default: printf("Received HA message of unknown type %d\n", msg->hdr.msg_type); ctl_ha_msg_abort(CTL_HA_CHAN_CTL); break; } if (msg != &msgbuf) free(msg, M_CTL); } else if (event == CTL_HA_EVT_LINK_CHANGE) { printf("CTL: HA link status changed from %d to %d\n", softc->ha_link, param); if (param == softc->ha_link) return; if (softc->ha_link == CTL_HA_LINK_ONLINE) { softc->ha_link = param; ctl_isc_ha_link_down(softc); } else { softc->ha_link = param; if (softc->ha_link == CTL_HA_LINK_ONLINE) ctl_isc_ha_link_up(softc); } return; } else { printf("ctl_isc_event_handler: Unknown event %d\n", event); return; } } static void ctl_copy_sense_data(union ctl_ha_msg *src, union ctl_io *dest) { memcpy(&dest->scsiio.sense_data, &src->scsi.sense_data, src->scsi.sense_len); dest->scsiio.scsi_status = src->scsi.scsi_status; dest->scsiio.sense_len = src->scsi.sense_len; dest->io_hdr.status = src->hdr.status; } static void ctl_copy_sense_data_back(union ctl_io *src, union ctl_ha_msg *dest) { memcpy(&dest->scsi.sense_data, &src->scsiio.sense_data, src->scsiio.sense_len); dest->scsi.scsi_status = src->scsiio.scsi_status; dest->scsi.sense_len = src->scsiio.sense_len; dest->hdr.status = src->io_hdr.status; } void ctl_est_ua(struct ctl_lun *lun, uint32_t initidx, ctl_ua_type ua) { struct ctl_softc *softc = lun->ctl_softc; ctl_ua_type *pu; if (initidx < softc->init_min || initidx >= softc->init_max) return; mtx_assert(&lun->lun_lock, MA_OWNED); pu = lun->pending_ua[initidx / CTL_MAX_INIT_PER_PORT]; if (pu == NULL) return; pu[initidx % CTL_MAX_INIT_PER_PORT] |= ua; } void ctl_est_ua_port(struct ctl_lun *lun, int port, uint32_t except, ctl_ua_type ua) { int i; mtx_assert(&lun->lun_lock, MA_OWNED); if (lun->pending_ua[port] == NULL) return; for (i = 0; i < CTL_MAX_INIT_PER_PORT; i++) { if (port * CTL_MAX_INIT_PER_PORT + i == except) continue; lun->pending_ua[port][i] |= ua; } } void ctl_est_ua_all(struct ctl_lun *lun, uint32_t except, ctl_ua_type ua) { struct ctl_softc *softc = lun->ctl_softc; int i; mtx_assert(&lun->lun_lock, MA_OWNED); for (i = softc->port_min; i < softc->port_max; i++) ctl_est_ua_port(lun, i, except, ua); } void ctl_clr_ua(struct ctl_lun *lun, uint32_t initidx, ctl_ua_type ua) { struct ctl_softc *softc = lun->ctl_softc; ctl_ua_type *pu; if (initidx < softc->init_min || initidx >= softc->init_max) return; mtx_assert(&lun->lun_lock, MA_OWNED); pu = lun->pending_ua[initidx / CTL_MAX_INIT_PER_PORT]; if (pu == NULL) return; pu[initidx % CTL_MAX_INIT_PER_PORT] &= ~ua; } void ctl_clr_ua_all(struct ctl_lun *lun, uint32_t except, ctl_ua_type ua) { struct ctl_softc *softc = lun->ctl_softc; int i, j; mtx_assert(&lun->lun_lock, MA_OWNED); for (i = softc->port_min; i < softc->port_max; i++) { if (lun->pending_ua[i] == NULL) continue; for (j = 0; j < CTL_MAX_INIT_PER_PORT; j++) { if (i * CTL_MAX_INIT_PER_PORT + j == except) continue; lun->pending_ua[i][j] &= ~ua; } } } void ctl_clr_ua_allluns(struct ctl_softc *ctl_softc, uint32_t initidx, ctl_ua_type ua_type) { struct ctl_lun *lun; mtx_assert(&ctl_softc->ctl_lock, MA_OWNED); STAILQ_FOREACH(lun, &ctl_softc->lun_list, links) { mtx_lock(&lun->lun_lock); ctl_clr_ua(lun, initidx, ua_type); mtx_unlock(&lun->lun_lock); } } static int ctl_ha_role_sysctl(SYSCTL_HANDLER_ARGS) { struct ctl_softc *softc = (struct ctl_softc *)arg1; struct ctl_lun *lun; struct ctl_lun_req ireq; int error, value; value = (softc->flags & CTL_FLAG_ACTIVE_SHELF) ? 0 : 1; error = sysctl_handle_int(oidp, &value, 0, req); if ((error != 0) || (req->newptr == NULL)) return (error); mtx_lock(&softc->ctl_lock); if (value == 0) softc->flags |= CTL_FLAG_ACTIVE_SHELF; else softc->flags &= ~CTL_FLAG_ACTIVE_SHELF; STAILQ_FOREACH(lun, &softc->lun_list, links) { mtx_unlock(&softc->ctl_lock); bzero(&ireq, sizeof(ireq)); ireq.reqtype = CTL_LUNREQ_MODIFY; ireq.reqdata.modify.lun_id = lun->lun; lun->backend->ioctl(NULL, CTL_LUN_REQ, (caddr_t)&ireq, 0, curthread); if (ireq.status != CTL_LUN_OK) { printf("%s: CTL_LUNREQ_MODIFY returned %d '%s'\n", __func__, ireq.status, ireq.error_str); } mtx_lock(&softc->ctl_lock); } mtx_unlock(&softc->ctl_lock); return (0); } static int ctl_init(void) { struct make_dev_args args; struct ctl_softc *softc; int i, error; softc = control_softc = malloc(sizeof(*control_softc), M_DEVBUF, M_WAITOK | M_ZERO); make_dev_args_init(&args); args.mda_devsw = &ctl_cdevsw; args.mda_uid = UID_ROOT; args.mda_gid = GID_OPERATOR; args.mda_mode = 0600; args.mda_si_drv1 = softc; args.mda_si_drv2 = NULL; error = make_dev_s(&args, &softc->dev, "cam/ctl"); if (error != 0) { free(softc, M_DEVBUF); control_softc = NULL; return (error); } sysctl_ctx_init(&softc->sysctl_ctx); softc->sysctl_tree = SYSCTL_ADD_NODE(&softc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_kern_cam), OID_AUTO, "ctl", CTLFLAG_RD, 0, "CAM Target Layer"); if (softc->sysctl_tree == NULL) { printf("%s: unable to allocate sysctl tree\n", __func__); destroy_dev(softc->dev); free(softc, M_DEVBUF); control_softc = NULL; return (ENOMEM); } mtx_init(&softc->ctl_lock, "CTL mutex", NULL, MTX_DEF); softc->io_zone = uma_zcreate("CTL IO", sizeof(union ctl_io), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); softc->flags = 0; SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "ha_mode", CTLFLAG_RDTUN, (int *)&softc->ha_mode, 0, "HA mode (0 - act/stby, 1 - serialize only, 2 - xfer)"); if (ctl_max_luns <= 0 || powerof2(ctl_max_luns) == 0) { printf("Bad value %d for kern.cam.ctl.max_luns, must be a power of two, using %d\n", ctl_max_luns, CTL_DEFAULT_MAX_LUNS); ctl_max_luns = CTL_DEFAULT_MAX_LUNS; } softc->ctl_luns = malloc(sizeof(struct ctl_lun *) * ctl_max_luns, M_DEVBUF, M_WAITOK | M_ZERO); softc->ctl_lun_mask = malloc(sizeof(uint32_t) * ((ctl_max_luns + 31) / 32), M_DEVBUF, M_WAITOK | M_ZERO); if (ctl_max_ports <= 0 || powerof2(ctl_max_ports) == 0) { printf("Bad value %d for kern.cam.ctl.max_ports, must be a power of two, using %d\n", ctl_max_ports, CTL_DEFAULT_MAX_PORTS); ctl_max_ports = CTL_DEFAULT_MAX_PORTS; } softc->ctl_port_mask = malloc(sizeof(uint32_t) * ((ctl_max_ports + 31) / 32), M_DEVBUF, M_WAITOK | M_ZERO); softc->ctl_ports = malloc(sizeof(struct ctl_port *) * ctl_max_ports, M_DEVBUF, M_WAITOK | M_ZERO); /* * In Copan's HA scheme, the "master" and "slave" roles are * figured out through the slot the controller is in. Although it * is an active/active system, someone has to be in charge. */ SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "ha_id", CTLFLAG_RDTUN, &softc->ha_id, 0, "HA head ID (0 - no HA)"); if (softc->ha_id == 0 || softc->ha_id > NUM_HA_SHELVES) { softc->flags |= CTL_FLAG_ACTIVE_SHELF; softc->is_single = 1; softc->port_cnt = ctl_max_ports; softc->port_min = 0; } else { softc->port_cnt = ctl_max_ports / NUM_HA_SHELVES; softc->port_min = (softc->ha_id - 1) * softc->port_cnt; } softc->port_max = softc->port_min + softc->port_cnt; softc->init_min = softc->port_min * CTL_MAX_INIT_PER_PORT; softc->init_max = softc->port_max * CTL_MAX_INIT_PER_PORT; SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "ha_link", CTLFLAG_RD, (int *)&softc->ha_link, 0, "HA link state (0 - offline, 1 - unknown, 2 - online)"); STAILQ_INIT(&softc->lun_list); STAILQ_INIT(&softc->pending_lun_queue); STAILQ_INIT(&softc->fe_list); STAILQ_INIT(&softc->port_list); STAILQ_INIT(&softc->be_list); ctl_tpc_init(softc); if (worker_threads <= 0) worker_threads = max(1, mp_ncpus / 4); if (worker_threads > CTL_MAX_THREADS) worker_threads = CTL_MAX_THREADS; for (i = 0; i < worker_threads; i++) { struct ctl_thread *thr = &softc->threads[i]; mtx_init(&thr->queue_lock, "CTL queue mutex", NULL, MTX_DEF); thr->ctl_softc = softc; STAILQ_INIT(&thr->incoming_queue); STAILQ_INIT(&thr->rtr_queue); STAILQ_INIT(&thr->done_queue); STAILQ_INIT(&thr->isc_queue); error = kproc_kthread_add(ctl_work_thread, thr, &softc->ctl_proc, &thr->thread, 0, 0, "ctl", "work%d", i); if (error != 0) { printf("error creating CTL work thread!\n"); return (error); } } error = kproc_kthread_add(ctl_lun_thread, softc, &softc->ctl_proc, &softc->lun_thread, 0, 0, "ctl", "lun"); if (error != 0) { printf("error creating CTL lun thread!\n"); return (error); } error = kproc_kthread_add(ctl_thresh_thread, softc, &softc->ctl_proc, &softc->thresh_thread, 0, 0, "ctl", "thresh"); if (error != 0) { printf("error creating CTL threshold thread!\n"); return (error); } SYSCTL_ADD_PROC(&softc->sysctl_ctx,SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "ha_role", CTLTYPE_INT | CTLFLAG_RWTUN, softc, 0, ctl_ha_role_sysctl, "I", "HA role for this head"); if (softc->is_single == 0) { if (ctl_frontend_register(&ha_frontend) != 0) softc->is_single = 1; } return (0); } static int ctl_shutdown(void) { struct ctl_softc *softc = control_softc; int i; if (softc->is_single == 0) ctl_frontend_deregister(&ha_frontend); destroy_dev(softc->dev); /* Shutdown CTL threads. */ softc->shutdown = 1; for (i = 0; i < worker_threads; i++) { struct ctl_thread *thr = &softc->threads[i]; while (thr->thread != NULL) { wakeup(thr); if (thr->thread != NULL) pause("CTL thr shutdown", 1); } mtx_destroy(&thr->queue_lock); } while (softc->lun_thread != NULL) { wakeup(&softc->pending_lun_queue); if (softc->lun_thread != NULL) pause("CTL thr shutdown", 1); } while (softc->thresh_thread != NULL) { wakeup(softc->thresh_thread); if (softc->thresh_thread != NULL) pause("CTL thr shutdown", 1); } ctl_tpc_shutdown(softc); uma_zdestroy(softc->io_zone); mtx_destroy(&softc->ctl_lock); free(softc->ctl_luns, M_DEVBUF); free(softc->ctl_lun_mask, M_DEVBUF); free(softc->ctl_port_mask, M_DEVBUF); free(softc->ctl_ports, M_DEVBUF); sysctl_ctx_free(&softc->sysctl_ctx); free(softc, M_DEVBUF); control_softc = NULL; return (0); } static int ctl_module_event_handler(module_t mod, int what, void *arg) { switch (what) { case MOD_LOAD: return (ctl_init()); case MOD_UNLOAD: return (ctl_shutdown()); default: return (EOPNOTSUPP); } } /* * XXX KDM should we do some access checks here? Bump a reference count to * prevent a CTL module from being unloaded while someone has it open? */ static int ctl_open(struct cdev *dev, int flags, int fmt, struct thread *td) { return (0); } static int ctl_close(struct cdev *dev, int flags, int fmt, struct thread *td) { return (0); } /* * Remove an initiator by port number and initiator ID. * Returns 0 for success, -1 for failure. */ int ctl_remove_initiator(struct ctl_port *port, int iid) { struct ctl_softc *softc = port->ctl_softc; int last; mtx_assert(&softc->ctl_lock, MA_NOTOWNED); if (iid > CTL_MAX_INIT_PER_PORT) { printf("%s: initiator ID %u > maximun %u!\n", __func__, iid, CTL_MAX_INIT_PER_PORT); return (-1); } mtx_lock(&softc->ctl_lock); last = (--port->wwpn_iid[iid].in_use == 0); port->wwpn_iid[iid].last_use = time_uptime; mtx_unlock(&softc->ctl_lock); if (last) ctl_i_t_nexus_loss(softc, iid, CTL_UA_POWERON); ctl_isc_announce_iid(port, iid); return (0); } /* * Add an initiator to the initiator map. * Returns iid for success, < 0 for failure. */ int ctl_add_initiator(struct ctl_port *port, int iid, uint64_t wwpn, char *name) { struct ctl_softc *softc = port->ctl_softc; time_t best_time; int i, best; mtx_assert(&softc->ctl_lock, MA_NOTOWNED); if (iid >= CTL_MAX_INIT_PER_PORT) { printf("%s: WWPN %#jx initiator ID %u > maximum %u!\n", __func__, wwpn, iid, CTL_MAX_INIT_PER_PORT); free(name, M_CTL); return (-1); } mtx_lock(&softc->ctl_lock); if (iid < 0 && (wwpn != 0 || name != NULL)) { for (i = 0; i < CTL_MAX_INIT_PER_PORT; i++) { if (wwpn != 0 && wwpn == port->wwpn_iid[i].wwpn) { iid = i; break; } if (name != NULL && port->wwpn_iid[i].name != NULL && strcmp(name, port->wwpn_iid[i].name) == 0) { iid = i; break; } } } if (iid < 0) { for (i = 0; i < CTL_MAX_INIT_PER_PORT; i++) { if (port->wwpn_iid[i].in_use == 0 && port->wwpn_iid[i].wwpn == 0 && port->wwpn_iid[i].name == NULL) { iid = i; break; } } } if (iid < 0) { best = -1; best_time = INT32_MAX; for (i = 0; i < CTL_MAX_INIT_PER_PORT; i++) { if (port->wwpn_iid[i].in_use == 0) { if (port->wwpn_iid[i].last_use < best_time) { best = i; best_time = port->wwpn_iid[i].last_use; } } } iid = best; } if (iid < 0) { mtx_unlock(&softc->ctl_lock); free(name, M_CTL); return (-2); } if (port->wwpn_iid[iid].in_use > 0 && (wwpn != 0 || name != NULL)) { /* * This is not an error yet. */ if (wwpn != 0 && wwpn == port->wwpn_iid[iid].wwpn) { #if 0 printf("%s: port %d iid %u WWPN %#jx arrived" " again\n", __func__, port->targ_port, iid, (uintmax_t)wwpn); #endif goto take; } if (name != NULL && port->wwpn_iid[iid].name != NULL && strcmp(name, port->wwpn_iid[iid].name) == 0) { #if 0 printf("%s: port %d iid %u name '%s' arrived" " again\n", __func__, port->targ_port, iid, name); #endif goto take; } /* * This is an error, but what do we do about it? The * driver is telling us we have a new WWPN for this * initiator ID, so we pretty much need to use it. */ printf("%s: port %d iid %u WWPN %#jx '%s' arrived," " but WWPN %#jx '%s' is still at that address\n", __func__, port->targ_port, iid, wwpn, name, (uintmax_t)port->wwpn_iid[iid].wwpn, port->wwpn_iid[iid].name); } take: free(port->wwpn_iid[iid].name, M_CTL); port->wwpn_iid[iid].name = name; port->wwpn_iid[iid].wwpn = wwpn; port->wwpn_iid[iid].in_use++; mtx_unlock(&softc->ctl_lock); ctl_isc_announce_iid(port, iid); return (iid); } static int ctl_create_iid(struct ctl_port *port, int iid, uint8_t *buf) { int len; switch (port->port_type) { case CTL_PORT_FC: { struct scsi_transportid_fcp *id = (struct scsi_transportid_fcp *)buf; if (port->wwpn_iid[iid].wwpn == 0) return (0); memset(id, 0, sizeof(*id)); id->format_protocol = SCSI_PROTO_FC; scsi_u64to8b(port->wwpn_iid[iid].wwpn, id->n_port_name); return (sizeof(*id)); } case CTL_PORT_ISCSI: { struct scsi_transportid_iscsi_port *id = (struct scsi_transportid_iscsi_port *)buf; if (port->wwpn_iid[iid].name == NULL) return (0); memset(id, 0, 256); id->format_protocol = SCSI_TRN_ISCSI_FORMAT_PORT | SCSI_PROTO_ISCSI; len = strlcpy(id->iscsi_name, port->wwpn_iid[iid].name, 252) + 1; len = roundup2(min(len, 252), 4); scsi_ulto2b(len, id->additional_length); return (sizeof(*id) + len); } case CTL_PORT_SAS: { struct scsi_transportid_sas *id = (struct scsi_transportid_sas *)buf; if (port->wwpn_iid[iid].wwpn == 0) return (0); memset(id, 0, sizeof(*id)); id->format_protocol = SCSI_PROTO_SAS; scsi_u64to8b(port->wwpn_iid[iid].wwpn, id->sas_address); return (sizeof(*id)); } default: { struct scsi_transportid_spi *id = (struct scsi_transportid_spi *)buf; memset(id, 0, sizeof(*id)); id->format_protocol = SCSI_PROTO_SPI; scsi_ulto2b(iid, id->scsi_addr); scsi_ulto2b(port->targ_port, id->rel_trgt_port_id); return (sizeof(*id)); } } } /* * Serialize a command that went down the "wrong" side, and so was sent to * this controller for execution. The logic is a little different than the * standard case in ctl_scsiio_precheck(). Errors in this case need to get * sent back to the other side, but in the success case, we execute the * command on this side (XFER mode) or tell the other side to execute it * (SER_ONLY mode). */ static void ctl_serialize_other_sc_cmd(struct ctl_scsiio *ctsio) { struct ctl_softc *softc = CTL_SOFTC(ctsio); struct ctl_port *port = CTL_PORT(ctsio); union ctl_ha_msg msg_info; struct ctl_lun *lun; const struct ctl_cmd_entry *entry; union ctl_io *bio; uint32_t targ_lun; targ_lun = ctsio->io_hdr.nexus.targ_mapped_lun; /* Make sure that we know about this port. */ if (port == NULL || (port->status & CTL_PORT_STATUS_ONLINE) == 0) { ctl_set_internal_failure(ctsio, /*sks_valid*/ 0, /*retry_count*/ 1); goto badjuju; } /* Make sure that we know about this LUN. */ mtx_lock(&softc->ctl_lock); if (targ_lun >= ctl_max_luns || (lun = softc->ctl_luns[targ_lun]) == NULL) { mtx_unlock(&softc->ctl_lock); /* * The other node would not send this request to us unless * received announce that we are primary node for this LUN. * If this LUN does not exist now, it is probably result of * a race, so respond to initiator in the most opaque way. */ ctl_set_busy(ctsio); goto badjuju; } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); /* * If the LUN is invalid, pretend that it doesn't exist. * It will go away as soon as all pending I/Os completed. */ if (lun->flags & CTL_LUN_DISABLED) { mtx_unlock(&lun->lun_lock); ctl_set_busy(ctsio); goto badjuju; } entry = ctl_get_cmd_entry(ctsio, NULL); if (ctl_scsiio_lun_check(lun, entry, ctsio) != 0) { mtx_unlock(&lun->lun_lock); goto badjuju; } CTL_LUN(ctsio) = lun; CTL_BACKEND_LUN(ctsio) = lun->be_lun; /* * Every I/O goes into the OOA queue for a * particular LUN, and stays there until completion. */ #ifdef CTL_TIME_IO if (TAILQ_EMPTY(&lun->ooa_queue)) lun->idle_time += getsbinuptime() - lun->last_busy; #endif TAILQ_INSERT_TAIL(&lun->ooa_queue, &ctsio->io_hdr, ooa_links); bio = (union ctl_io *)TAILQ_PREV(&ctsio->io_hdr, ctl_ooaq, ooa_links); switch (ctl_check_ooa(lun, (union ctl_io *)ctsio, &bio)) { case CTL_ACTION_BLOCK: ctsio->io_hdr.blocker = bio; TAILQ_INSERT_TAIL(&bio->io_hdr.blocked_queue, &ctsio->io_hdr, blocked_links); mtx_unlock(&lun->lun_lock); break; case CTL_ACTION_PASS: case CTL_ACTION_SKIP: if (softc->ha_mode == CTL_HA_MODE_XFER) { ctsio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; ctl_enqueue_rtr((union ctl_io *)ctsio); mtx_unlock(&lun->lun_lock); } else { ctsio->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE; mtx_unlock(&lun->lun_lock); /* send msg back to other side */ msg_info.hdr.original_sc = ctsio->io_hdr.remote_io; msg_info.hdr.serializing_sc = (union ctl_io *)ctsio; msg_info.hdr.msg_type = CTL_MSG_R2R; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.hdr), M_WAITOK); } break; case CTL_ACTION_OVERLAP: TAILQ_REMOVE(&lun->ooa_queue, &ctsio->io_hdr, ooa_links); mtx_unlock(&lun->lun_lock); ctl_set_overlapped_cmd(ctsio); goto badjuju; case CTL_ACTION_OVERLAP_TAG: TAILQ_REMOVE(&lun->ooa_queue, &ctsio->io_hdr, ooa_links); mtx_unlock(&lun->lun_lock); ctl_set_overlapped_tag(ctsio, ctsio->tag_num); goto badjuju; case CTL_ACTION_ERROR: default: TAILQ_REMOVE(&lun->ooa_queue, &ctsio->io_hdr, ooa_links); mtx_unlock(&lun->lun_lock); ctl_set_internal_failure(ctsio, /*sks_valid*/ 0, /*retry_count*/ 0); badjuju: ctl_copy_sense_data_back((union ctl_io *)ctsio, &msg_info); msg_info.hdr.original_sc = ctsio->io_hdr.remote_io; msg_info.hdr.serializing_sc = NULL; msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.scsi), M_WAITOK); ctl_free_io((union ctl_io *)ctsio); break; } } /* * Returns 0 for success, errno for failure. */ static void ctl_ioctl_fill_ooa(struct ctl_lun *lun, uint32_t *cur_fill_num, struct ctl_ooa *ooa_hdr, struct ctl_ooa_entry *kern_entries) { union ctl_io *io; mtx_lock(&lun->lun_lock); for (io = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); (io != NULL); (*cur_fill_num)++, io = (union ctl_io *)TAILQ_NEXT(&io->io_hdr, ooa_links)) { struct ctl_ooa_entry *entry; /* * If we've got more than we can fit, just count the * remaining entries. */ if (*cur_fill_num >= ooa_hdr->alloc_num) continue; entry = &kern_entries[*cur_fill_num]; entry->tag_num = io->scsiio.tag_num; entry->lun_num = lun->lun; #ifdef CTL_TIME_IO entry->start_bt = io->io_hdr.start_bt; #endif bcopy(io->scsiio.cdb, entry->cdb, io->scsiio.cdb_len); entry->cdb_len = io->scsiio.cdb_len; if (io->io_hdr.blocker != NULL) entry->cmd_flags |= CTL_OOACMD_FLAG_BLOCKED; if (io->io_hdr.flags & CTL_FLAG_DMA_INPROG) entry->cmd_flags |= CTL_OOACMD_FLAG_DMA; if (io->io_hdr.flags & CTL_FLAG_ABORT) entry->cmd_flags |= CTL_OOACMD_FLAG_ABORT; if (io->io_hdr.flags & CTL_FLAG_IS_WAS_ON_RTR) entry->cmd_flags |= CTL_OOACMD_FLAG_RTR; if (io->io_hdr.flags & CTL_FLAG_DMA_QUEUED) entry->cmd_flags |= CTL_OOACMD_FLAG_DMA_QUEUED; } mtx_unlock(&lun->lun_lock); } /* * Escape characters that are illegal or not recommended in XML. */ int ctl_sbuf_printf_esc(struct sbuf *sb, char *str, int size) { char *end = str + size; int retval; retval = 0; for (; *str && str < end; str++) { switch (*str) { case '&': retval = sbuf_printf(sb, "&"); break; case '>': retval = sbuf_printf(sb, ">"); break; case '<': retval = sbuf_printf(sb, "<"); break; default: retval = sbuf_putc(sb, *str); break; } if (retval != 0) break; } return (retval); } static void ctl_id_sbuf(struct ctl_devid *id, struct sbuf *sb) { struct scsi_vpd_id_descriptor *desc; int i; if (id == NULL || id->len < 4) return; desc = (struct scsi_vpd_id_descriptor *)id->data; switch (desc->id_type & SVPD_ID_TYPE_MASK) { case SVPD_ID_TYPE_T10: sbuf_printf(sb, "t10."); break; case SVPD_ID_TYPE_EUI64: sbuf_printf(sb, "eui."); break; case SVPD_ID_TYPE_NAA: sbuf_printf(sb, "naa."); break; case SVPD_ID_TYPE_SCSI_NAME: break; } switch (desc->proto_codeset & SVPD_ID_CODESET_MASK) { case SVPD_ID_CODESET_BINARY: for (i = 0; i < desc->length; i++) sbuf_printf(sb, "%02x", desc->identifier[i]); break; case SVPD_ID_CODESET_ASCII: sbuf_printf(sb, "%.*s", (int)desc->length, (char *)desc->identifier); break; case SVPD_ID_CODESET_UTF8: sbuf_printf(sb, "%s", (char *)desc->identifier); break; } } static int ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { struct ctl_softc *softc = dev->si_drv1; struct ctl_port *port; struct ctl_lun *lun; int retval; retval = 0; switch (cmd) { case CTL_IO: retval = ctl_ioctl_io(dev, cmd, addr, flag, td); break; case CTL_ENABLE_PORT: case CTL_DISABLE_PORT: case CTL_SET_PORT_WWNS: { struct ctl_port *port; struct ctl_port_entry *entry; entry = (struct ctl_port_entry *)addr; mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(port, &softc->port_list, links) { int action, done; if (port->targ_port < softc->port_min || port->targ_port >= softc->port_max) continue; action = 0; done = 0; if ((entry->port_type == CTL_PORT_NONE) && (entry->targ_port == port->targ_port)) { /* * If the user only wants to enable or * disable or set WWNs on a specific port, * do the operation and we're done. */ action = 1; done = 1; } else if (entry->port_type & port->port_type) { /* * Compare the user's type mask with the * particular frontend type to see if we * have a match. */ action = 1; done = 0; /* * Make sure the user isn't trying to set * WWNs on multiple ports at the same time. */ if (cmd == CTL_SET_PORT_WWNS) { printf("%s: Can't set WWNs on " "multiple ports\n", __func__); retval = EINVAL; break; } } if (action == 0) continue; /* * XXX KDM we have to drop the lock here, because * the online/offline operations can potentially * block. We need to reference count the frontends * so they can't go away, */ if (cmd == CTL_ENABLE_PORT) { mtx_unlock(&softc->ctl_lock); ctl_port_online(port); mtx_lock(&softc->ctl_lock); } else if (cmd == CTL_DISABLE_PORT) { mtx_unlock(&softc->ctl_lock); ctl_port_offline(port); mtx_lock(&softc->ctl_lock); } else if (cmd == CTL_SET_PORT_WWNS) { ctl_port_set_wwns(port, (entry->flags & CTL_PORT_WWNN_VALID) ? 1 : 0, entry->wwnn, (entry->flags & CTL_PORT_WWPN_VALID) ? 1 : 0, entry->wwpn); } if (done != 0) break; } mtx_unlock(&softc->ctl_lock); break; } case CTL_GET_OOA: { struct ctl_ooa *ooa_hdr; struct ctl_ooa_entry *entries; uint32_t cur_fill_num; ooa_hdr = (struct ctl_ooa *)addr; if ((ooa_hdr->alloc_len == 0) || (ooa_hdr->alloc_num == 0)) { printf("%s: CTL_GET_OOA: alloc len %u and alloc num %u " "must be non-zero\n", __func__, ooa_hdr->alloc_len, ooa_hdr->alloc_num); retval = EINVAL; break; } if (ooa_hdr->alloc_len != (ooa_hdr->alloc_num * sizeof(struct ctl_ooa_entry))) { printf("%s: CTL_GET_OOA: alloc len %u must be alloc " "num %d * sizeof(struct ctl_ooa_entry) %zd\n", __func__, ooa_hdr->alloc_len, ooa_hdr->alloc_num,sizeof(struct ctl_ooa_entry)); retval = EINVAL; break; } entries = malloc(ooa_hdr->alloc_len, M_CTL, M_WAITOK | M_ZERO); if (entries == NULL) { printf("%s: could not allocate %d bytes for OOA " "dump\n", __func__, ooa_hdr->alloc_len); retval = ENOMEM; break; } mtx_lock(&softc->ctl_lock); if ((ooa_hdr->flags & CTL_OOA_FLAG_ALL_LUNS) == 0 && (ooa_hdr->lun_num >= ctl_max_luns || softc->ctl_luns[ooa_hdr->lun_num] == NULL)) { mtx_unlock(&softc->ctl_lock); free(entries, M_CTL); printf("%s: CTL_GET_OOA: invalid LUN %ju\n", __func__, (uintmax_t)ooa_hdr->lun_num); retval = EINVAL; break; } cur_fill_num = 0; if (ooa_hdr->flags & CTL_OOA_FLAG_ALL_LUNS) { STAILQ_FOREACH(lun, &softc->lun_list, links) { ctl_ioctl_fill_ooa(lun, &cur_fill_num, ooa_hdr, entries); } } else { lun = softc->ctl_luns[ooa_hdr->lun_num]; ctl_ioctl_fill_ooa(lun, &cur_fill_num, ooa_hdr, entries); } mtx_unlock(&softc->ctl_lock); ooa_hdr->fill_num = min(cur_fill_num, ooa_hdr->alloc_num); ooa_hdr->fill_len = ooa_hdr->fill_num * sizeof(struct ctl_ooa_entry); retval = copyout(entries, ooa_hdr->entries, ooa_hdr->fill_len); if (retval != 0) { printf("%s: error copying out %d bytes for OOA dump\n", __func__, ooa_hdr->fill_len); } getbinuptime(&ooa_hdr->cur_bt); if (cur_fill_num > ooa_hdr->alloc_num) { ooa_hdr->dropped_num = cur_fill_num -ooa_hdr->alloc_num; ooa_hdr->status = CTL_OOA_NEED_MORE_SPACE; } else { ooa_hdr->dropped_num = 0; ooa_hdr->status = CTL_OOA_OK; } free(entries, M_CTL); break; } case CTL_DELAY_IO: { struct ctl_io_delay_info *delay_info; delay_info = (struct ctl_io_delay_info *)addr; #ifdef CTL_IO_DELAY mtx_lock(&softc->ctl_lock); if (delay_info->lun_id >= ctl_max_luns || (lun = softc->ctl_luns[delay_info->lun_id]) == NULL) { mtx_unlock(&softc->ctl_lock); delay_info->status = CTL_DELAY_STATUS_INVALID_LUN; break; } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); delay_info->status = CTL_DELAY_STATUS_OK; switch (delay_info->delay_type) { case CTL_DELAY_TYPE_CONT: case CTL_DELAY_TYPE_ONESHOT: break; default: delay_info->status = CTL_DELAY_STATUS_INVALID_TYPE; break; } switch (delay_info->delay_loc) { case CTL_DELAY_LOC_DATAMOVE: lun->delay_info.datamove_type = delay_info->delay_type; lun->delay_info.datamove_delay = delay_info->delay_secs; break; case CTL_DELAY_LOC_DONE: lun->delay_info.done_type = delay_info->delay_type; lun->delay_info.done_delay = delay_info->delay_secs; break; default: delay_info->status = CTL_DELAY_STATUS_INVALID_LOC; break; } mtx_unlock(&lun->lun_lock); #else delay_info->status = CTL_DELAY_STATUS_NOT_IMPLEMENTED; #endif /* CTL_IO_DELAY */ break; } case CTL_ERROR_INJECT: { struct ctl_error_desc *err_desc, *new_err_desc; err_desc = (struct ctl_error_desc *)addr; new_err_desc = malloc(sizeof(*new_err_desc), M_CTL, M_WAITOK | M_ZERO); bcopy(err_desc, new_err_desc, sizeof(*new_err_desc)); mtx_lock(&softc->ctl_lock); if (err_desc->lun_id >= ctl_max_luns || (lun = softc->ctl_luns[err_desc->lun_id]) == NULL) { mtx_unlock(&softc->ctl_lock); free(new_err_desc, M_CTL); printf("%s: CTL_ERROR_INJECT: invalid LUN %ju\n", __func__, (uintmax_t)err_desc->lun_id); retval = EINVAL; break; } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); /* * We could do some checking here to verify the validity * of the request, but given the complexity of error * injection requests, the checking logic would be fairly * complex. * * For now, if the request is invalid, it just won't get * executed and might get deleted. */ STAILQ_INSERT_TAIL(&lun->error_list, new_err_desc, links); /* * XXX KDM check to make sure the serial number is unique, * in case we somehow manage to wrap. That shouldn't * happen for a very long time, but it's the right thing to * do. */ new_err_desc->serial = lun->error_serial; err_desc->serial = lun->error_serial; lun->error_serial++; mtx_unlock(&lun->lun_lock); break; } case CTL_ERROR_INJECT_DELETE: { struct ctl_error_desc *delete_desc, *desc, *desc2; int delete_done; delete_desc = (struct ctl_error_desc *)addr; delete_done = 0; mtx_lock(&softc->ctl_lock); if (delete_desc->lun_id >= ctl_max_luns || (lun = softc->ctl_luns[delete_desc->lun_id]) == NULL) { mtx_unlock(&softc->ctl_lock); printf("%s: CTL_ERROR_INJECT_DELETE: invalid LUN %ju\n", __func__, (uintmax_t)delete_desc->lun_id); retval = EINVAL; break; } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); STAILQ_FOREACH_SAFE(desc, &lun->error_list, links, desc2) { if (desc->serial != delete_desc->serial) continue; STAILQ_REMOVE(&lun->error_list, desc, ctl_error_desc, links); free(desc, M_CTL); delete_done = 1; } mtx_unlock(&lun->lun_lock); if (delete_done == 0) { printf("%s: CTL_ERROR_INJECT_DELETE: can't find " "error serial %ju on LUN %u\n", __func__, delete_desc->serial, delete_desc->lun_id); retval = EINVAL; break; } break; } case CTL_DUMP_STRUCTS: { int j, k; struct ctl_port *port; struct ctl_frontend *fe; mtx_lock(&softc->ctl_lock); printf("CTL Persistent Reservation information start:\n"); STAILQ_FOREACH(lun, &softc->lun_list, links) { mtx_lock(&lun->lun_lock); if ((lun->flags & CTL_LUN_DISABLED) != 0) { mtx_unlock(&lun->lun_lock); continue; } for (j = 0; j < ctl_max_ports; j++) { if (lun->pr_keys[j] == NULL) continue; for (k = 0; k < CTL_MAX_INIT_PER_PORT; k++){ if (lun->pr_keys[j][k] == 0) continue; printf(" LUN %ju port %d iid %d key " "%#jx\n", lun->lun, j, k, (uintmax_t)lun->pr_keys[j][k]); } } mtx_unlock(&lun->lun_lock); } printf("CTL Persistent Reservation information end\n"); printf("CTL Ports:\n"); STAILQ_FOREACH(port, &softc->port_list, links) { printf(" Port %d '%s' Frontend '%s' Type %u pp %d vp %d WWNN " "%#jx WWPN %#jx\n", port->targ_port, port->port_name, port->frontend->name, port->port_type, port->physical_port, port->virtual_port, (uintmax_t)port->wwnn, (uintmax_t)port->wwpn); for (j = 0; j < CTL_MAX_INIT_PER_PORT; j++) { if (port->wwpn_iid[j].in_use == 0 && port->wwpn_iid[j].wwpn == 0 && port->wwpn_iid[j].name == NULL) continue; printf(" iid %u use %d WWPN %#jx '%s'\n", j, port->wwpn_iid[j].in_use, (uintmax_t)port->wwpn_iid[j].wwpn, port->wwpn_iid[j].name); } } printf("CTL Port information end\n"); mtx_unlock(&softc->ctl_lock); /* * XXX KDM calling this without a lock. We'd likely want * to drop the lock before calling the frontend's dump * routine anyway. */ printf("CTL Frontends:\n"); STAILQ_FOREACH(fe, &softc->fe_list, links) { printf(" Frontend '%s'\n", fe->name); if (fe->fe_dump != NULL) fe->fe_dump(); } printf("CTL Frontend information end\n"); break; } case CTL_LUN_REQ: { struct ctl_lun_req *lun_req; struct ctl_backend_driver *backend; void *packed; nvlist_t *tmp_args_nvl; size_t packed_len; lun_req = (struct ctl_lun_req *)addr; tmp_args_nvl = lun_req->args_nvl; backend = ctl_backend_find(lun_req->backend); if (backend == NULL) { lun_req->status = CTL_LUN_ERROR; snprintf(lun_req->error_str, sizeof(lun_req->error_str), "Backend \"%s\" not found.", lun_req->backend); break; } if (lun_req->args != NULL) { packed = malloc(lun_req->args_len, M_CTL, M_WAITOK); if (copyin(lun_req->args, packed, lun_req->args_len) != 0) { free(packed, M_CTL); lun_req->status = CTL_LUN_ERROR; snprintf(lun_req->error_str, sizeof(lun_req->error_str), "Cannot copyin args."); break; } lun_req->args_nvl = nvlist_unpack(packed, lun_req->args_len, 0); free(packed, M_CTL); if (lun_req->args_nvl == NULL) { lun_req->status = CTL_LUN_ERROR; snprintf(lun_req->error_str, sizeof(lun_req->error_str), "Cannot unpack args nvlist."); break; } } else lun_req->args_nvl = nvlist_create(0); retval = backend->ioctl(dev, cmd, addr, flag, td); nvlist_destroy(lun_req->args_nvl); lun_req->args_nvl = tmp_args_nvl; if (lun_req->result_nvl != NULL) { if (lun_req->result != NULL) { packed = nvlist_pack(lun_req->result_nvl, &packed_len); if (packed == NULL) { lun_req->status = CTL_LUN_ERROR; snprintf(lun_req->error_str, sizeof(lun_req->error_str), "Cannot pack result nvlist."); break; } if (packed_len > lun_req->result_len) { lun_req->status = CTL_LUN_ERROR; snprintf(lun_req->error_str, sizeof(lun_req->error_str), "Result nvlist too large."); free(packed, M_NVLIST); break; } if (copyout(packed, lun_req->result, packed_len)) { lun_req->status = CTL_LUN_ERROR; snprintf(lun_req->error_str, sizeof(lun_req->error_str), "Cannot copyout() the result."); free(packed, M_NVLIST); break; } lun_req->result_len = packed_len; free(packed, M_NVLIST); } nvlist_destroy(lun_req->result_nvl); } break; } case CTL_LUN_LIST: { struct sbuf *sb; struct ctl_lun_list *list; const char *name, *value; void *cookie; int type; list = (struct ctl_lun_list *)addr; /* * Allocate a fixed length sbuf here, based on the length * of the user's buffer. We could allocate an auto-extending * buffer, and then tell the user how much larger our * amount of data is than his buffer, but that presents * some problems: * * 1. The sbuf(9) routines use a blocking malloc, and so * we can't hold a lock while calling them with an * auto-extending buffer. * * 2. There is not currently a LUN reference counting * mechanism, outside of outstanding transactions on * the LUN's OOA queue. So a LUN could go away on us * while we're getting the LUN number, backend-specific * information, etc. Thus, given the way things * currently work, we need to hold the CTL lock while * grabbing LUN information. * * So, from the user's standpoint, the best thing to do is * allocate what he thinks is a reasonable buffer length, * and then if he gets a CTL_LUN_LIST_NEED_MORE_SPACE error, * double the buffer length and try again. (And repeat * that until he succeeds.) */ sb = sbuf_new(NULL, NULL, list->alloc_len, SBUF_FIXEDLEN); if (sb == NULL) { list->status = CTL_LUN_LIST_ERROR; snprintf(list->error_str, sizeof(list->error_str), "Unable to allocate %d bytes for LUN list", list->alloc_len); break; } sbuf_printf(sb, "\n"); mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(lun, &softc->lun_list, links) { mtx_lock(&lun->lun_lock); retval = sbuf_printf(sb, "\n", (uintmax_t)lun->lun); /* * Bail out as soon as we see that we've overfilled * the buffer. */ if (retval != 0) break; retval = sbuf_printf(sb, "\t%s" "\n", (lun->backend == NULL) ? "none" : lun->backend->name); if (retval != 0) break; retval = sbuf_printf(sb, "\t%d\n", lun->be_lun->lun_type); if (retval != 0) break; if (lun->backend == NULL) { retval = sbuf_printf(sb, "\n"); if (retval != 0) break; continue; } retval = sbuf_printf(sb, "\t%ju\n", (lun->be_lun->maxlba > 0) ? lun->be_lun->maxlba + 1 : 0); if (retval != 0) break; retval = sbuf_printf(sb, "\t%u\n", lun->be_lun->blocksize); if (retval != 0) break; retval = sbuf_printf(sb, "\t"); if (retval != 0) break; retval = ctl_sbuf_printf_esc(sb, lun->be_lun->serial_num, sizeof(lun->be_lun->serial_num)); if (retval != 0) break; retval = sbuf_printf(sb, "\n"); if (retval != 0) break; retval = sbuf_printf(sb, "\t"); if (retval != 0) break; retval = ctl_sbuf_printf_esc(sb, lun->be_lun->device_id, sizeof(lun->be_lun->device_id)); if (retval != 0) break; retval = sbuf_printf(sb, "\n"); if (retval != 0) break; if (lun->backend->lun_info != NULL) { retval = lun->backend->lun_info(lun->be_lun->be_lun, sb); if (retval != 0) break; } cookie = NULL; while ((name = nvlist_next(lun->be_lun->options, &type, &cookie)) != NULL) { sbuf_printf(sb, "\t<%s>", name); if (type == NV_TYPE_STRING) { value = dnvlist_get_string( lun->be_lun->options, name, NULL); if (value != NULL) sbuf_printf(sb, "%s", value); } sbuf_printf(sb, "\n", name); } retval = sbuf_printf(sb, "\n"); if (retval != 0) break; mtx_unlock(&lun->lun_lock); } if (lun != NULL) mtx_unlock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); if ((retval != 0) || ((retval = sbuf_printf(sb, "\n")) != 0)) { retval = 0; sbuf_delete(sb); list->status = CTL_LUN_LIST_NEED_MORE_SPACE; snprintf(list->error_str, sizeof(list->error_str), "Out of space, %d bytes is too small", list->alloc_len); break; } sbuf_finish(sb); retval = copyout(sbuf_data(sb), list->lun_xml, sbuf_len(sb) + 1); list->fill_len = sbuf_len(sb) + 1; list->status = CTL_LUN_LIST_OK; sbuf_delete(sb); break; } case CTL_ISCSI: { struct ctl_iscsi *ci; struct ctl_frontend *fe; ci = (struct ctl_iscsi *)addr; fe = ctl_frontend_find("iscsi"); if (fe == NULL) { ci->status = CTL_ISCSI_ERROR; snprintf(ci->error_str, sizeof(ci->error_str), "Frontend \"iscsi\" not found."); break; } retval = fe->ioctl(dev, cmd, addr, flag, td); break; } case CTL_PORT_REQ: { struct ctl_req *req; struct ctl_frontend *fe; void *packed; nvlist_t *tmp_args_nvl; size_t packed_len; req = (struct ctl_req *)addr; tmp_args_nvl = req->args_nvl; fe = ctl_frontend_find(req->driver); if (fe == NULL) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Frontend \"%s\" not found.", req->driver); break; } if (req->args != NULL) { packed = malloc(req->args_len, M_CTL, M_WAITOK); if (copyin(req->args, packed, req->args_len) != 0) { free(packed, M_CTL); req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Cannot copyin args."); break; } req->args_nvl = nvlist_unpack(packed, req->args_len, 0); free(packed, M_CTL); if (req->args_nvl == NULL) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Cannot unpack args nvlist."); break; } } else req->args_nvl = nvlist_create(0); if (fe->ioctl) retval = fe->ioctl(dev, cmd, addr, flag, td); else retval = ENODEV; nvlist_destroy(req->args_nvl); req->args_nvl = tmp_args_nvl; if (req->result_nvl != NULL) { if (req->result != NULL) { packed = nvlist_pack(req->result_nvl, &packed_len); if (packed == NULL) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Cannot pack result nvlist."); break; } if (packed_len > req->result_len) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Result nvlist too large."); free(packed, M_NVLIST); break; } if (copyout(packed, req->result, packed_len)) { req->status = CTL_LUN_ERROR; snprintf(req->error_str, sizeof(req->error_str), "Cannot copyout() the result."); free(packed, M_NVLIST); break; } req->result_len = packed_len; free(packed, M_NVLIST); } nvlist_destroy(req->result_nvl); } break; } case CTL_PORT_LIST: { struct sbuf *sb; struct ctl_port *port; struct ctl_lun_list *list; const char *name, *value; void *cookie; int j, type; uint32_t plun; list = (struct ctl_lun_list *)addr; sb = sbuf_new(NULL, NULL, list->alloc_len, SBUF_FIXEDLEN); if (sb == NULL) { list->status = CTL_LUN_LIST_ERROR; snprintf(list->error_str, sizeof(list->error_str), "Unable to allocate %d bytes for LUN list", list->alloc_len); break; } sbuf_printf(sb, "\n"); mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(port, &softc->port_list, links) { retval = sbuf_printf(sb, "\n", (uintmax_t)port->targ_port); /* * Bail out as soon as we see that we've overfilled * the buffer. */ if (retval != 0) break; retval = sbuf_printf(sb, "\t%s" "\n", port->frontend->name); if (retval != 0) break; retval = sbuf_printf(sb, "\t%d\n", port->port_type); if (retval != 0) break; retval = sbuf_printf(sb, "\t%s\n", (port->status & CTL_PORT_STATUS_ONLINE) ? "YES" : "NO"); if (retval != 0) break; retval = sbuf_printf(sb, "\t%s\n", port->port_name); if (retval != 0) break; retval = sbuf_printf(sb, "\t%d\n", port->physical_port); if (retval != 0) break; retval = sbuf_printf(sb, "\t%d\n", port->virtual_port); if (retval != 0) break; if (port->target_devid != NULL) { sbuf_printf(sb, "\t"); ctl_id_sbuf(port->target_devid, sb); sbuf_printf(sb, "\n"); } if (port->port_devid != NULL) { sbuf_printf(sb, "\t"); ctl_id_sbuf(port->port_devid, sb); sbuf_printf(sb, "\n"); } if (port->port_info != NULL) { retval = port->port_info(port->onoff_arg, sb); if (retval != 0) break; } cookie = NULL; while ((name = nvlist_next(port->options, &type, &cookie)) != NULL) { sbuf_printf(sb, "\t<%s>", name); if (type == NV_TYPE_STRING) { value = dnvlist_get_string(port->options, name, NULL); if (value != NULL) sbuf_printf(sb, "%s", value); } sbuf_printf(sb, "\n", name); } if (port->lun_map != NULL) { sbuf_printf(sb, "\ton\n"); for (j = 0; j < port->lun_map_size; j++) { plun = ctl_lun_map_from_port(port, j); if (plun == UINT32_MAX) continue; sbuf_printf(sb, "\t%u\n", j, plun); } } for (j = 0; j < CTL_MAX_INIT_PER_PORT; j++) { if (port->wwpn_iid[j].in_use == 0 || (port->wwpn_iid[j].wwpn == 0 && port->wwpn_iid[j].name == NULL)) continue; if (port->wwpn_iid[j].name != NULL) retval = sbuf_printf(sb, "\t%s\n", j, port->wwpn_iid[j].name); else retval = sbuf_printf(sb, "\tnaa.%08jx\n", j, port->wwpn_iid[j].wwpn); if (retval != 0) break; } if (retval != 0) break; retval = sbuf_printf(sb, "\n"); if (retval != 0) break; } mtx_unlock(&softc->ctl_lock); if ((retval != 0) || ((retval = sbuf_printf(sb, "\n")) != 0)) { retval = 0; sbuf_delete(sb); list->status = CTL_LUN_LIST_NEED_MORE_SPACE; snprintf(list->error_str, sizeof(list->error_str), "Out of space, %d bytes is too small", list->alloc_len); break; } sbuf_finish(sb); retval = copyout(sbuf_data(sb), list->lun_xml, sbuf_len(sb) + 1); list->fill_len = sbuf_len(sb) + 1; list->status = CTL_LUN_LIST_OK; sbuf_delete(sb); break; } case CTL_LUN_MAP: { struct ctl_lun_map *lm = (struct ctl_lun_map *)addr; struct ctl_port *port; mtx_lock(&softc->ctl_lock); if (lm->port < softc->port_min || lm->port >= softc->port_max || (port = softc->ctl_ports[lm->port]) == NULL) { mtx_unlock(&softc->ctl_lock); return (ENXIO); } if (port->status & CTL_PORT_STATUS_ONLINE) { STAILQ_FOREACH(lun, &softc->lun_list, links) { if (ctl_lun_map_to_port(port, lun->lun) == UINT32_MAX) continue; mtx_lock(&lun->lun_lock); ctl_est_ua_port(lun, lm->port, -1, CTL_UA_LUN_CHANGE); mtx_unlock(&lun->lun_lock); } } mtx_unlock(&softc->ctl_lock); // XXX: port_enable sleeps if (lm->plun != UINT32_MAX) { if (lm->lun == UINT32_MAX) retval = ctl_lun_map_unset(port, lm->plun); else if (lm->lun < ctl_max_luns && softc->ctl_luns[lm->lun] != NULL) retval = ctl_lun_map_set(port, lm->plun, lm->lun); else return (ENXIO); } else { if (lm->lun == UINT32_MAX) retval = ctl_lun_map_deinit(port); else retval = ctl_lun_map_init(port); } if (port->status & CTL_PORT_STATUS_ONLINE) ctl_isc_announce_port(port); break; } case CTL_GET_LUN_STATS: { struct ctl_get_io_stats *stats = (struct ctl_get_io_stats *)addr; int i; /* * XXX KDM no locking here. If the LUN list changes, * things can blow up. */ i = 0; stats->status = CTL_SS_OK; stats->fill_len = 0; STAILQ_FOREACH(lun, &softc->lun_list, links) { if (lun->lun < stats->first_item) continue; if (stats->fill_len + sizeof(lun->stats) > stats->alloc_len) { stats->status = CTL_SS_NEED_MORE_SPACE; break; } retval = copyout(&lun->stats, &stats->stats[i++], sizeof(lun->stats)); if (retval != 0) break; stats->fill_len += sizeof(lun->stats); } stats->num_items = softc->num_luns; stats->flags = CTL_STATS_FLAG_NONE; #ifdef CTL_TIME_IO stats->flags |= CTL_STATS_FLAG_TIME_VALID; #endif getnanouptime(&stats->timestamp); break; } case CTL_GET_PORT_STATS: { struct ctl_get_io_stats *stats = (struct ctl_get_io_stats *)addr; int i; /* * XXX KDM no locking here. If the LUN list changes, * things can blow up. */ i = 0; stats->status = CTL_SS_OK; stats->fill_len = 0; STAILQ_FOREACH(port, &softc->port_list, links) { if (port->targ_port < stats->first_item) continue; if (stats->fill_len + sizeof(port->stats) > stats->alloc_len) { stats->status = CTL_SS_NEED_MORE_SPACE; break; } retval = copyout(&port->stats, &stats->stats[i++], sizeof(port->stats)); if (retval != 0) break; stats->fill_len += sizeof(port->stats); } stats->num_items = softc->num_ports; stats->flags = CTL_STATS_FLAG_NONE; #ifdef CTL_TIME_IO stats->flags |= CTL_STATS_FLAG_TIME_VALID; #endif getnanouptime(&stats->timestamp); break; } default: { /* XXX KDM should we fix this? */ #if 0 struct ctl_backend_driver *backend; unsigned int type; int found; found = 0; /* * We encode the backend type as the ioctl type for backend * ioctls. So parse it out here, and then search for a * backend of this type. */ type = _IOC_TYPE(cmd); STAILQ_FOREACH(backend, &softc->be_list, links) { if (backend->type == type) { found = 1; break; } } if (found == 0) { printf("ctl: unknown ioctl command %#lx or backend " "%d\n", cmd, type); retval = EINVAL; break; } retval = backend->ioctl(dev, cmd, addr, flag, td); #endif retval = ENOTTY; break; } } return (retval); } uint32_t ctl_get_initindex(struct ctl_nexus *nexus) { return (nexus->initid + (nexus->targ_port * CTL_MAX_INIT_PER_PORT)); } int ctl_lun_map_init(struct ctl_port *port) { struct ctl_softc *softc = port->ctl_softc; struct ctl_lun *lun; int size = ctl_lun_map_size; uint32_t i; if (port->lun_map == NULL || port->lun_map_size < size) { port->lun_map_size = 0; free(port->lun_map, M_CTL); port->lun_map = malloc(size * sizeof(uint32_t), M_CTL, M_NOWAIT); } if (port->lun_map == NULL) return (ENOMEM); for (i = 0; i < size; i++) port->lun_map[i] = UINT32_MAX; port->lun_map_size = size; if (port->status & CTL_PORT_STATUS_ONLINE) { if (port->lun_disable != NULL) { STAILQ_FOREACH(lun, &softc->lun_list, links) port->lun_disable(port->targ_lun_arg, lun->lun); } ctl_isc_announce_port(port); } return (0); } int ctl_lun_map_deinit(struct ctl_port *port) { struct ctl_softc *softc = port->ctl_softc; struct ctl_lun *lun; if (port->lun_map == NULL) return (0); port->lun_map_size = 0; free(port->lun_map, M_CTL); port->lun_map = NULL; if (port->status & CTL_PORT_STATUS_ONLINE) { if (port->lun_enable != NULL) { STAILQ_FOREACH(lun, &softc->lun_list, links) port->lun_enable(port->targ_lun_arg, lun->lun); } ctl_isc_announce_port(port); } return (0); } int ctl_lun_map_set(struct ctl_port *port, uint32_t plun, uint32_t glun) { int status; uint32_t old; if (port->lun_map == NULL) { status = ctl_lun_map_init(port); if (status != 0) return (status); } if (plun >= port->lun_map_size) return (EINVAL); old = port->lun_map[plun]; port->lun_map[plun] = glun; if ((port->status & CTL_PORT_STATUS_ONLINE) && old == UINT32_MAX) { if (port->lun_enable != NULL) port->lun_enable(port->targ_lun_arg, plun); ctl_isc_announce_port(port); } return (0); } int ctl_lun_map_unset(struct ctl_port *port, uint32_t plun) { uint32_t old; if (port->lun_map == NULL || plun >= port->lun_map_size) return (0); old = port->lun_map[plun]; port->lun_map[plun] = UINT32_MAX; if ((port->status & CTL_PORT_STATUS_ONLINE) && old != UINT32_MAX) { if (port->lun_disable != NULL) port->lun_disable(port->targ_lun_arg, plun); ctl_isc_announce_port(port); } return (0); } uint32_t ctl_lun_map_from_port(struct ctl_port *port, uint32_t lun_id) { if (port == NULL) return (UINT32_MAX); if (port->lun_map == NULL) return (lun_id); if (lun_id > port->lun_map_size) return (UINT32_MAX); return (port->lun_map[lun_id]); } uint32_t ctl_lun_map_to_port(struct ctl_port *port, uint32_t lun_id) { uint32_t i; if (port == NULL) return (UINT32_MAX); if (port->lun_map == NULL) return (lun_id); for (i = 0; i < port->lun_map_size; i++) { if (port->lun_map[i] == lun_id) return (i); } return (UINT32_MAX); } uint32_t ctl_decode_lun(uint64_t encoded) { uint8_t lun[8]; uint32_t result = 0xffffffff; be64enc(lun, encoded); switch (lun[0] & RPL_LUNDATA_ATYP_MASK) { case RPL_LUNDATA_ATYP_PERIPH: if ((lun[0] & 0x3f) == 0 && lun[2] == 0 && lun[3] == 0 && lun[4] == 0 && lun[5] == 0 && lun[6] == 0 && lun[7] == 0) result = lun[1]; break; case RPL_LUNDATA_ATYP_FLAT: if (lun[2] == 0 && lun[3] == 0 && lun[4] == 0 && lun[5] == 0 && lun[6] == 0 && lun[7] == 0) result = ((lun[0] & 0x3f) << 8) + lun[1]; break; case RPL_LUNDATA_ATYP_EXTLUN: switch (lun[0] & RPL_LUNDATA_EXT_EAM_MASK) { case 0x02: switch (lun[0] & RPL_LUNDATA_EXT_LEN_MASK) { case 0x00: result = lun[1]; break; case 0x10: result = (lun[1] << 16) + (lun[2] << 8) + lun[3]; break; case 0x20: if (lun[1] == 0 && lun[6] == 0 && lun[7] == 0) result = (lun[2] << 24) + (lun[3] << 16) + (lun[4] << 8) + lun[5]; break; } break; case RPL_LUNDATA_EXT_EAM_NOT_SPEC: result = 0xffffffff; break; } break; } return (result); } uint64_t ctl_encode_lun(uint32_t decoded) { uint64_t l = decoded; if (l <= 0xff) return (((uint64_t)RPL_LUNDATA_ATYP_PERIPH << 56) | (l << 48)); if (l <= 0x3fff) return (((uint64_t)RPL_LUNDATA_ATYP_FLAT << 56) | (l << 48)); if (l <= 0xffffff) return (((uint64_t)(RPL_LUNDATA_ATYP_EXTLUN | 0x12) << 56) | (l << 32)); return ((((uint64_t)RPL_LUNDATA_ATYP_EXTLUN | 0x22) << 56) | (l << 16)); } int ctl_ffz(uint32_t *mask, uint32_t first, uint32_t last) { int i; for (i = first; i < last; i++) { if ((mask[i / 32] & (1 << (i % 32))) == 0) return (i); } return (-1); } int ctl_set_mask(uint32_t *mask, uint32_t bit) { uint32_t chunk, piece; chunk = bit >> 5; piece = bit % (sizeof(uint32_t) * 8); if ((mask[chunk] & (1 << piece)) != 0) return (-1); else mask[chunk] |= (1 << piece); return (0); } int ctl_clear_mask(uint32_t *mask, uint32_t bit) { uint32_t chunk, piece; chunk = bit >> 5; piece = bit % (sizeof(uint32_t) * 8); if ((mask[chunk] & (1 << piece)) == 0) return (-1); else mask[chunk] &= ~(1 << piece); return (0); } int ctl_is_set(uint32_t *mask, uint32_t bit) { uint32_t chunk, piece; chunk = bit >> 5; piece = bit % (sizeof(uint32_t) * 8); if ((mask[chunk] & (1 << piece)) == 0) return (0); else return (1); } static uint64_t ctl_get_prkey(struct ctl_lun *lun, uint32_t residx) { uint64_t *t; t = lun->pr_keys[residx/CTL_MAX_INIT_PER_PORT]; if (t == NULL) return (0); return (t[residx % CTL_MAX_INIT_PER_PORT]); } static void ctl_clr_prkey(struct ctl_lun *lun, uint32_t residx) { uint64_t *t; t = lun->pr_keys[residx/CTL_MAX_INIT_PER_PORT]; if (t == NULL) return; t[residx % CTL_MAX_INIT_PER_PORT] = 0; } static void ctl_alloc_prkey(struct ctl_lun *lun, uint32_t residx) { uint64_t *p; u_int i; i = residx/CTL_MAX_INIT_PER_PORT; if (lun->pr_keys[i] != NULL) return; mtx_unlock(&lun->lun_lock); p = malloc(sizeof(uint64_t) * CTL_MAX_INIT_PER_PORT, M_CTL, M_WAITOK | M_ZERO); mtx_lock(&lun->lun_lock); if (lun->pr_keys[i] == NULL) lun->pr_keys[i] = p; else free(p, M_CTL); } static void ctl_set_prkey(struct ctl_lun *lun, uint32_t residx, uint64_t key) { uint64_t *t; t = lun->pr_keys[residx/CTL_MAX_INIT_PER_PORT]; KASSERT(t != NULL, ("prkey %d is not allocated", residx)); t[residx % CTL_MAX_INIT_PER_PORT] = key; } /* * ctl_softc, pool_name, total_ctl_io are passed in. * npool is passed out. */ int ctl_pool_create(struct ctl_softc *ctl_softc, const char *pool_name, uint32_t total_ctl_io, void **npool) { struct ctl_io_pool *pool; pool = (struct ctl_io_pool *)malloc(sizeof(*pool), M_CTL, M_NOWAIT | M_ZERO); if (pool == NULL) return (ENOMEM); snprintf(pool->name, sizeof(pool->name), "CTL IO %s", pool_name); pool->ctl_softc = ctl_softc; #ifdef IO_POOLS pool->zone = uma_zsecond_create(pool->name, NULL, NULL, NULL, NULL, ctl_softc->io_zone); /* uma_prealloc(pool->zone, total_ctl_io); */ #else pool->zone = ctl_softc->io_zone; #endif *npool = pool; return (0); } void ctl_pool_free(struct ctl_io_pool *pool) { if (pool == NULL) return; #ifdef IO_POOLS uma_zdestroy(pool->zone); #endif free(pool, M_CTL); } union ctl_io * ctl_alloc_io(void *pool_ref) { struct ctl_io_pool *pool = (struct ctl_io_pool *)pool_ref; union ctl_io *io; io = uma_zalloc(pool->zone, M_WAITOK); if (io != NULL) { io->io_hdr.pool = pool_ref; CTL_SOFTC(io) = pool->ctl_softc; TAILQ_INIT(&io->io_hdr.blocked_queue); } return (io); } union ctl_io * ctl_alloc_io_nowait(void *pool_ref) { struct ctl_io_pool *pool = (struct ctl_io_pool *)pool_ref; union ctl_io *io; io = uma_zalloc(pool->zone, M_NOWAIT); if (io != NULL) { io->io_hdr.pool = pool_ref; CTL_SOFTC(io) = pool->ctl_softc; TAILQ_INIT(&io->io_hdr.blocked_queue); } return (io); } void ctl_free_io(union ctl_io *io) { struct ctl_io_pool *pool; if (io == NULL) return; pool = (struct ctl_io_pool *)io->io_hdr.pool; uma_zfree(pool->zone, io); } void ctl_zero_io(union ctl_io *io) { struct ctl_io_pool *pool; if (io == NULL) return; /* * May need to preserve linked list pointers at some point too. */ pool = io->io_hdr.pool; memset(io, 0, sizeof(*io)); io->io_hdr.pool = pool; CTL_SOFTC(io) = pool->ctl_softc; TAILQ_INIT(&io->io_hdr.blocked_queue); } int ctl_expand_number(const char *buf, uint64_t *num) { char *endptr; uint64_t number; unsigned shift; number = strtoq(buf, &endptr, 0); switch (tolower((unsigned char)*endptr)) { case 'e': shift = 60; break; case 'p': shift = 50; break; case 't': shift = 40; break; case 'g': shift = 30; break; case 'm': shift = 20; break; case 'k': shift = 10; break; case 'b': case '\0': /* No unit. */ *num = number; return (0); default: /* Unrecognized unit. */ return (-1); } if ((number << shift) >> shift != number) { /* Overflow */ return (-1); } *num = number << shift; return (0); } /* * This routine could be used in the future to load default and/or saved * mode page parameters for a particuar lun. */ static int ctl_init_page_index(struct ctl_lun *lun) { int i, page_code; struct ctl_page_index *page_index; const char *value; uint64_t ival; memcpy(&lun->mode_pages.index, page_index_template, sizeof(page_index_template)); for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { page_index = &lun->mode_pages.index[i]; if (lun->be_lun->lun_type == T_DIRECT && (page_index->page_flags & CTL_PAGE_FLAG_DIRECT) == 0) continue; if (lun->be_lun->lun_type == T_PROCESSOR && (page_index->page_flags & CTL_PAGE_FLAG_PROC) == 0) continue; if (lun->be_lun->lun_type == T_CDROM && (page_index->page_flags & CTL_PAGE_FLAG_CDROM) == 0) continue; page_code = page_index->page_code & SMPH_PC_MASK; switch (page_code) { case SMS_RW_ERROR_RECOVERY_PAGE: { KASSERT(page_index->subpage == SMS_SUBPAGE_PAGE_0, ("subpage %#x for page %#x is incorrect!", page_index->subpage, page_code)); memcpy(&lun->mode_pages.rw_er_page[CTL_PAGE_CURRENT], &rw_er_page_default, sizeof(rw_er_page_default)); memcpy(&lun->mode_pages.rw_er_page[CTL_PAGE_CHANGEABLE], &rw_er_page_changeable, sizeof(rw_er_page_changeable)); memcpy(&lun->mode_pages.rw_er_page[CTL_PAGE_DEFAULT], &rw_er_page_default, sizeof(rw_er_page_default)); memcpy(&lun->mode_pages.rw_er_page[CTL_PAGE_SAVED], &rw_er_page_default, sizeof(rw_er_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.rw_er_page; break; } case SMS_FORMAT_DEVICE_PAGE: { struct scsi_format_page *format_page; KASSERT(page_index->subpage == SMS_SUBPAGE_PAGE_0, ("subpage %#x for page %#x is incorrect!", page_index->subpage, page_code)); /* * Sectors per track are set above. Bytes per * sector need to be set here on a per-LUN basis. */ memcpy(&lun->mode_pages.format_page[CTL_PAGE_CURRENT], &format_page_default, sizeof(format_page_default)); memcpy(&lun->mode_pages.format_page[ CTL_PAGE_CHANGEABLE], &format_page_changeable, sizeof(format_page_changeable)); memcpy(&lun->mode_pages.format_page[CTL_PAGE_DEFAULT], &format_page_default, sizeof(format_page_default)); memcpy(&lun->mode_pages.format_page[CTL_PAGE_SAVED], &format_page_default, sizeof(format_page_default)); format_page = &lun->mode_pages.format_page[ CTL_PAGE_CURRENT]; scsi_ulto2b(lun->be_lun->blocksize, format_page->bytes_per_sector); format_page = &lun->mode_pages.format_page[ CTL_PAGE_DEFAULT]; scsi_ulto2b(lun->be_lun->blocksize, format_page->bytes_per_sector); format_page = &lun->mode_pages.format_page[ CTL_PAGE_SAVED]; scsi_ulto2b(lun->be_lun->blocksize, format_page->bytes_per_sector); page_index->page_data = (uint8_t *)lun->mode_pages.format_page; break; } case SMS_RIGID_DISK_PAGE: { struct scsi_rigid_disk_page *rigid_disk_page; uint32_t sectors_per_cylinder; uint64_t cylinders; #ifndef __XSCALE__ int shift; #endif /* !__XSCALE__ */ KASSERT(page_index->subpage == SMS_SUBPAGE_PAGE_0, ("subpage %#x for page %#x is incorrect!", page_index->subpage, page_code)); /* * Rotation rate and sectors per track are set * above. We calculate the cylinders here based on * capacity. Due to the number of heads and * sectors per track we're using, smaller arrays * may turn out to have 0 cylinders. Linux and * FreeBSD don't pay attention to these mode pages * to figure out capacity, but Solaris does. It * seems to deal with 0 cylinders just fine, and * works out a fake geometry based on the capacity. */ memcpy(&lun->mode_pages.rigid_disk_page[ CTL_PAGE_DEFAULT], &rigid_disk_page_default, sizeof(rigid_disk_page_default)); memcpy(&lun->mode_pages.rigid_disk_page[ CTL_PAGE_CHANGEABLE],&rigid_disk_page_changeable, sizeof(rigid_disk_page_changeable)); sectors_per_cylinder = CTL_DEFAULT_SECTORS_PER_TRACK * CTL_DEFAULT_HEADS; /* * The divide method here will be more accurate, * probably, but results in floating point being * used in the kernel on i386 (__udivdi3()). On the * XScale, though, __udivdi3() is implemented in * software. * * The shift method for cylinder calculation is * accurate if sectors_per_cylinder is a power of * 2. Otherwise it might be slightly off -- you * might have a bit of a truncation problem. */ #ifdef __XSCALE__ cylinders = (lun->be_lun->maxlba + 1) / sectors_per_cylinder; #else for (shift = 31; shift > 0; shift--) { if (sectors_per_cylinder & (1 << shift)) break; } cylinders = (lun->be_lun->maxlba + 1) >> shift; #endif /* * We've basically got 3 bytes, or 24 bits for the * cylinder size in the mode page. If we're over, * just round down to 2^24. */ if (cylinders > 0xffffff) cylinders = 0xffffff; rigid_disk_page = &lun->mode_pages.rigid_disk_page[ CTL_PAGE_DEFAULT]; scsi_ulto3b(cylinders, rigid_disk_page->cylinders); if ((value = dnvlist_get_string(lun->be_lun->options, "rpm", NULL)) != NULL) { scsi_ulto2b(strtol(value, NULL, 0), rigid_disk_page->rotation_rate); } memcpy(&lun->mode_pages.rigid_disk_page[CTL_PAGE_CURRENT], &lun->mode_pages.rigid_disk_page[CTL_PAGE_DEFAULT], sizeof(rigid_disk_page_default)); memcpy(&lun->mode_pages.rigid_disk_page[CTL_PAGE_SAVED], &lun->mode_pages.rigid_disk_page[CTL_PAGE_DEFAULT], sizeof(rigid_disk_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.rigid_disk_page; break; } case SMS_VERIFY_ERROR_RECOVERY_PAGE: { KASSERT(page_index->subpage == SMS_SUBPAGE_PAGE_0, ("subpage %#x for page %#x is incorrect!", page_index->subpage, page_code)); memcpy(&lun->mode_pages.verify_er_page[CTL_PAGE_CURRENT], &verify_er_page_default, sizeof(verify_er_page_default)); memcpy(&lun->mode_pages.verify_er_page[CTL_PAGE_CHANGEABLE], &verify_er_page_changeable, sizeof(verify_er_page_changeable)); memcpy(&lun->mode_pages.verify_er_page[CTL_PAGE_DEFAULT], &verify_er_page_default, sizeof(verify_er_page_default)); memcpy(&lun->mode_pages.verify_er_page[CTL_PAGE_SAVED], &verify_er_page_default, sizeof(verify_er_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.verify_er_page; break; } case SMS_CACHING_PAGE: { struct scsi_caching_page *caching_page; KASSERT(page_index->subpage == SMS_SUBPAGE_PAGE_0, ("subpage %#x for page %#x is incorrect!", page_index->subpage, page_code)); memcpy(&lun->mode_pages.caching_page[CTL_PAGE_DEFAULT], &caching_page_default, sizeof(caching_page_default)); memcpy(&lun->mode_pages.caching_page[ CTL_PAGE_CHANGEABLE], &caching_page_changeable, sizeof(caching_page_changeable)); memcpy(&lun->mode_pages.caching_page[CTL_PAGE_SAVED], &caching_page_default, sizeof(caching_page_default)); caching_page = &lun->mode_pages.caching_page[ CTL_PAGE_SAVED]; value = dnvlist_get_string(lun->be_lun->options, "writecache", NULL); if (value != NULL && strcmp(value, "off") == 0) caching_page->flags1 &= ~SCP_WCE; value = dnvlist_get_string(lun->be_lun->options, "readcache", NULL); if (value != NULL && strcmp(value, "off") == 0) caching_page->flags1 |= SCP_RCD; memcpy(&lun->mode_pages.caching_page[CTL_PAGE_CURRENT], &lun->mode_pages.caching_page[CTL_PAGE_SAVED], sizeof(caching_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.caching_page; break; } case SMS_CONTROL_MODE_PAGE: { switch (page_index->subpage) { case SMS_SUBPAGE_PAGE_0: { struct scsi_control_page *control_page; memcpy(&lun->mode_pages.control_page[ CTL_PAGE_DEFAULT], &control_page_default, sizeof(control_page_default)); memcpy(&lun->mode_pages.control_page[ CTL_PAGE_CHANGEABLE], &control_page_changeable, sizeof(control_page_changeable)); memcpy(&lun->mode_pages.control_page[ CTL_PAGE_SAVED], &control_page_default, sizeof(control_page_default)); control_page = &lun->mode_pages.control_page[ CTL_PAGE_SAVED]; value = dnvlist_get_string(lun->be_lun->options, "reordering", NULL); if (value != NULL && strcmp(value, "unrestricted") == 0) { control_page->queue_flags &= ~SCP_QUEUE_ALG_MASK; control_page->queue_flags |= SCP_QUEUE_ALG_UNRESTRICTED; } memcpy(&lun->mode_pages.control_page[ CTL_PAGE_CURRENT], &lun->mode_pages.control_page[ CTL_PAGE_SAVED], sizeof(control_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.control_page; break; } case 0x01: memcpy(&lun->mode_pages.control_ext_page[ CTL_PAGE_DEFAULT], &control_ext_page_default, sizeof(control_ext_page_default)); memcpy(&lun->mode_pages.control_ext_page[ CTL_PAGE_CHANGEABLE], &control_ext_page_changeable, sizeof(control_ext_page_changeable)); memcpy(&lun->mode_pages.control_ext_page[ CTL_PAGE_SAVED], &control_ext_page_default, sizeof(control_ext_page_default)); memcpy(&lun->mode_pages.control_ext_page[ CTL_PAGE_CURRENT], &lun->mode_pages.control_ext_page[ CTL_PAGE_SAVED], sizeof(control_ext_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.control_ext_page; break; default: panic("subpage %#x for page %#x is incorrect!", page_index->subpage, page_code); } break; } case SMS_INFO_EXCEPTIONS_PAGE: { switch (page_index->subpage) { case SMS_SUBPAGE_PAGE_0: memcpy(&lun->mode_pages.ie_page[CTL_PAGE_CURRENT], &ie_page_default, sizeof(ie_page_default)); memcpy(&lun->mode_pages.ie_page[ CTL_PAGE_CHANGEABLE], &ie_page_changeable, sizeof(ie_page_changeable)); memcpy(&lun->mode_pages.ie_page[CTL_PAGE_DEFAULT], &ie_page_default, sizeof(ie_page_default)); memcpy(&lun->mode_pages.ie_page[CTL_PAGE_SAVED], &ie_page_default, sizeof(ie_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.ie_page; break; case 0x02: { struct ctl_logical_block_provisioning_page *page; memcpy(&lun->mode_pages.lbp_page[CTL_PAGE_DEFAULT], &lbp_page_default, sizeof(lbp_page_default)); memcpy(&lun->mode_pages.lbp_page[ CTL_PAGE_CHANGEABLE], &lbp_page_changeable, sizeof(lbp_page_changeable)); memcpy(&lun->mode_pages.lbp_page[CTL_PAGE_SAVED], &lbp_page_default, sizeof(lbp_page_default)); page = &lun->mode_pages.lbp_page[CTL_PAGE_SAVED]; value = dnvlist_get_string(lun->be_lun->options, "avail-threshold", NULL); if (value != NULL && ctl_expand_number(value, &ival) == 0) { page->descr[0].flags |= SLBPPD_ENABLED | SLBPPD_ARMING_DEC; if (lun->be_lun->blocksize) ival /= lun->be_lun->blocksize; else ival /= 512; scsi_ulto4b(ival >> CTL_LBP_EXPONENT, page->descr[0].count); } value = dnvlist_get_string(lun->be_lun->options, "used-threshold", NULL); if (value != NULL && ctl_expand_number(value, &ival) == 0) { page->descr[1].flags |= SLBPPD_ENABLED | SLBPPD_ARMING_INC; if (lun->be_lun->blocksize) ival /= lun->be_lun->blocksize; else ival /= 512; scsi_ulto4b(ival >> CTL_LBP_EXPONENT, page->descr[1].count); } value = dnvlist_get_string(lun->be_lun->options, "pool-avail-threshold", NULL); if (value != NULL && ctl_expand_number(value, &ival) == 0) { page->descr[2].flags |= SLBPPD_ENABLED | SLBPPD_ARMING_DEC; if (lun->be_lun->blocksize) ival /= lun->be_lun->blocksize; else ival /= 512; scsi_ulto4b(ival >> CTL_LBP_EXPONENT, page->descr[2].count); } value = dnvlist_get_string(lun->be_lun->options, "pool-used-threshold", NULL); if (value != NULL && ctl_expand_number(value, &ival) == 0) { page->descr[3].flags |= SLBPPD_ENABLED | SLBPPD_ARMING_INC; if (lun->be_lun->blocksize) ival /= lun->be_lun->blocksize; else ival /= 512; scsi_ulto4b(ival >> CTL_LBP_EXPONENT, page->descr[3].count); } memcpy(&lun->mode_pages.lbp_page[CTL_PAGE_CURRENT], &lun->mode_pages.lbp_page[CTL_PAGE_SAVED], sizeof(lbp_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.lbp_page; break; } default: panic("subpage %#x for page %#x is incorrect!", page_index->subpage, page_code); } break; } case SMS_CDDVD_CAPS_PAGE:{ KASSERT(page_index->subpage == SMS_SUBPAGE_PAGE_0, ("subpage %#x for page %#x is incorrect!", page_index->subpage, page_code)); memcpy(&lun->mode_pages.cddvd_page[CTL_PAGE_DEFAULT], &cddvd_page_default, sizeof(cddvd_page_default)); memcpy(&lun->mode_pages.cddvd_page[ CTL_PAGE_CHANGEABLE], &cddvd_page_changeable, sizeof(cddvd_page_changeable)); memcpy(&lun->mode_pages.cddvd_page[CTL_PAGE_SAVED], &cddvd_page_default, sizeof(cddvd_page_default)); memcpy(&lun->mode_pages.cddvd_page[CTL_PAGE_CURRENT], &lun->mode_pages.cddvd_page[CTL_PAGE_SAVED], sizeof(cddvd_page_default)); page_index->page_data = (uint8_t *)lun->mode_pages.cddvd_page; break; } default: panic("invalid page code value %#x", page_code); } } return (CTL_RETVAL_COMPLETE); } static int ctl_init_log_page_index(struct ctl_lun *lun) { struct ctl_page_index *page_index; int i, j, k, prev; memcpy(&lun->log_pages.index, log_page_index_template, sizeof(log_page_index_template)); prev = -1; for (i = 0, j = 0, k = 0; i < CTL_NUM_LOG_PAGES; i++) { page_index = &lun->log_pages.index[i]; if (lun->be_lun->lun_type == T_DIRECT && (page_index->page_flags & CTL_PAGE_FLAG_DIRECT) == 0) continue; if (lun->be_lun->lun_type == T_PROCESSOR && (page_index->page_flags & CTL_PAGE_FLAG_PROC) == 0) continue; if (lun->be_lun->lun_type == T_CDROM && (page_index->page_flags & CTL_PAGE_FLAG_CDROM) == 0) continue; if (page_index->page_code == SLS_LOGICAL_BLOCK_PROVISIONING && lun->backend->lun_attr == NULL) continue; if (page_index->page_code != prev) { lun->log_pages.pages_page[j] = page_index->page_code; prev = page_index->page_code; j++; } lun->log_pages.subpages_page[k*2] = page_index->page_code; lun->log_pages.subpages_page[k*2+1] = page_index->subpage; k++; } lun->log_pages.index[0].page_data = &lun->log_pages.pages_page[0]; lun->log_pages.index[0].page_len = j; lun->log_pages.index[1].page_data = &lun->log_pages.subpages_page[0]; lun->log_pages.index[1].page_len = k * 2; - lun->log_pages.index[2].page_data = &lun->log_pages.lbp_page[0]; - lun->log_pages.index[2].page_len = 12*CTL_NUM_LBP_PARAMS; - lun->log_pages.index[3].page_data = (uint8_t *)&lun->log_pages.stat_page; - lun->log_pages.index[3].page_len = sizeof(lun->log_pages.stat_page); - lun->log_pages.index[4].page_data = (uint8_t *)&lun->log_pages.ie_page; - lun->log_pages.index[4].page_len = sizeof(lun->log_pages.ie_page); + lun->log_pages.index[2].page_data = (uint8_t *)&lun->log_pages.temp_page; + lun->log_pages.index[2].page_len = sizeof(lun->log_pages.temp_page); + lun->log_pages.index[3].page_data = &lun->log_pages.lbp_page[0]; + lun->log_pages.index[3].page_len = 12*CTL_NUM_LBP_PARAMS; + lun->log_pages.index[4].page_data = (uint8_t *)&lun->log_pages.stat_page; + lun->log_pages.index[4].page_len = sizeof(lun->log_pages.stat_page); + lun->log_pages.index[5].page_data = (uint8_t *)&lun->log_pages.ie_page; + lun->log_pages.index[5].page_len = sizeof(lun->log_pages.ie_page); return (CTL_RETVAL_COMPLETE); } static int hex2bin(const char *str, uint8_t *buf, int buf_size) { int i; u_char c; memset(buf, 0, buf_size); while (isspace(str[0])) str++; if (str[0] == '0' && (str[1] == 'x' || str[1] == 'X')) str += 2; buf_size *= 2; for (i = 0; str[i] != 0 && i < buf_size; i++) { while (str[i] == '-') /* Skip dashes in UUIDs. */ str++; c = str[i]; if (isdigit(c)) c -= '0'; else if (isalpha(c)) c -= isupper(c) ? 'A' - 10 : 'a' - 10; else break; if (c >= 16) break; if ((i & 1) == 0) buf[i / 2] |= (c << 4); else buf[i / 2] |= c; } return ((i + 1) / 2); } /* * LUN allocation. * * Requirements: * - caller allocates and zeros LUN storage, or passes in a NULL LUN if he * wants us to allocate the LUN and he can block. * - ctl_softc is always set * - be_lun is set if the LUN has a backend (needed for disk LUNs) * * Returns 0 for success, non-zero (errno) for failure. */ static int ctl_alloc_lun(struct ctl_softc *ctl_softc, struct ctl_lun *ctl_lun, struct ctl_be_lun *const be_lun) { struct ctl_lun *nlun, *lun; struct scsi_vpd_id_descriptor *desc; struct scsi_vpd_id_t10 *t10id; const char *eui, *naa, *scsiname, *uuid, *vendor, *value; int lun_number, lun_malloced; int devidlen, idlen1, idlen2 = 0, len; if (be_lun == NULL) return (EINVAL); /* * We currently only support Direct Access or Processor LUN types. */ switch (be_lun->lun_type) { case T_DIRECT: case T_PROCESSOR: case T_CDROM: break; case T_SEQUENTIAL: case T_CHANGER: default: be_lun->lun_config_status(be_lun->be_lun, CTL_LUN_CONFIG_FAILURE); break; } if (ctl_lun == NULL) { lun = malloc(sizeof(*lun), M_CTL, M_WAITOK); lun_malloced = 1; } else { lun_malloced = 0; lun = ctl_lun; } memset(lun, 0, sizeof(*lun)); if (lun_malloced) lun->flags = CTL_LUN_MALLOCED; lun->pending_sense = malloc(sizeof(struct scsi_sense_data *) * ctl_max_ports, M_DEVBUF, M_WAITOK | M_ZERO); lun->pending_ua = malloc(sizeof(ctl_ua_type *) * ctl_max_ports, M_DEVBUF, M_WAITOK | M_ZERO); lun->pr_keys = malloc(sizeof(uint64_t *) * ctl_max_ports, M_DEVBUF, M_WAITOK | M_ZERO); /* Generate LUN ID. */ devidlen = max(CTL_DEVID_MIN_LEN, strnlen(be_lun->device_id, CTL_DEVID_LEN)); idlen1 = sizeof(*t10id) + devidlen; len = sizeof(struct scsi_vpd_id_descriptor) + idlen1; scsiname = dnvlist_get_string(be_lun->options, "scsiname", NULL); if (scsiname != NULL) { idlen2 = roundup2(strlen(scsiname) + 1, 4); len += sizeof(struct scsi_vpd_id_descriptor) + idlen2; } eui = dnvlist_get_string(be_lun->options, "eui", NULL); if (eui != NULL) { len += sizeof(struct scsi_vpd_id_descriptor) + 16; } naa = dnvlist_get_string(be_lun->options, "naa", NULL); if (naa != NULL) { len += sizeof(struct scsi_vpd_id_descriptor) + 16; } uuid = dnvlist_get_string(be_lun->options, "uuid", NULL); if (uuid != NULL) { len += sizeof(struct scsi_vpd_id_descriptor) + 18; } lun->lun_devid = malloc(sizeof(struct ctl_devid) + len, M_CTL, M_WAITOK | M_ZERO); desc = (struct scsi_vpd_id_descriptor *)lun->lun_devid->data; desc->proto_codeset = SVPD_ID_CODESET_ASCII; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN | SVPD_ID_TYPE_T10; desc->length = idlen1; t10id = (struct scsi_vpd_id_t10 *)&desc->identifier[0]; memset(t10id->vendor, ' ', sizeof(t10id->vendor)); if ((vendor = dnvlist_get_string(be_lun->options, "vendor", NULL)) == NULL) { strncpy((char *)t10id->vendor, CTL_VENDOR, sizeof(t10id->vendor)); } else { strncpy(t10id->vendor, vendor, min(sizeof(t10id->vendor), strlen(vendor))); } strncpy((char *)t10id->vendor_spec_id, (char *)be_lun->device_id, devidlen); if (scsiname != NULL) { desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] + desc->length); desc->proto_codeset = SVPD_ID_CODESET_UTF8; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN | SVPD_ID_TYPE_SCSI_NAME; desc->length = idlen2; strlcpy(desc->identifier, scsiname, idlen2); } if (eui != NULL) { desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] + desc->length); desc->proto_codeset = SVPD_ID_CODESET_BINARY; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN | SVPD_ID_TYPE_EUI64; desc->length = hex2bin(eui, desc->identifier, 16); desc->length = desc->length > 12 ? 16 : (desc->length > 8 ? 12 : 8); len -= 16 - desc->length; } if (naa != NULL) { desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] + desc->length); desc->proto_codeset = SVPD_ID_CODESET_BINARY; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN | SVPD_ID_TYPE_NAA; desc->length = hex2bin(naa, desc->identifier, 16); desc->length = desc->length > 8 ? 16 : 8; len -= 16 - desc->length; } if (uuid != NULL) { desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] + desc->length); desc->proto_codeset = SVPD_ID_CODESET_BINARY; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN | SVPD_ID_TYPE_UUID; desc->identifier[0] = 0x10; hex2bin(uuid, &desc->identifier[2], 16); desc->length = 18; } lun->lun_devid->len = len; mtx_lock(&ctl_softc->ctl_lock); /* * See if the caller requested a particular LUN number. If so, see * if it is available. Otherwise, allocate the first available LUN. */ if (be_lun->flags & CTL_LUN_FLAG_ID_REQ) { if ((be_lun->req_lun_id > (ctl_max_luns - 1)) || (ctl_is_set(ctl_softc->ctl_lun_mask, be_lun->req_lun_id))) { mtx_unlock(&ctl_softc->ctl_lock); if (be_lun->req_lun_id > (ctl_max_luns - 1)) { printf("ctl: requested LUN ID %d is higher " "than ctl_max_luns - 1 (%d)\n", be_lun->req_lun_id, ctl_max_luns - 1); } else { /* * XXX KDM return an error, or just assign * another LUN ID in this case?? */ printf("ctl: requested LUN ID %d is already " "in use\n", be_lun->req_lun_id); } fail: free(lun->lun_devid, M_CTL); if (lun->flags & CTL_LUN_MALLOCED) free(lun, M_CTL); be_lun->lun_config_status(be_lun->be_lun, CTL_LUN_CONFIG_FAILURE); return (ENOSPC); } lun_number = be_lun->req_lun_id; } else { lun_number = ctl_ffz(ctl_softc->ctl_lun_mask, 0, ctl_max_luns); if (lun_number == -1) { mtx_unlock(&ctl_softc->ctl_lock); printf("ctl: can't allocate LUN, out of LUNs\n"); goto fail; } } ctl_set_mask(ctl_softc->ctl_lun_mask, lun_number); mtx_unlock(&ctl_softc->ctl_lock); mtx_init(&lun->lun_lock, "CTL LUN", NULL, MTX_DEF); lun->lun = lun_number; lun->be_lun = be_lun; /* * The processor LUN is always enabled. Disk LUNs come on line * disabled, and must be enabled by the backend. */ lun->flags |= CTL_LUN_DISABLED; lun->backend = be_lun->be; be_lun->ctl_lun = lun; be_lun->lun_id = lun_number; atomic_add_int(&be_lun->be->num_luns, 1); if (be_lun->flags & CTL_LUN_FLAG_EJECTED) lun->flags |= CTL_LUN_EJECTED; if (be_lun->flags & CTL_LUN_FLAG_NO_MEDIA) lun->flags |= CTL_LUN_NO_MEDIA; if (be_lun->flags & CTL_LUN_FLAG_STOPPED) lun->flags |= CTL_LUN_STOPPED; if (be_lun->flags & CTL_LUN_FLAG_PRIMARY) lun->flags |= CTL_LUN_PRIMARY_SC; value = dnvlist_get_string(be_lun->options, "removable", NULL); if (value != NULL) { if (strcmp(value, "on") == 0) lun->flags |= CTL_LUN_REMOVABLE; } else if (be_lun->lun_type == T_CDROM) lun->flags |= CTL_LUN_REMOVABLE; lun->ctl_softc = ctl_softc; #ifdef CTL_TIME_IO lun->last_busy = getsbinuptime(); #endif TAILQ_INIT(&lun->ooa_queue); STAILQ_INIT(&lun->error_list); lun->ie_reported = 1; callout_init_mtx(&lun->ie_callout, &lun->lun_lock, 0); ctl_tpc_lun_init(lun); if (lun->flags & CTL_LUN_REMOVABLE) { lun->prevent = malloc((CTL_MAX_INITIATORS + 31) / 32 * 4, M_CTL, M_WAITOK); } /* * Initialize the mode and log page index. */ ctl_init_page_index(lun); ctl_init_log_page_index(lun); /* Setup statistics gathering */ lun->stats.item = lun_number; /* * Now, before we insert this lun on the lun list, set the lun * inventory changed UA for all other luns. */ mtx_lock(&ctl_softc->ctl_lock); STAILQ_FOREACH(nlun, &ctl_softc->lun_list, links) { mtx_lock(&nlun->lun_lock); ctl_est_ua_all(nlun, -1, CTL_UA_LUN_CHANGE); mtx_unlock(&nlun->lun_lock); } STAILQ_INSERT_TAIL(&ctl_softc->lun_list, lun, links); ctl_softc->ctl_luns[lun_number] = lun; ctl_softc->num_luns++; mtx_unlock(&ctl_softc->ctl_lock); lun->be_lun->lun_config_status(lun->be_lun->be_lun, CTL_LUN_CONFIG_OK); return (0); } /* * Delete a LUN. * Assumptions: * - LUN has already been marked invalid and any pending I/O has been taken * care of. */ static int ctl_free_lun(struct ctl_lun *lun) { struct ctl_softc *softc = lun->ctl_softc; struct ctl_lun *nlun; int i; KASSERT(TAILQ_EMPTY(&lun->ooa_queue), ("Freeing a LUN %p with outstanding I/O!\n", lun)); mtx_lock(&softc->ctl_lock); STAILQ_REMOVE(&softc->lun_list, lun, ctl_lun, links); ctl_clear_mask(softc->ctl_lun_mask, lun->lun); softc->ctl_luns[lun->lun] = NULL; softc->num_luns--; STAILQ_FOREACH(nlun, &softc->lun_list, links) { mtx_lock(&nlun->lun_lock); ctl_est_ua_all(nlun, -1, CTL_UA_LUN_CHANGE); mtx_unlock(&nlun->lun_lock); } mtx_unlock(&softc->ctl_lock); /* * Tell the backend to free resources, if this LUN has a backend. */ atomic_subtract_int(&lun->be_lun->be->num_luns, 1); lun->be_lun->lun_shutdown(lun->be_lun->be_lun); lun->ie_reportcnt = UINT32_MAX; callout_drain(&lun->ie_callout); ctl_tpc_lun_shutdown(lun); mtx_destroy(&lun->lun_lock); free(lun->lun_devid, M_CTL); for (i = 0; i < ctl_max_ports; i++) free(lun->pending_ua[i], M_CTL); free(lun->pending_ua, M_DEVBUF); for (i = 0; i < ctl_max_ports; i++) free(lun->pr_keys[i], M_CTL); free(lun->pr_keys, M_DEVBUF); free(lun->write_buffer, M_CTL); free(lun->prevent, M_CTL); if (lun->flags & CTL_LUN_MALLOCED) free(lun, M_CTL); return (0); } static void ctl_create_lun(struct ctl_be_lun *be_lun) { /* * ctl_alloc_lun() should handle all potential failure cases. */ ctl_alloc_lun(control_softc, NULL, be_lun); } int ctl_add_lun(struct ctl_be_lun *be_lun) { struct ctl_softc *softc = control_softc; mtx_lock(&softc->ctl_lock); STAILQ_INSERT_TAIL(&softc->pending_lun_queue, be_lun, links); mtx_unlock(&softc->ctl_lock); wakeup(&softc->pending_lun_queue); return (0); } int ctl_enable_lun(struct ctl_be_lun *be_lun) { struct ctl_softc *softc; struct ctl_port *port, *nport; struct ctl_lun *lun; int retval; lun = (struct ctl_lun *)be_lun->ctl_lun; softc = lun->ctl_softc; mtx_lock(&softc->ctl_lock); mtx_lock(&lun->lun_lock); if ((lun->flags & CTL_LUN_DISABLED) == 0) { /* * eh? Why did we get called if the LUN is already * enabled? */ mtx_unlock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); return (0); } lun->flags &= ~CTL_LUN_DISABLED; mtx_unlock(&lun->lun_lock); STAILQ_FOREACH_SAFE(port, &softc->port_list, links, nport) { if ((port->status & CTL_PORT_STATUS_ONLINE) == 0 || port->lun_map != NULL || port->lun_enable == NULL) continue; /* * Drop the lock while we call the FETD's enable routine. * This can lead to a callback into CTL (at least in the * case of the internal initiator frontend. */ mtx_unlock(&softc->ctl_lock); retval = port->lun_enable(port->targ_lun_arg, lun->lun); mtx_lock(&softc->ctl_lock); if (retval != 0) { printf("%s: FETD %s port %d returned error " "%d for lun_enable on lun %jd\n", __func__, port->port_name, port->targ_port, retval, (intmax_t)lun->lun); } } mtx_unlock(&softc->ctl_lock); ctl_isc_announce_lun(lun); return (0); } int ctl_disable_lun(struct ctl_be_lun *be_lun) { struct ctl_softc *softc; struct ctl_port *port; struct ctl_lun *lun; int retval; lun = (struct ctl_lun *)be_lun->ctl_lun; softc = lun->ctl_softc; mtx_lock(&softc->ctl_lock); mtx_lock(&lun->lun_lock); if (lun->flags & CTL_LUN_DISABLED) { mtx_unlock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); return (0); } lun->flags |= CTL_LUN_DISABLED; mtx_unlock(&lun->lun_lock); STAILQ_FOREACH(port, &softc->port_list, links) { if ((port->status & CTL_PORT_STATUS_ONLINE) == 0 || port->lun_map != NULL || port->lun_disable == NULL) continue; /* * Drop the lock before we call the frontend's disable * routine, to avoid lock order reversals. * * XXX KDM what happens if the frontend list changes while * we're traversing it? It's unlikely, but should be handled. */ mtx_unlock(&softc->ctl_lock); retval = port->lun_disable(port->targ_lun_arg, lun->lun); mtx_lock(&softc->ctl_lock); if (retval != 0) { printf("%s: FETD %s port %d returned error " "%d for lun_disable on lun %jd\n", __func__, port->port_name, port->targ_port, retval, (intmax_t)lun->lun); } } mtx_unlock(&softc->ctl_lock); ctl_isc_announce_lun(lun); return (0); } int ctl_start_lun(struct ctl_be_lun *be_lun) { struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags &= ~CTL_LUN_STOPPED; mtx_unlock(&lun->lun_lock); return (0); } int ctl_stop_lun(struct ctl_be_lun *be_lun) { struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags |= CTL_LUN_STOPPED; mtx_unlock(&lun->lun_lock); return (0); } int ctl_lun_no_media(struct ctl_be_lun *be_lun) { struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags |= CTL_LUN_NO_MEDIA; mtx_unlock(&lun->lun_lock); return (0); } int ctl_lun_has_media(struct ctl_be_lun *be_lun) { struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun; union ctl_ha_msg msg; mtx_lock(&lun->lun_lock); lun->flags &= ~(CTL_LUN_NO_MEDIA | CTL_LUN_EJECTED); if (lun->flags & CTL_LUN_REMOVABLE) ctl_est_ua_all(lun, -1, CTL_UA_MEDIUM_CHANGE); mtx_unlock(&lun->lun_lock); if ((lun->flags & CTL_LUN_REMOVABLE) && lun->ctl_softc->ha_mode == CTL_HA_MODE_XFER) { bzero(&msg.ua, sizeof(msg.ua)); msg.hdr.msg_type = CTL_MSG_UA; msg.hdr.nexus.initid = -1; msg.hdr.nexus.targ_port = -1; msg.hdr.nexus.targ_lun = lun->lun; msg.hdr.nexus.targ_mapped_lun = lun->lun; msg.ua.ua_all = 1; msg.ua.ua_set = 1; msg.ua.ua_type = CTL_UA_MEDIUM_CHANGE; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg.ua), M_WAITOK); } return (0); } int ctl_lun_ejected(struct ctl_be_lun *be_lun) { struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags |= CTL_LUN_EJECTED; mtx_unlock(&lun->lun_lock); return (0); } int ctl_lun_primary(struct ctl_be_lun *be_lun) { struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags |= CTL_LUN_PRIMARY_SC; ctl_est_ua_all(lun, -1, CTL_UA_ASYM_ACC_CHANGE); mtx_unlock(&lun->lun_lock); ctl_isc_announce_lun(lun); return (0); } int ctl_lun_secondary(struct ctl_be_lun *be_lun) { struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); lun->flags &= ~CTL_LUN_PRIMARY_SC; ctl_est_ua_all(lun, -1, CTL_UA_ASYM_ACC_CHANGE); mtx_unlock(&lun->lun_lock); ctl_isc_announce_lun(lun); return (0); } int ctl_invalidate_lun(struct ctl_be_lun *be_lun) { struct ctl_lun *lun; lun = (struct ctl_lun *)be_lun->ctl_lun; mtx_lock(&lun->lun_lock); /* * The LUN needs to be disabled before it can be marked invalid. */ if ((lun->flags & CTL_LUN_DISABLED) == 0) { mtx_unlock(&lun->lun_lock); return (-1); } /* * Mark the LUN invalid. */ lun->flags |= CTL_LUN_INVALID; /* * If there is nothing in the OOA queue, go ahead and free the LUN. * If we have something in the OOA queue, we'll free it when the * last I/O completes. */ if (TAILQ_EMPTY(&lun->ooa_queue)) { mtx_unlock(&lun->lun_lock); ctl_free_lun(lun); } else mtx_unlock(&lun->lun_lock); return (0); } void ctl_lun_capacity_changed(struct ctl_be_lun *be_lun) { struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun; union ctl_ha_msg msg; mtx_lock(&lun->lun_lock); ctl_est_ua_all(lun, -1, CTL_UA_CAPACITY_CHANGE); mtx_unlock(&lun->lun_lock); if (lun->ctl_softc->ha_mode == CTL_HA_MODE_XFER) { /* Send msg to other side. */ bzero(&msg.ua, sizeof(msg.ua)); msg.hdr.msg_type = CTL_MSG_UA; msg.hdr.nexus.initid = -1; msg.hdr.nexus.targ_port = -1; msg.hdr.nexus.targ_lun = lun->lun; msg.hdr.nexus.targ_mapped_lun = lun->lun; msg.ua.ua_all = 1; msg.ua.ua_set = 1; msg.ua.ua_type = CTL_UA_CAPACITY_CHANGE; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg.ua), M_WAITOK); } } /* * Backend "memory move is complete" callback for requests that never * make it down to say RAIDCore's configuration code. */ int ctl_config_move_done(union ctl_io *io) { int retval; CTL_DEBUG_PRINT(("ctl_config_move_done\n")); KASSERT(io->io_hdr.io_type == CTL_IO_SCSI, ("Config I/O type isn't CTL_IO_SCSI (%d)!", io->io_hdr.io_type)); if ((io->io_hdr.port_status != 0) && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE || (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) { ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1, /*retry_count*/ io->io_hdr.port_status); } else if (io->scsiio.kern_data_resid != 0 && (io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_OUT && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE || (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) { ctl_set_invalid_field_ciu(&io->scsiio); } if (ctl_debug & CTL_DEBUG_CDB_DATA) ctl_data_print(io); if (((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE && (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS) || ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0)) { /* * XXX KDM just assuming a single pointer here, and not a * S/G list. If we start using S/G lists for config data, * we'll need to know how to clean them up here as well. */ if (io->io_hdr.flags & CTL_FLAG_ALLOCATED) free(io->scsiio.kern_data_ptr, M_CTL); ctl_done(io); retval = CTL_RETVAL_COMPLETE; } else { /* * XXX KDM now we need to continue data movement. Some * options: * - call ctl_scsiio() again? We don't do this for data * writes, because for those at least we know ahead of * time where the write will go and how long it is. For * config writes, though, that information is largely * contained within the write itself, thus we need to * parse out the data again. * * - Call some other function once the data is in? */ /* * XXX KDM call ctl_scsiio() again for now, and check flag * bits to see whether we're allocated or not. */ retval = ctl_scsiio(&io->scsiio); } return (retval); } /* * This gets called by a backend driver when it is done with a * data_submit method. */ void ctl_data_submit_done(union ctl_io *io) { /* * If the IO_CONT flag is set, we need to call the supplied * function to continue processing the I/O, instead of completing * the I/O just yet. * * If there is an error, though, we don't want to keep processing. * Instead, just send status back to the initiator. */ if ((io->io_hdr.flags & CTL_FLAG_IO_CONT) && (io->io_hdr.flags & CTL_FLAG_ABORT) == 0 && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE || (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) { io->scsiio.io_cont(io); return; } ctl_done(io); } /* * This gets called by a backend driver when it is done with a * configuration write. */ void ctl_config_write_done(union ctl_io *io) { uint8_t *buf; /* * If the IO_CONT flag is set, we need to call the supplied * function to continue processing the I/O, instead of completing * the I/O just yet. * * If there is an error, though, we don't want to keep processing. * Instead, just send status back to the initiator. */ if ((io->io_hdr.flags & CTL_FLAG_IO_CONT) && (io->io_hdr.flags & CTL_FLAG_ABORT) == 0 && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE || (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) { io->scsiio.io_cont(io); return; } /* * Since a configuration write can be done for commands that actually * have data allocated, like write buffer, and commands that have * no data, like start/stop unit, we need to check here. */ if (io->io_hdr.flags & CTL_FLAG_ALLOCATED) buf = io->scsiio.kern_data_ptr; else buf = NULL; ctl_done(io); if (buf) free(buf, M_CTL); } void ctl_config_read_done(union ctl_io *io) { uint8_t *buf; /* * If there is some error -- we are done, skip data transfer. */ if ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0 || ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE && (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) { if (io->io_hdr.flags & CTL_FLAG_ALLOCATED) buf = io->scsiio.kern_data_ptr; else buf = NULL; ctl_done(io); if (buf) free(buf, M_CTL); return; } /* * If the IO_CONT flag is set, we need to call the supplied * function to continue processing the I/O, instead of completing * the I/O just yet. */ if (io->io_hdr.flags & CTL_FLAG_IO_CONT) { io->scsiio.io_cont(io); return; } ctl_datamove(io); } /* * SCSI release command. */ int ctl_scsi_release(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); uint32_t residx; CTL_DEBUG_PRINT(("ctl_scsi_release\n")); residx = ctl_get_initindex(&ctsio->io_hdr.nexus); /* * XXX KDM right now, we only support LUN reservation. We don't * support 3rd party reservations, or extent reservations, which * might actually need the parameter list. If we've gotten this * far, we've got a LUN reservation. Anything else got kicked out * above. So, according to SPC, ignore the length. */ mtx_lock(&lun->lun_lock); /* * According to SPC, it is not an error for an intiator to attempt * to release a reservation on a LUN that isn't reserved, or that * is reserved by another initiator. The reservation can only be * released, though, by the initiator who made it or by one of * several reset type events. */ if ((lun->flags & CTL_LUN_RESERVED) && (lun->res_idx == residx)) lun->flags &= ~CTL_LUN_RESERVED; mtx_unlock(&lun->lun_lock); ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_scsi_reserve(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); uint32_t residx; CTL_DEBUG_PRINT(("ctl_reserve\n")); residx = ctl_get_initindex(&ctsio->io_hdr.nexus); /* * XXX KDM right now, we only support LUN reservation. We don't * support 3rd party reservations, or extent reservations, which * might actually need the parameter list. If we've gotten this * far, we've got a LUN reservation. Anything else got kicked out * above. So, according to SPC, ignore the length. */ mtx_lock(&lun->lun_lock); if ((lun->flags & CTL_LUN_RESERVED) && (lun->res_idx != residx)) { ctl_set_reservation_conflict(ctsio); goto bailout; } /* SPC-3 exceptions to SPC-2 RESERVE and RELEASE behavior. */ if (lun->flags & CTL_LUN_PR_RESERVED) { ctl_set_success(ctsio); goto bailout; } lun->flags |= CTL_LUN_RESERVED; lun->res_idx = residx; ctl_set_success(ctsio); bailout: mtx_unlock(&lun->lun_lock); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_start_stop(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_start_stop_unit *cdb; int retval; CTL_DEBUG_PRINT(("ctl_start_stop\n")); cdb = (struct scsi_start_stop_unit *)ctsio->cdb; if ((cdb->how & SSS_PC_MASK) == 0) { if ((lun->flags & CTL_LUN_PR_RESERVED) && (cdb->how & SSS_START) == 0) { uint32_t residx; residx = ctl_get_initindex(&ctsio->io_hdr.nexus); if (ctl_get_prkey(lun, residx) == 0 || (lun->pr_res_idx != residx && lun->pr_res_type < 4)) { ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } } if ((cdb->how & SSS_LOEJ) && (lun->flags & CTL_LUN_REMOVABLE) == 0) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 4, /*bit_valid*/ 1, /*bit*/ 1); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if ((cdb->how & SSS_START) == 0 && (cdb->how & SSS_LOEJ) && lun->prevent_count > 0) { /* "Medium removal prevented" */ ctl_set_sense(ctsio, /*current_error*/ 1, /*sense_key*/(lun->flags & CTL_LUN_NO_MEDIA) ? SSD_KEY_NOT_READY : SSD_KEY_ILLEGAL_REQUEST, /*asc*/ 0x53, /*ascq*/ 0x02, SSD_ELEM_NONE); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } } retval = lun->backend->config_write((union ctl_io *)ctsio); return (retval); } int ctl_prevent_allow(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_prevent *cdb; int retval; uint32_t initidx; CTL_DEBUG_PRINT(("ctl_prevent_allow\n")); cdb = (struct scsi_prevent *)ctsio->cdb; if ((lun->flags & CTL_LUN_REMOVABLE) == 0 || lun->prevent == NULL) { ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } initidx = ctl_get_initindex(&ctsio->io_hdr.nexus); mtx_lock(&lun->lun_lock); if ((cdb->how & PR_PREVENT) && ctl_is_set(lun->prevent, initidx) == 0) { ctl_set_mask(lun->prevent, initidx); lun->prevent_count++; } else if ((cdb->how & PR_PREVENT) == 0 && ctl_is_set(lun->prevent, initidx)) { ctl_clear_mask(lun->prevent, initidx); lun->prevent_count--; } mtx_unlock(&lun->lun_lock); retval = lun->backend->config_write((union ctl_io *)ctsio); return (retval); } /* * We support the SYNCHRONIZE CACHE command (10 and 16 byte versions), but * we don't really do anything with the LBA and length fields if the user * passes them in. Instead we'll just flush out the cache for the entire * LUN. */ int ctl_sync_cache(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); struct ctl_lba_len_flags *lbalen; uint64_t starting_lba; uint32_t block_count; int retval; uint8_t byte2; CTL_DEBUG_PRINT(("ctl_sync_cache\n")); retval = 0; switch (ctsio->cdb[0]) { case SYNCHRONIZE_CACHE: { struct scsi_sync_cache *cdb; cdb = (struct scsi_sync_cache *)ctsio->cdb; starting_lba = scsi_4btoul(cdb->begin_lba); block_count = scsi_2btoul(cdb->lb_count); byte2 = cdb->byte2; break; } case SYNCHRONIZE_CACHE_16: { struct scsi_sync_cache_16 *cdb; cdb = (struct scsi_sync_cache_16 *)ctsio->cdb; starting_lba = scsi_8btou64(cdb->begin_lba); block_count = scsi_4btoul(cdb->lb_count); byte2 = cdb->byte2; break; } default: ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); goto bailout; break; /* NOTREACHED */ } /* * We check the LBA and length, but don't do anything with them. * A SYNCHRONIZE CACHE will cause the entire cache for this lun to * get flushed. This check will just help satisfy anyone who wants * to see an error for an out of range LBA. */ if ((starting_lba + block_count) > (lun->be_lun->maxlba + 1)) { ctl_set_lba_out_of_range(ctsio, MAX(starting_lba, lun->be_lun->maxlba + 1)); ctl_done((union ctl_io *)ctsio); goto bailout; } lbalen = (struct ctl_lba_len_flags *)&ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->lba = starting_lba; lbalen->len = block_count; lbalen->flags = byte2; retval = lun->backend->config_write((union ctl_io *)ctsio); bailout: return (retval); } int ctl_format(struct ctl_scsiio *ctsio) { struct scsi_format *cdb; int length, defect_list_len; CTL_DEBUG_PRINT(("ctl_format\n")); cdb = (struct scsi_format *)ctsio->cdb; length = 0; if (cdb->byte2 & SF_FMTDATA) { if (cdb->byte2 & SF_LONGLIST) length = sizeof(struct scsi_format_header_long); else length = sizeof(struct scsi_format_header_short); } if (((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) && (length > 0)) { ctsio->kern_data_ptr = malloc(length, M_CTL, M_WAITOK); ctsio->kern_data_len = length; ctsio->kern_total_len = length; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } defect_list_len = 0; if (cdb->byte2 & SF_FMTDATA) { if (cdb->byte2 & SF_LONGLIST) { struct scsi_format_header_long *header; header = (struct scsi_format_header_long *) ctsio->kern_data_ptr; defect_list_len = scsi_4btoul(header->defect_list_len); if (defect_list_len != 0) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); goto bailout; } } else { struct scsi_format_header_short *header; header = (struct scsi_format_header_short *) ctsio->kern_data_ptr; defect_list_len = scsi_2btoul(header->defect_list_len); if (defect_list_len != 0) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); goto bailout; } } } ctl_set_success(ctsio); bailout: if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) { free(ctsio->kern_data_ptr, M_CTL); ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED; } ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_read_buffer(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); uint64_t buffer_offset; uint32_t len; uint8_t byte2; static uint8_t descr[4]; static uint8_t echo_descr[4] = { 0 }; CTL_DEBUG_PRINT(("ctl_read_buffer\n")); switch (ctsio->cdb[0]) { case READ_BUFFER: { struct scsi_read_buffer *cdb; cdb = (struct scsi_read_buffer *)ctsio->cdb; buffer_offset = scsi_3btoul(cdb->offset); len = scsi_3btoul(cdb->length); byte2 = cdb->byte2; break; } case READ_BUFFER_16: { struct scsi_read_buffer_16 *cdb; cdb = (struct scsi_read_buffer_16 *)ctsio->cdb; buffer_offset = scsi_8btou64(cdb->offset); len = scsi_4btoul(cdb->length); byte2 = cdb->byte2; break; } default: /* This shouldn't happen. */ ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if (buffer_offset > CTL_WRITE_BUFFER_SIZE || buffer_offset + len > CTL_WRITE_BUFFER_SIZE) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 6, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if ((byte2 & RWB_MODE) == RWB_MODE_DESCR) { descr[0] = 0; scsi_ulto3b(CTL_WRITE_BUFFER_SIZE, &descr[1]); ctsio->kern_data_ptr = descr; len = min(len, sizeof(descr)); } else if ((byte2 & RWB_MODE) == RWB_MODE_ECHO_DESCR) { ctsio->kern_data_ptr = echo_descr; len = min(len, sizeof(echo_descr)); } else { if (lun->write_buffer == NULL) { lun->write_buffer = malloc(CTL_WRITE_BUFFER_SIZE, M_CTL, M_WAITOK); } ctsio->kern_data_ptr = lun->write_buffer + buffer_offset; } ctsio->kern_data_len = len; ctsio->kern_total_len = len; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctl_set_success(ctsio); ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_write_buffer(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_write_buffer *cdb; int buffer_offset, len; CTL_DEBUG_PRINT(("ctl_write_buffer\n")); cdb = (struct scsi_write_buffer *)ctsio->cdb; len = scsi_3btoul(cdb->length); buffer_offset = scsi_3btoul(cdb->offset); if (buffer_offset + len > CTL_WRITE_BUFFER_SIZE) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 6, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * If we've got a kernel request that hasn't been malloced yet, * malloc it and tell the caller the data buffer is here. */ if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { if (lun->write_buffer == NULL) { lun->write_buffer = malloc(CTL_WRITE_BUFFER_SIZE, M_CTL, M_WAITOK); } ctsio->kern_data_ptr = lun->write_buffer + buffer_offset; ctsio->kern_data_len = len; ctsio->kern_total_len = len; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_write_same(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); struct ctl_lba_len_flags *lbalen; uint64_t lba; uint32_t num_blocks; int len, retval; uint8_t byte2; CTL_DEBUG_PRINT(("ctl_write_same\n")); switch (ctsio->cdb[0]) { case WRITE_SAME_10: { struct scsi_write_same_10 *cdb; cdb = (struct scsi_write_same_10 *)ctsio->cdb; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_2btoul(cdb->length); byte2 = cdb->byte2; break; } case WRITE_SAME_16: { struct scsi_write_same_16 *cdb; cdb = (struct scsi_write_same_16 *)ctsio->cdb; lba = scsi_8btou64(cdb->addr); num_blocks = scsi_4btoul(cdb->length); byte2 = cdb->byte2; break; } default: /* * We got a command we don't support. This shouldn't * happen, commands should be filtered out above us. */ ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); break; /* NOTREACHED */ } /* ANCHOR flag can be used only together with UNMAP */ if ((byte2 & SWS_UNMAP) == 0 && (byte2 & SWS_ANCHOR) != 0) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 1, /*bit_valid*/ 1, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * The first check is to make sure we're in bounds, the second * check is to catch wrap-around problems. If the lba + num blocks * is less than the lba, then we've wrapped around and the block * range is invalid anyway. */ if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) || ((lba + num_blocks) < lba)) { ctl_set_lba_out_of_range(ctsio, MAX(lba, lun->be_lun->maxlba + 1)); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* Zero number of blocks means "to the last logical block" */ if (num_blocks == 0) { if ((lun->be_lun->maxlba + 1) - lba > UINT32_MAX) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 0, /*command*/ 1, /*field*/ 0, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } num_blocks = (lun->be_lun->maxlba + 1) - lba; } len = lun->be_lun->blocksize; /* * If we've got a kernel request that hasn't been malloced yet, * malloc it and tell the caller the data buffer is here. */ if ((byte2 & SWS_NDOB) == 0 && (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { ctsio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK); ctsio->kern_data_len = len; ctsio->kern_total_len = len; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } lbalen = (struct ctl_lba_len_flags *)&ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->lba = lba; lbalen->len = num_blocks; lbalen->flags = byte2; retval = lun->backend->config_write((union ctl_io *)ctsio); return (retval); } int ctl_unmap(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_unmap *cdb; struct ctl_ptr_len_flags *ptrlen; struct scsi_unmap_header *hdr; struct scsi_unmap_desc *buf, *end, *endnz, *range; uint64_t lba; uint32_t num_blocks; int len, retval; uint8_t byte2; CTL_DEBUG_PRINT(("ctl_unmap\n")); cdb = (struct scsi_unmap *)ctsio->cdb; len = scsi_2btoul(cdb->length); byte2 = cdb->byte2; /* * If we've got a kernel request that hasn't been malloced yet, * malloc it and tell the caller the data buffer is here. */ if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { ctsio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK); ctsio->kern_data_len = len; ctsio->kern_total_len = len; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } len = ctsio->kern_total_len - ctsio->kern_data_resid; hdr = (struct scsi_unmap_header *)ctsio->kern_data_ptr; if (len < sizeof (*hdr) || len < (scsi_2btoul(hdr->length) + sizeof(hdr->length)) || len < (scsi_2btoul(hdr->desc_length) + sizeof (*hdr)) || scsi_2btoul(hdr->desc_length) % sizeof(*buf) != 0) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 0, /*command*/ 0, /*field*/ 0, /*bit_valid*/ 0, /*bit*/ 0); goto done; } len = scsi_2btoul(hdr->desc_length); buf = (struct scsi_unmap_desc *)(hdr + 1); end = buf + len / sizeof(*buf); endnz = buf; for (range = buf; range < end; range++) { lba = scsi_8btou64(range->lba); num_blocks = scsi_4btoul(range->length); if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) || ((lba + num_blocks) < lba)) { ctl_set_lba_out_of_range(ctsio, MAX(lba, lun->be_lun->maxlba + 1)); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if (num_blocks != 0) endnz = range + 1; } /* * Block backend can not handle zero last range. * Filter it out and return if there is nothing left. */ len = (uint8_t *)endnz - (uint8_t *)buf; if (len == 0) { ctl_set_success(ctsio); goto done; } mtx_lock(&lun->lun_lock); ptrlen = (struct ctl_ptr_len_flags *) &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; ptrlen->ptr = (void *)buf; ptrlen->len = len; ptrlen->flags = byte2; ctl_try_unblock_others(lun, (union ctl_io *)ctsio, FALSE); mtx_unlock(&lun->lun_lock); retval = lun->backend->config_write((union ctl_io *)ctsio); return (retval); done: if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) { free(ctsio->kern_data_ptr, M_CTL); ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED; } ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_default_page_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, uint8_t *page_ptr) { struct ctl_lun *lun = CTL_LUN(ctsio); uint8_t *current_cp; int set_ua; uint32_t initidx; initidx = ctl_get_initindex(&ctsio->io_hdr.nexus); set_ua = 0; current_cp = (page_index->page_data + (page_index->page_len * CTL_PAGE_CURRENT)); mtx_lock(&lun->lun_lock); if (memcmp(current_cp, page_ptr, page_index->page_len)) { memcpy(current_cp, page_ptr, page_index->page_len); set_ua = 1; } if (set_ua != 0) ctl_est_ua_all(lun, initidx, CTL_UA_MODE_CHANGE); mtx_unlock(&lun->lun_lock); if (set_ua) { ctl_isc_announce_mode(lun, ctl_get_initindex(&ctsio->io_hdr.nexus), page_index->page_code, page_index->subpage); } return (CTL_RETVAL_COMPLETE); } static void ctl_ie_timer(void *arg) { struct ctl_lun *lun = arg; uint64_t t; if (lun->ie_asc == 0) return; if (lun->MODE_IE.mrie == SIEP_MRIE_UA) ctl_est_ua_all(lun, -1, CTL_UA_IE); else lun->ie_reported = 0; if (lun->ie_reportcnt < scsi_4btoul(lun->MODE_IE.report_count)) { lun->ie_reportcnt++; t = scsi_4btoul(lun->MODE_IE.interval_timer); if (t == 0 || t == UINT32_MAX) t = 3000; /* 5 min */ callout_schedule(&lun->ie_callout, t * hz / 10); } } int ctl_ie_page_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, uint8_t *page_ptr) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_info_exceptions_page *pg; uint64_t t; (void)ctl_default_page_handler(ctsio, page_index, page_ptr); pg = (struct scsi_info_exceptions_page *)page_ptr; mtx_lock(&lun->lun_lock); if (pg->info_flags & SIEP_FLAGS_TEST) { lun->ie_asc = 0x5d; lun->ie_ascq = 0xff; if (pg->mrie == SIEP_MRIE_UA) { ctl_est_ua_all(lun, -1, CTL_UA_IE); lun->ie_reported = 1; } else { ctl_clr_ua_all(lun, -1, CTL_UA_IE); lun->ie_reported = -1; } lun->ie_reportcnt = 1; if (lun->ie_reportcnt < scsi_4btoul(pg->report_count)) { lun->ie_reportcnt++; t = scsi_4btoul(pg->interval_timer); if (t == 0 || t == UINT32_MAX) t = 3000; /* 5 min */ callout_reset(&lun->ie_callout, t * hz / 10, ctl_ie_timer, lun); } } else { lun->ie_asc = 0; lun->ie_ascq = 0; lun->ie_reported = 1; ctl_clr_ua_all(lun, -1, CTL_UA_IE); lun->ie_reportcnt = UINT32_MAX; callout_stop(&lun->ie_callout); } mtx_unlock(&lun->lun_lock); return (CTL_RETVAL_COMPLETE); } static int ctl_do_mode_select(union ctl_io *io) { struct ctl_lun *lun = CTL_LUN(io); struct scsi_mode_page_header *page_header; struct ctl_page_index *page_index; struct ctl_scsiio *ctsio; int page_len, page_len_offset, page_len_size; union ctl_modepage_info *modepage_info; uint16_t *len_left, *len_used; int retval, i; ctsio = &io->scsiio; page_index = NULL; page_len = 0; modepage_info = (union ctl_modepage_info *) ctsio->io_hdr.ctl_private[CTL_PRIV_MODEPAGE].bytes; len_left = &modepage_info->header.len_left; len_used = &modepage_info->header.len_used; do_next_page: page_header = (struct scsi_mode_page_header *) (ctsio->kern_data_ptr + *len_used); if (*len_left == 0) { free(ctsio->kern_data_ptr, M_CTL); ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } else if (*len_left < sizeof(struct scsi_mode_page_header)) { free(ctsio->kern_data_ptr, M_CTL); ctl_set_param_len_error(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } else if ((page_header->page_code & SMPH_SPF) && (*len_left < sizeof(struct scsi_mode_page_header_sp))) { free(ctsio->kern_data_ptr, M_CTL); ctl_set_param_len_error(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * XXX KDM should we do something with the block descriptor? */ for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { page_index = &lun->mode_pages.index[i]; if (lun->be_lun->lun_type == T_DIRECT && (page_index->page_flags & CTL_PAGE_FLAG_DIRECT) == 0) continue; if (lun->be_lun->lun_type == T_PROCESSOR && (page_index->page_flags & CTL_PAGE_FLAG_PROC) == 0) continue; if (lun->be_lun->lun_type == T_CDROM && (page_index->page_flags & CTL_PAGE_FLAG_CDROM) == 0) continue; if ((page_index->page_code & SMPH_PC_MASK) != (page_header->page_code & SMPH_PC_MASK)) continue; /* * If neither page has a subpage code, then we've got a * match. */ if (((page_index->page_code & SMPH_SPF) == 0) && ((page_header->page_code & SMPH_SPF) == 0)) { page_len = page_header->page_length; break; } /* * If both pages have subpages, then the subpage numbers * have to match. */ if ((page_index->page_code & SMPH_SPF) && (page_header->page_code & SMPH_SPF)) { struct scsi_mode_page_header_sp *sph; sph = (struct scsi_mode_page_header_sp *)page_header; if (page_index->subpage == sph->subpage) { page_len = scsi_2btoul(sph->page_length); break; } } } /* * If we couldn't find the page, or if we don't have a mode select * handler for it, send back an error to the user. */ if ((i >= CTL_NUM_MODE_PAGES) || (page_index->select_handler == NULL)) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ *len_used, /*bit_valid*/ 0, /*bit*/ 0); free(ctsio->kern_data_ptr, M_CTL); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if (page_index->page_code & SMPH_SPF) { page_len_offset = 2; page_len_size = 2; } else { page_len_size = 1; page_len_offset = 1; } /* * If the length the initiator gives us isn't the one we specify in * the mode page header, or if they didn't specify enough data in * the CDB to avoid truncating this page, kick out the request. */ if (page_len != page_index->page_len - page_len_offset - page_len_size) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ *len_used + page_len_offset, /*bit_valid*/ 0, /*bit*/ 0); free(ctsio->kern_data_ptr, M_CTL); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if (*len_left < page_index->page_len) { free(ctsio->kern_data_ptr, M_CTL); ctl_set_param_len_error(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Run through the mode page, checking to make sure that the bits * the user changed are actually legal for him to change. */ for (i = 0; i < page_index->page_len; i++) { uint8_t *user_byte, *change_mask, *current_byte; int bad_bit; int j; user_byte = (uint8_t *)page_header + i; change_mask = page_index->page_data + (page_index->page_len * CTL_PAGE_CHANGEABLE) + i; current_byte = page_index->page_data + (page_index->page_len * CTL_PAGE_CURRENT) + i; /* * Check to see whether the user set any bits in this byte * that he is not allowed to set. */ if ((*user_byte & ~(*change_mask)) == (*current_byte & ~(*change_mask))) continue; /* * Go through bit by bit to determine which one is illegal. */ bad_bit = 0; for (j = 7; j >= 0; j--) { if ((((1 << i) & ~(*change_mask)) & *user_byte) != (((1 << i) & ~(*change_mask)) & *current_byte)) { bad_bit = i; break; } } ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ *len_used + i, /*bit_valid*/ 1, /*bit*/ bad_bit); free(ctsio->kern_data_ptr, M_CTL); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Decrement these before we call the page handler, since we may * end up getting called back one way or another before the handler * returns to this context. */ *len_left -= page_index->page_len; *len_used += page_index->page_len; retval = page_index->select_handler(ctsio, page_index, (uint8_t *)page_header); /* * If the page handler returns CTL_RETVAL_QUEUED, then we need to * wait until this queued command completes to finish processing * the mode page. If it returns anything other than * CTL_RETVAL_COMPLETE (e.g. CTL_RETVAL_ERROR), then it should have * already set the sense information, freed the data pointer, and * completed the io for us. */ if (retval != CTL_RETVAL_COMPLETE) goto bailout_no_done; /* * If the initiator sent us more than one page, parse the next one. */ if (*len_left > 0) goto do_next_page; ctl_set_success(ctsio); free(ctsio->kern_data_ptr, M_CTL); ctl_done((union ctl_io *)ctsio); bailout_no_done: return (CTL_RETVAL_COMPLETE); } int ctl_mode_select(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); union ctl_modepage_info *modepage_info; int bd_len, i, header_size, param_len, rtd; uint32_t initidx; initidx = ctl_get_initindex(&ctsio->io_hdr.nexus); switch (ctsio->cdb[0]) { case MODE_SELECT_6: { struct scsi_mode_select_6 *cdb; cdb = (struct scsi_mode_select_6 *)ctsio->cdb; rtd = (cdb->byte2 & SMS_RTD) ? 1 : 0; param_len = cdb->length; header_size = sizeof(struct scsi_mode_header_6); break; } case MODE_SELECT_10: { struct scsi_mode_select_10 *cdb; cdb = (struct scsi_mode_select_10 *)ctsio->cdb; rtd = (cdb->byte2 & SMS_RTD) ? 1 : 0; param_len = scsi_2btoul(cdb->length); header_size = sizeof(struct scsi_mode_header_10); break; } default: ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if (rtd) { if (param_len != 0) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 0, /*command*/ 1, /*field*/ 0, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* Revert to defaults. */ ctl_init_page_index(lun); mtx_lock(&lun->lun_lock); ctl_est_ua_all(lun, initidx, CTL_UA_MODE_CHANGE); mtx_unlock(&lun->lun_lock); for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { ctl_isc_announce_mode(lun, -1, lun->mode_pages.index[i].page_code & SMPH_PC_MASK, lun->mode_pages.index[i].subpage); } ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * From SPC-3: * "A parameter list length of zero indicates that the Data-Out Buffer * shall be empty. This condition shall not be considered as an error." */ if (param_len == 0) { ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Since we'll hit this the first time through, prior to * allocation, we don't need to free a data buffer here. */ if (param_len < header_size) { ctl_set_param_len_error(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Allocate the data buffer and grab the user's data. In theory, * we shouldn't have to sanity check the parameter list length here * because the maximum size is 64K. We should be able to malloc * that much without too many problems. */ if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { ctsio->kern_data_ptr = malloc(param_len, M_CTL, M_WAITOK); ctsio->kern_data_len = param_len; ctsio->kern_total_len = param_len; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } switch (ctsio->cdb[0]) { case MODE_SELECT_6: { struct scsi_mode_header_6 *mh6; mh6 = (struct scsi_mode_header_6 *)ctsio->kern_data_ptr; bd_len = mh6->blk_desc_len; break; } case MODE_SELECT_10: { struct scsi_mode_header_10 *mh10; mh10 = (struct scsi_mode_header_10 *)ctsio->kern_data_ptr; bd_len = scsi_2btoul(mh10->blk_desc_len); break; } default: panic("%s: Invalid CDB type %#x", __func__, ctsio->cdb[0]); } if (param_len < (header_size + bd_len)) { free(ctsio->kern_data_ptr, M_CTL); ctl_set_param_len_error(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Set the IO_CONT flag, so that if this I/O gets passed to * ctl_config_write_done(), it'll get passed back to * ctl_do_mode_select() for further processing, or completion if * we're all done. */ ctsio->io_hdr.flags |= CTL_FLAG_IO_CONT; ctsio->io_cont = ctl_do_mode_select; modepage_info = (union ctl_modepage_info *) ctsio->io_hdr.ctl_private[CTL_PRIV_MODEPAGE].bytes; memset(modepage_info, 0, sizeof(*modepage_info)); modepage_info->header.len_left = param_len - header_size - bd_len; modepage_info->header.len_used = header_size + bd_len; return (ctl_do_mode_select((union ctl_io *)ctsio)); } int ctl_mode_sense(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); - int pc, page_code, dbd, subpage; - int alloc_len, page_len, header_len, total_len; - struct scsi_mode_block_descr *block_desc; + int pc, page_code, llba, subpage; + int alloc_len, page_len, header_len, bd_len, total_len; + void *block_desc; struct ctl_page_index *page_index; - dbd = 0; - block_desc = NULL; + llba = 0; CTL_DEBUG_PRINT(("ctl_mode_sense\n")); switch (ctsio->cdb[0]) { case MODE_SENSE_6: { struct scsi_mode_sense_6 *cdb; cdb = (struct scsi_mode_sense_6 *)ctsio->cdb; header_len = sizeof(struct scsi_mode_hdr_6); if (cdb->byte2 & SMS_DBD) - dbd = 1; + bd_len = 0; else - header_len += sizeof(struct scsi_mode_block_descr); + bd_len = sizeof(struct scsi_mode_block_descr); + header_len += bd_len; pc = (cdb->page & SMS_PAGE_CTRL_MASK) >> 6; page_code = cdb->page & SMS_PAGE_CODE; subpage = cdb->subpage; alloc_len = cdb->length; break; } case MODE_SENSE_10: { struct scsi_mode_sense_10 *cdb; cdb = (struct scsi_mode_sense_10 *)ctsio->cdb; header_len = sizeof(struct scsi_mode_hdr_10); + if (cdb->byte2 & SMS_DBD) { + bd_len = 0; + } else if (lun->be_lun->lun_type == T_DIRECT) { + if (cdb->byte2 & SMS10_LLBAA) { + llba = 1; + bd_len = sizeof(struct scsi_mode_block_descr_dlong); + } else + bd_len = sizeof(struct scsi_mode_block_descr_dshort); + } else + bd_len = sizeof(struct scsi_mode_block_descr); + header_len += bd_len; - if (cdb->byte2 & SMS_DBD) - dbd = 1; - else - header_len += sizeof(struct scsi_mode_block_descr); pc = (cdb->page & SMS_PAGE_CTRL_MASK) >> 6; page_code = cdb->page & SMS_PAGE_CODE; subpage = cdb->subpage; alloc_len = scsi_2btoul(cdb->length); break; } default: ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); break; /* NOTREACHED */ } /* * We have to make a first pass through to calculate the size of * the pages that match the user's query. Then we allocate enough * memory to hold it, and actually copy the data into the buffer. */ switch (page_code) { case SMS_ALL_PAGES_PAGE: { u_int i; page_len = 0; /* * At the moment, values other than 0 and 0xff here are * reserved according to SPC-3. */ if ((subpage != SMS_SUBPAGE_PAGE_0) && (subpage != SMS_SUBPAGE_ALL)) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 3, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { page_index = &lun->mode_pages.index[i]; /* Make sure the page is supported for this dev type */ if (lun->be_lun->lun_type == T_DIRECT && (page_index->page_flags & CTL_PAGE_FLAG_DIRECT) == 0) continue; if (lun->be_lun->lun_type == T_PROCESSOR && (page_index->page_flags & CTL_PAGE_FLAG_PROC) == 0) continue; if (lun->be_lun->lun_type == T_CDROM && (page_index->page_flags & CTL_PAGE_FLAG_CDROM) == 0) continue; /* * We don't use this subpage if the user didn't * request all subpages. */ if ((page_index->subpage != 0) && (subpage == SMS_SUBPAGE_PAGE_0)) continue; page_len += page_index->page_len; } break; } default: { u_int i; page_len = 0; for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { page_index = &lun->mode_pages.index[i]; /* Make sure the page is supported for this dev type */ if (lun->be_lun->lun_type == T_DIRECT && (page_index->page_flags & CTL_PAGE_FLAG_DIRECT) == 0) continue; if (lun->be_lun->lun_type == T_PROCESSOR && (page_index->page_flags & CTL_PAGE_FLAG_PROC) == 0) continue; if (lun->be_lun->lun_type == T_CDROM && (page_index->page_flags & CTL_PAGE_FLAG_CDROM) == 0) continue; /* Look for the right page code */ if ((page_index->page_code & SMPH_PC_MASK) != page_code) continue; /* Look for the right subpage or the subpage wildcard*/ if ((page_index->subpage != subpage) && (subpage != SMS_SUBPAGE_ALL)) continue; page_len += page_index->page_len; } if (page_len == 0) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 5); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } break; } } total_len = header_len + page_len; ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; ctsio->kern_data_len = min(total_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; switch (ctsio->cdb[0]) { case MODE_SENSE_6: { struct scsi_mode_hdr_6 *header; header = (struct scsi_mode_hdr_6 *)ctsio->kern_data_ptr; header->datalen = MIN(total_len - 1, 254); if (lun->be_lun->lun_type == T_DIRECT) { header->dev_specific = 0x10; /* DPOFUA */ if ((lun->be_lun->flags & CTL_LUN_FLAG_READONLY) || (lun->MODE_CTRL.eca_and_aen & SCP_SWP) != 0) header->dev_specific |= 0x80; /* WP */ } - if (dbd) - header->block_descr_len = 0; - else - header->block_descr_len = - sizeof(struct scsi_mode_block_descr); - block_desc = (struct scsi_mode_block_descr *)&header[1]; + header->block_descr_len = bd_len; + block_desc = &header[1]; break; } case MODE_SENSE_10: { struct scsi_mode_hdr_10 *header; int datalen; header = (struct scsi_mode_hdr_10 *)ctsio->kern_data_ptr; datalen = MIN(total_len - 2, 65533); scsi_ulto2b(datalen, header->datalen); if (lun->be_lun->lun_type == T_DIRECT) { header->dev_specific = 0x10; /* DPOFUA */ if ((lun->be_lun->flags & CTL_LUN_FLAG_READONLY) || (lun->MODE_CTRL.eca_and_aen & SCP_SWP) != 0) header->dev_specific |= 0x80; /* WP */ } - if (dbd) - scsi_ulto2b(0, header->block_descr_len); - else - scsi_ulto2b(sizeof(struct scsi_mode_block_descr), - header->block_descr_len); - block_desc = (struct scsi_mode_block_descr *)&header[1]; + if (llba) + header->flags |= SMH_LONGLBA; + scsi_ulto2b(bd_len, header->block_descr_len); + block_desc = &header[1]; break; } default: panic("%s: Invalid CDB type %#x", __func__, ctsio->cdb[0]); } /* * If we've got a disk, use its blocksize in the block * descriptor. Otherwise, just set it to 0. */ - if (dbd == 0) { - if (lun->be_lun->lun_type == T_DIRECT) - scsi_ulto3b(lun->be_lun->blocksize, - block_desc->block_len); - else - scsi_ulto3b(0, block_desc->block_len); + if (bd_len > 0) { + if (lun->be_lun->lun_type == T_DIRECT) { + if (llba) { + struct scsi_mode_block_descr_dlong *bd = block_desc; + if (lun->be_lun->maxlba != 0) + scsi_u64to8b(lun->be_lun->maxlba + 1, + bd->num_blocks); + scsi_ulto4b(lun->be_lun->blocksize, + bd->block_len); + } else { + struct scsi_mode_block_descr_dshort *bd = block_desc; + if (lun->be_lun->maxlba != 0) + scsi_ulto4b(MIN(lun->be_lun->maxlba+1, + UINT32_MAX), bd->num_blocks); + scsi_ulto3b(lun->be_lun->blocksize, + bd->block_len); + } + } else { + struct scsi_mode_block_descr *bd = block_desc; + scsi_ulto3b(0, bd->block_len); + } } switch (page_code) { case SMS_ALL_PAGES_PAGE: { int i, data_used; data_used = header_len; for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { struct ctl_page_index *page_index; page_index = &lun->mode_pages.index[i]; if (lun->be_lun->lun_type == T_DIRECT && (page_index->page_flags & CTL_PAGE_FLAG_DIRECT) == 0) continue; if (lun->be_lun->lun_type == T_PROCESSOR && (page_index->page_flags & CTL_PAGE_FLAG_PROC) == 0) continue; if (lun->be_lun->lun_type == T_CDROM && (page_index->page_flags & CTL_PAGE_FLAG_CDROM) == 0) continue; /* * We don't use this subpage if the user didn't * request all subpages. We already checked (above) * to make sure the user only specified a subpage * of 0 or 0xff in the SMS_ALL_PAGES_PAGE case. */ if ((page_index->subpage != 0) && (subpage == SMS_SUBPAGE_PAGE_0)) continue; /* * Call the handler, if it exists, to update the * page to the latest values. */ if (page_index->sense_handler != NULL) page_index->sense_handler(ctsio, page_index,pc); memcpy(ctsio->kern_data_ptr + data_used, page_index->page_data + (page_index->page_len * pc), page_index->page_len); data_used += page_index->page_len; } break; } default: { int i, data_used; data_used = header_len; for (i = 0; i < CTL_NUM_MODE_PAGES; i++) { struct ctl_page_index *page_index; page_index = &lun->mode_pages.index[i]; /* Look for the right page code */ if ((page_index->page_code & SMPH_PC_MASK) != page_code) continue; /* Look for the right subpage or the subpage wildcard*/ if ((page_index->subpage != subpage) && (subpage != SMS_SUBPAGE_ALL)) continue; /* Make sure the page is supported for this dev type */ if (lun->be_lun->lun_type == T_DIRECT && (page_index->page_flags & CTL_PAGE_FLAG_DIRECT) == 0) continue; if (lun->be_lun->lun_type == T_PROCESSOR && (page_index->page_flags & CTL_PAGE_FLAG_PROC) == 0) continue; if (lun->be_lun->lun_type == T_CDROM && (page_index->page_flags & CTL_PAGE_FLAG_CDROM) == 0) continue; /* * Call the handler, if it exists, to update the * page to the latest values. */ if (page_index->sense_handler != NULL) page_index->sense_handler(ctsio, page_index,pc); memcpy(ctsio->kern_data_ptr + data_used, page_index->page_data + (page_index->page_len * pc), page_index->page_len); data_used += page_index->page_len; } break; } } ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int +ctl_temp_log_sense_handler(struct ctl_scsiio *ctsio, + struct ctl_page_index *page_index, + int pc) +{ + struct ctl_lun *lun = CTL_LUN(ctsio); + struct scsi_log_temperature *data; + const char *value; + + data = (struct scsi_log_temperature *)page_index->page_data; + + scsi_ulto2b(SLP_TEMPERATURE, data->hdr.param_code); + data->hdr.param_control = SLP_LBIN; + data->hdr.param_len = sizeof(struct scsi_log_temperature) - + sizeof(struct scsi_log_param_header); + if ((value = dnvlist_get_string(lun->be_lun->options, "temperature", + NULL)) != NULL) + data->temperature = strtol(value, NULL, 0); + else + data->temperature = 0xff; + data++; + + scsi_ulto2b(SLP_REFTEMPERATURE, data->hdr.param_code); + data->hdr.param_control = SLP_LBIN; + data->hdr.param_len = sizeof(struct scsi_log_temperature) - + sizeof(struct scsi_log_param_header); + if ((value = dnvlist_get_string(lun->be_lun->options, "reftemperature", + NULL)) != NULL) + data->temperature = strtol(value, NULL, 0); + else + data->temperature = 0xff; + return (0); +} + +int ctl_lbp_log_sense_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, int pc) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_log_param_header *phdr; uint8_t *data; uint64_t val; data = page_index->page_data; if (lun->backend->lun_attr != NULL && (val = lun->backend->lun_attr(lun->be_lun->be_lun, "blocksavail")) != UINT64_MAX) { phdr = (struct scsi_log_param_header *)data; scsi_ulto2b(0x0001, phdr->param_code); phdr->param_control = SLP_LBIN | SLP_LP; phdr->param_len = 8; data = (uint8_t *)(phdr + 1); scsi_ulto4b(val >> CTL_LBP_EXPONENT, data); data[4] = 0x02; /* per-pool */ data += phdr->param_len; } if (lun->backend->lun_attr != NULL && (val = lun->backend->lun_attr(lun->be_lun->be_lun, "blocksused")) != UINT64_MAX) { phdr = (struct scsi_log_param_header *)data; scsi_ulto2b(0x0002, phdr->param_code); phdr->param_control = SLP_LBIN | SLP_LP; phdr->param_len = 8; data = (uint8_t *)(phdr + 1); scsi_ulto4b(val >> CTL_LBP_EXPONENT, data); data[4] = 0x01; /* per-LUN */ data += phdr->param_len; } if (lun->backend->lun_attr != NULL && (val = lun->backend->lun_attr(lun->be_lun->be_lun, "poolblocksavail")) != UINT64_MAX) { phdr = (struct scsi_log_param_header *)data; scsi_ulto2b(0x00f1, phdr->param_code); phdr->param_control = SLP_LBIN | SLP_LP; phdr->param_len = 8; data = (uint8_t *)(phdr + 1); scsi_ulto4b(val >> CTL_LBP_EXPONENT, data); data[4] = 0x02; /* per-pool */ data += phdr->param_len; } if (lun->backend->lun_attr != NULL && (val = lun->backend->lun_attr(lun->be_lun->be_lun, "poolblocksused")) != UINT64_MAX) { phdr = (struct scsi_log_param_header *)data; scsi_ulto2b(0x00f2, phdr->param_code); phdr->param_control = SLP_LBIN | SLP_LP; phdr->param_len = 8; data = (uint8_t *)(phdr + 1); scsi_ulto4b(val >> CTL_LBP_EXPONENT, data); data[4] = 0x02; /* per-pool */ data += phdr->param_len; } page_index->page_len = data - page_index->page_data; return (0); } int ctl_sap_log_sense_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, int pc) { struct ctl_lun *lun = CTL_LUN(ctsio); struct stat_page *data; struct bintime *t; data = (struct stat_page *)page_index->page_data; scsi_ulto2b(SLP_SAP, data->sap.hdr.param_code); data->sap.hdr.param_control = SLP_LBIN; data->sap.hdr.param_len = sizeof(struct scsi_log_stat_and_perf) - sizeof(struct scsi_log_param_header); scsi_u64to8b(lun->stats.operations[CTL_STATS_READ], data->sap.read_num); scsi_u64to8b(lun->stats.operations[CTL_STATS_WRITE], data->sap.write_num); if (lun->be_lun->blocksize > 0) { scsi_u64to8b(lun->stats.bytes[CTL_STATS_WRITE] / lun->be_lun->blocksize, data->sap.recvieved_lba); scsi_u64to8b(lun->stats.bytes[CTL_STATS_READ] / lun->be_lun->blocksize, data->sap.transmitted_lba); } t = &lun->stats.time[CTL_STATS_READ]; scsi_u64to8b((uint64_t)t->sec * 1000 + t->frac / (UINT64_MAX / 1000), data->sap.read_int); t = &lun->stats.time[CTL_STATS_WRITE]; scsi_u64to8b((uint64_t)t->sec * 1000 + t->frac / (UINT64_MAX / 1000), data->sap.write_int); scsi_u64to8b(0, data->sap.weighted_num); scsi_u64to8b(0, data->sap.weighted_int); scsi_ulto2b(SLP_IT, data->it.hdr.param_code); data->it.hdr.param_control = SLP_LBIN; data->it.hdr.param_len = sizeof(struct scsi_log_idle_time) - sizeof(struct scsi_log_param_header); #ifdef CTL_TIME_IO scsi_u64to8b(lun->idle_time / SBT_1MS, data->it.idle_int); #endif scsi_ulto2b(SLP_TI, data->ti.hdr.param_code); data->it.hdr.param_control = SLP_LBIN; data->ti.hdr.param_len = sizeof(struct scsi_log_time_interval) - sizeof(struct scsi_log_param_header); scsi_ulto4b(3, data->ti.exponent); scsi_ulto4b(1, data->ti.integer); return (0); } int ctl_ie_log_sense_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, int pc) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_log_informational_exceptions *data; + const char *value; data = (struct scsi_log_informational_exceptions *)page_index->page_data; scsi_ulto2b(SLP_IE_GEN, data->hdr.param_code); data->hdr.param_control = SLP_LBIN; data->hdr.param_len = sizeof(struct scsi_log_informational_exceptions) - sizeof(struct scsi_log_param_header); data->ie_asc = lun->ie_asc; data->ie_ascq = lun->ie_ascq; - data->temperature = 0xff; + if ((value = dnvlist_get_string(lun->be_lun->options, "temperature", + NULL)) != NULL) + data->temperature = strtol(value, NULL, 0); + else + data->temperature = 0xff; return (0); } int ctl_log_sense(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); int i, pc, page_code, subpage; int alloc_len, total_len; struct ctl_page_index *page_index; struct scsi_log_sense *cdb; struct scsi_log_header *header; CTL_DEBUG_PRINT(("ctl_log_sense\n")); cdb = (struct scsi_log_sense *)ctsio->cdb; pc = (cdb->page & SLS_PAGE_CTRL_MASK) >> 6; page_code = cdb->page & SLS_PAGE_CODE; subpage = cdb->subpage; alloc_len = scsi_2btoul(cdb->length); page_index = NULL; for (i = 0; i < CTL_NUM_LOG_PAGES; i++) { page_index = &lun->log_pages.index[i]; /* Look for the right page code */ if ((page_index->page_code & SL_PAGE_CODE) != page_code) continue; /* Look for the right subpage or the subpage wildcard*/ if (page_index->subpage != subpage) continue; break; } if (i >= CTL_NUM_LOG_PAGES) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } total_len = sizeof(struct scsi_log_header) + page_index->page_len; ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; ctsio->kern_data_len = min(total_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; header = (struct scsi_log_header *)ctsio->kern_data_ptr; header->page = page_index->page_code; if (page_index->page_code == SLS_LOGICAL_BLOCK_PROVISIONING) header->page |= SL_DS; if (page_index->subpage) { header->page |= SL_SPF; header->subpage = page_index->subpage; } scsi_ulto2b(page_index->page_len, header->datalen); /* * Call the handler, if it exists, to update the * page to the latest values. */ if (page_index->sense_handler != NULL) page_index->sense_handler(ctsio, page_index, pc); memcpy(header + 1, page_index->page_data, page_index->page_len); ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_read_capacity(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_read_capacity *cdb; struct scsi_read_capacity_data *data; uint32_t lba; CTL_DEBUG_PRINT(("ctl_read_capacity\n")); cdb = (struct scsi_read_capacity *)ctsio->cdb; lba = scsi_4btoul(cdb->addr); if (((cdb->pmi & SRC_PMI) == 0) && (lba != 0)) { ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } ctsio->kern_data_ptr = malloc(sizeof(*data), M_CTL, M_WAITOK | M_ZERO); data = (struct scsi_read_capacity_data *)ctsio->kern_data_ptr; ctsio->kern_data_len = sizeof(*data); ctsio->kern_total_len = sizeof(*data); ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; /* * If the maximum LBA is greater than 0xfffffffe, the user must * issue a SERVICE ACTION IN (16) command, with the read capacity * serivce action set. */ if (lun->be_lun->maxlba > 0xfffffffe) scsi_ulto4b(0xffffffff, data->addr); else scsi_ulto4b(lun->be_lun->maxlba, data->addr); /* * XXX KDM this may not be 512 bytes... */ scsi_ulto4b(lun->be_lun->blocksize, data->length); ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_read_capacity_16(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_read_capacity_16 *cdb; struct scsi_read_capacity_data_long *data; uint64_t lba; uint32_t alloc_len; CTL_DEBUG_PRINT(("ctl_read_capacity_16\n")); cdb = (struct scsi_read_capacity_16 *)ctsio->cdb; alloc_len = scsi_4btoul(cdb->alloc_len); lba = scsi_8btou64(cdb->addr); if ((cdb->reladr & SRC16_PMI) && (lba != 0)) { ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } ctsio->kern_data_ptr = malloc(sizeof(*data), M_CTL, M_WAITOK | M_ZERO); data = (struct scsi_read_capacity_data_long *)ctsio->kern_data_ptr; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->kern_data_len = min(sizeof(*data), alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; scsi_u64to8b(lun->be_lun->maxlba, data->addr); /* XXX KDM this may not be 512 bytes... */ scsi_ulto4b(lun->be_lun->blocksize, data->length); data->prot_lbppbe = lun->be_lun->pblockexp & SRC16_LBPPBE; scsi_ulto2b(lun->be_lun->pblockoff & SRC16_LALBA_A, data->lalba_lbp); if (lun->be_lun->flags & CTL_LUN_FLAG_UNMAP) data->lalba_lbp[0] |= SRC16_LBPME | SRC16_LBPRZ; ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_get_lba_status(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_get_lba_status *cdb; struct scsi_get_lba_status_data *data; struct ctl_lba_len_flags *lbalen; uint64_t lba; uint32_t alloc_len, total_len; int retval; CTL_DEBUG_PRINT(("ctl_get_lba_status\n")); cdb = (struct scsi_get_lba_status *)ctsio->cdb; lba = scsi_8btou64(cdb->addr); alloc_len = scsi_4btoul(cdb->alloc_len); if (lba > lun->be_lun->maxlba) { ctl_set_lba_out_of_range(ctsio, lba); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } total_len = sizeof(*data) + sizeof(data->descr[0]); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); data = (struct scsi_get_lba_status_data *)ctsio->kern_data_ptr; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->kern_data_len = min(total_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; /* Fill dummy data in case backend can't tell anything. */ scsi_ulto4b(4 + sizeof(data->descr[0]), data->length); scsi_u64to8b(lba, data->descr[0].addr); scsi_ulto4b(MIN(UINT32_MAX, lun->be_lun->maxlba + 1 - lba), data->descr[0].length); data->descr[0].status = 0; /* Mapped or unknown. */ ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; lbalen = (struct ctl_lba_len_flags *)&ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->lba = lba; lbalen->len = total_len; lbalen->flags = 0; retval = lun->backend->config_read((union ctl_io *)ctsio); return (retval); } int ctl_read_defect(struct ctl_scsiio *ctsio) { struct scsi_read_defect_data_10 *ccb10; struct scsi_read_defect_data_12 *ccb12; struct scsi_read_defect_data_hdr_10 *data10; struct scsi_read_defect_data_hdr_12 *data12; uint32_t alloc_len, data_len; uint8_t format; CTL_DEBUG_PRINT(("ctl_read_defect\n")); if (ctsio->cdb[0] == READ_DEFECT_DATA_10) { ccb10 = (struct scsi_read_defect_data_10 *)&ctsio->cdb; format = ccb10->format; alloc_len = scsi_2btoul(ccb10->alloc_length); data_len = sizeof(*data10); } else { ccb12 = (struct scsi_read_defect_data_12 *)&ctsio->cdb; format = ccb12->format; alloc_len = scsi_4btoul(ccb12->alloc_length); data_len = sizeof(*data12); } if (alloc_len == 0) { ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->kern_data_len = min(data_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; if (ctsio->cdb[0] == READ_DEFECT_DATA_10) { data10 = (struct scsi_read_defect_data_hdr_10 *) ctsio->kern_data_ptr; data10->format = format; scsi_ulto2b(0, data10->length); } else { data12 = (struct scsi_read_defect_data_hdr_12 *) ctsio->kern_data_ptr; data12->format = format; scsi_ulto2b(0, data12->generation); scsi_ulto4b(0, data12->length); } ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_report_tagret_port_groups(struct ctl_scsiio *ctsio) { struct ctl_softc *softc = CTL_SOFTC(ctsio); struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_maintenance_in *cdb; int retval; int alloc_len, ext, total_len = 0, g, pc, pg, ts, os; int num_ha_groups, num_target_ports, shared_group; struct ctl_port *port; struct scsi_target_group_data *rtg_ptr; struct scsi_target_group_data_extended *rtg_ext_ptr; struct scsi_target_port_group_descriptor *tpg_desc; CTL_DEBUG_PRINT(("ctl_report_tagret_port_groups\n")); cdb = (struct scsi_maintenance_in *)ctsio->cdb; retval = CTL_RETVAL_COMPLETE; switch (cdb->byte2 & STG_PDF_MASK) { case STG_PDF_LENGTH: ext = 0; break; case STG_PDF_EXTENDED: ext = 1; break; default: ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 5); ctl_done((union ctl_io *)ctsio); return(retval); } num_target_ports = 0; shared_group = (softc->is_single != 0); mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(port, &softc->port_list, links) { if ((port->status & CTL_PORT_STATUS_ONLINE) == 0) continue; if (ctl_lun_map_to_port(port, lun->lun) == UINT32_MAX) continue; num_target_ports++; if (port->status & CTL_PORT_STATUS_HA_SHARED) shared_group = 1; } mtx_unlock(&softc->ctl_lock); num_ha_groups = (softc->is_single) ? 0 : NUM_HA_SHELVES; if (ext) total_len = sizeof(struct scsi_target_group_data_extended); else total_len = sizeof(struct scsi_target_group_data); total_len += sizeof(struct scsi_target_port_group_descriptor) * (shared_group + num_ha_groups) + sizeof(struct scsi_target_port_descriptor) * num_target_ports; alloc_len = scsi_4btoul(cdb->length); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; ctsio->kern_data_len = min(total_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; if (ext) { rtg_ext_ptr = (struct scsi_target_group_data_extended *) ctsio->kern_data_ptr; scsi_ulto4b(total_len - 4, rtg_ext_ptr->length); rtg_ext_ptr->format_type = 0x10; rtg_ext_ptr->implicit_transition_time = 0; tpg_desc = &rtg_ext_ptr->groups[0]; } else { rtg_ptr = (struct scsi_target_group_data *) ctsio->kern_data_ptr; scsi_ulto4b(total_len - 4, rtg_ptr->length); tpg_desc = &rtg_ptr->groups[0]; } mtx_lock(&softc->ctl_lock); pg = softc->port_min / softc->port_cnt; if (lun->flags & (CTL_LUN_PRIMARY_SC | CTL_LUN_PEER_SC_PRIMARY)) { /* Some shelf is known to be primary. */ if (softc->ha_link == CTL_HA_LINK_OFFLINE) os = TPG_ASYMMETRIC_ACCESS_UNAVAILABLE; else if (softc->ha_link == CTL_HA_LINK_UNKNOWN) os = TPG_ASYMMETRIC_ACCESS_TRANSITIONING; else if (softc->ha_mode == CTL_HA_MODE_ACT_STBY) os = TPG_ASYMMETRIC_ACCESS_STANDBY; else os = TPG_ASYMMETRIC_ACCESS_NONOPTIMIZED; if (lun->flags & CTL_LUN_PRIMARY_SC) { ts = TPG_ASYMMETRIC_ACCESS_OPTIMIZED; } else { ts = os; os = TPG_ASYMMETRIC_ACCESS_OPTIMIZED; } } else { /* No known primary shelf. */ if (softc->ha_link == CTL_HA_LINK_OFFLINE) { ts = TPG_ASYMMETRIC_ACCESS_UNAVAILABLE; os = TPG_ASYMMETRIC_ACCESS_OPTIMIZED; } else if (softc->ha_link == CTL_HA_LINK_UNKNOWN) { ts = TPG_ASYMMETRIC_ACCESS_TRANSITIONING; os = TPG_ASYMMETRIC_ACCESS_OPTIMIZED; } else { ts = os = TPG_ASYMMETRIC_ACCESS_TRANSITIONING; } } if (shared_group) { tpg_desc->pref_state = ts; tpg_desc->support = TPG_AO_SUP | TPG_AN_SUP | TPG_S_SUP | TPG_U_SUP | TPG_T_SUP; scsi_ulto2b(1, tpg_desc->target_port_group); tpg_desc->status = TPG_IMPLICIT; pc = 0; STAILQ_FOREACH(port, &softc->port_list, links) { if ((port->status & CTL_PORT_STATUS_ONLINE) == 0) continue; if (!softc->is_single && (port->status & CTL_PORT_STATUS_HA_SHARED) == 0) continue; if (ctl_lun_map_to_port(port, lun->lun) == UINT32_MAX) continue; scsi_ulto2b(port->targ_port, tpg_desc->descriptors[pc]. relative_target_port_identifier); pc++; } tpg_desc->target_port_count = pc; tpg_desc = (struct scsi_target_port_group_descriptor *) &tpg_desc->descriptors[pc]; } for (g = 0; g < num_ha_groups; g++) { tpg_desc->pref_state = (g == pg) ? ts : os; tpg_desc->support = TPG_AO_SUP | TPG_AN_SUP | TPG_S_SUP | TPG_U_SUP | TPG_T_SUP; scsi_ulto2b(2 + g, tpg_desc->target_port_group); tpg_desc->status = TPG_IMPLICIT; pc = 0; STAILQ_FOREACH(port, &softc->port_list, links) { if (port->targ_port < g * softc->port_cnt || port->targ_port >= (g + 1) * softc->port_cnt) continue; if ((port->status & CTL_PORT_STATUS_ONLINE) == 0) continue; if (port->status & CTL_PORT_STATUS_HA_SHARED) continue; if (ctl_lun_map_to_port(port, lun->lun) == UINT32_MAX) continue; scsi_ulto2b(port->targ_port, tpg_desc->descriptors[pc]. relative_target_port_identifier); pc++; } tpg_desc->target_port_count = pc; tpg_desc = (struct scsi_target_port_group_descriptor *) &tpg_desc->descriptors[pc]; } mtx_unlock(&softc->ctl_lock); ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return(retval); } int ctl_report_supported_opcodes(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_report_supported_opcodes *cdb; const struct ctl_cmd_entry *entry, *sentry; struct scsi_report_supported_opcodes_all *all; struct scsi_report_supported_opcodes_descr *descr; struct scsi_report_supported_opcodes_one *one; int retval; int alloc_len, total_len; int opcode, service_action, i, j, num; CTL_DEBUG_PRINT(("ctl_report_supported_opcodes\n")); cdb = (struct scsi_report_supported_opcodes *)ctsio->cdb; retval = CTL_RETVAL_COMPLETE; opcode = cdb->requested_opcode; service_action = scsi_2btoul(cdb->requested_service_action); switch (cdb->options & RSO_OPTIONS_MASK) { case RSO_OPTIONS_ALL: num = 0; for (i = 0; i < 256; i++) { entry = &ctl_cmd_table[i]; if (entry->flags & CTL_CMD_FLAG_SA5) { for (j = 0; j < 32; j++) { sentry = &((const struct ctl_cmd_entry *) entry->execute)[j]; if (ctl_cmd_applicable( lun->be_lun->lun_type, sentry)) num++; } } else { if (ctl_cmd_applicable(lun->be_lun->lun_type, entry)) num++; } } total_len = sizeof(struct scsi_report_supported_opcodes_all) + num * sizeof(struct scsi_report_supported_opcodes_descr); break; case RSO_OPTIONS_OC: if (ctl_cmd_table[opcode].flags & CTL_CMD_FLAG_SA5) { ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 2); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } total_len = sizeof(struct scsi_report_supported_opcodes_one) + 32; break; case RSO_OPTIONS_OC_SA: if ((ctl_cmd_table[opcode].flags & CTL_CMD_FLAG_SA5) == 0 || service_action >= 32) { ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 2); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* FALLTHROUGH */ case RSO_OPTIONS_OC_ASA: total_len = sizeof(struct scsi_report_supported_opcodes_one) + 32; break; default: ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 2); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } alloc_len = scsi_4btoul(cdb->length); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; ctsio->kern_data_len = min(total_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; switch (cdb->options & RSO_OPTIONS_MASK) { case RSO_OPTIONS_ALL: all = (struct scsi_report_supported_opcodes_all *) ctsio->kern_data_ptr; num = 0; for (i = 0; i < 256; i++) { entry = &ctl_cmd_table[i]; if (entry->flags & CTL_CMD_FLAG_SA5) { for (j = 0; j < 32; j++) { sentry = &((const struct ctl_cmd_entry *) entry->execute)[j]; if (!ctl_cmd_applicable( lun->be_lun->lun_type, sentry)) continue; descr = &all->descr[num++]; descr->opcode = i; scsi_ulto2b(j, descr->service_action); descr->flags = RSO_SERVACTV; scsi_ulto2b(sentry->length, descr->cdb_length); } } else { if (!ctl_cmd_applicable(lun->be_lun->lun_type, entry)) continue; descr = &all->descr[num++]; descr->opcode = i; scsi_ulto2b(0, descr->service_action); descr->flags = 0; scsi_ulto2b(entry->length, descr->cdb_length); } } scsi_ulto4b( num * sizeof(struct scsi_report_supported_opcodes_descr), all->length); break; case RSO_OPTIONS_OC: one = (struct scsi_report_supported_opcodes_one *) ctsio->kern_data_ptr; entry = &ctl_cmd_table[opcode]; goto fill_one; case RSO_OPTIONS_OC_SA: one = (struct scsi_report_supported_opcodes_one *) ctsio->kern_data_ptr; entry = &ctl_cmd_table[opcode]; entry = &((const struct ctl_cmd_entry *) entry->execute)[service_action]; fill_one: if (ctl_cmd_applicable(lun->be_lun->lun_type, entry)) { one->support = 3; scsi_ulto2b(entry->length, one->cdb_length); one->cdb_usage[0] = opcode; memcpy(&one->cdb_usage[1], entry->usage, entry->length - 1); } else one->support = 1; break; case RSO_OPTIONS_OC_ASA: one = (struct scsi_report_supported_opcodes_one *) ctsio->kern_data_ptr; entry = &ctl_cmd_table[opcode]; if (entry->flags & CTL_CMD_FLAG_SA5) { entry = &((const struct ctl_cmd_entry *) entry->execute)[service_action]; } else if (service_action != 0) { one->support = 1; break; } goto fill_one; } ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return(retval); } int ctl_report_supported_tmf(struct ctl_scsiio *ctsio) { struct scsi_report_supported_tmf *cdb; struct scsi_report_supported_tmf_ext_data *data; int retval; int alloc_len, total_len; CTL_DEBUG_PRINT(("ctl_report_supported_tmf\n")); cdb = (struct scsi_report_supported_tmf *)ctsio->cdb; retval = CTL_RETVAL_COMPLETE; if (cdb->options & RST_REPD) total_len = sizeof(struct scsi_report_supported_tmf_ext_data); else total_len = sizeof(struct scsi_report_supported_tmf_data); alloc_len = scsi_4btoul(cdb->length); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; ctsio->kern_data_len = min(total_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; data = (struct scsi_report_supported_tmf_ext_data *)ctsio->kern_data_ptr; data->byte1 |= RST_ATS | RST_ATSS | RST_CTSS | RST_LURS | RST_QTS | RST_TRS; data->byte2 |= RST_QAES | RST_QTSS | RST_ITNRS; data->length = total_len - 4; ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (retval); } int ctl_report_timestamp(struct ctl_scsiio *ctsio) { struct scsi_report_timestamp *cdb; struct scsi_report_timestamp_data *data; struct timeval tv; int64_t timestamp; int retval; int alloc_len, total_len; CTL_DEBUG_PRINT(("ctl_report_timestamp\n")); cdb = (struct scsi_report_timestamp *)ctsio->cdb; retval = CTL_RETVAL_COMPLETE; total_len = sizeof(struct scsi_report_timestamp_data); alloc_len = scsi_4btoul(cdb->length); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; ctsio->kern_data_len = min(total_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; data = (struct scsi_report_timestamp_data *)ctsio->kern_data_ptr; scsi_ulto2b(sizeof(*data) - 2, data->length); data->origin = RTS_ORIG_OUTSIDE; getmicrotime(&tv); timestamp = (int64_t)tv.tv_sec * 1000 + tv.tv_usec / 1000; scsi_ulto4b(timestamp >> 16, data->timestamp); scsi_ulto2b(timestamp & 0xffff, &data->timestamp[4]); ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (retval); } int ctl_persistent_reserve_in(struct ctl_scsiio *ctsio) { struct ctl_softc *softc = CTL_SOFTC(ctsio); struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_per_res_in *cdb; int alloc_len, total_len = 0; /* struct scsi_per_res_in_rsrv in_data; */ uint64_t key; CTL_DEBUG_PRINT(("ctl_persistent_reserve_in\n")); cdb = (struct scsi_per_res_in *)ctsio->cdb; alloc_len = scsi_2btoul(cdb->length); retry: mtx_lock(&lun->lun_lock); switch (cdb->action) { case SPRI_RK: /* read keys */ total_len = sizeof(struct scsi_per_res_in_keys) + lun->pr_key_count * sizeof(struct scsi_per_res_key); break; case SPRI_RR: /* read reservation */ if (lun->flags & CTL_LUN_PR_RESERVED) total_len = sizeof(struct scsi_per_res_in_rsrv); else total_len = sizeof(struct scsi_per_res_in_header); break; case SPRI_RC: /* report capabilities */ total_len = sizeof(struct scsi_per_res_cap); break; case SPRI_RS: /* read full status */ total_len = sizeof(struct scsi_per_res_in_header) + (sizeof(struct scsi_per_res_in_full_desc) + 256) * lun->pr_key_count; break; default: panic("%s: Invalid PR type %#x", __func__, cdb->action); } mtx_unlock(&lun->lun_lock); ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->kern_data_len = min(total_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; mtx_lock(&lun->lun_lock); switch (cdb->action) { case SPRI_RK: { // read keys struct scsi_per_res_in_keys *res_keys; int i, key_count; res_keys = (struct scsi_per_res_in_keys*)ctsio->kern_data_ptr; /* * We had to drop the lock to allocate our buffer, which * leaves time for someone to come in with another * persistent reservation. (That is unlikely, though, * since this should be the only persistent reservation * command active right now.) */ if (total_len != (sizeof(struct scsi_per_res_in_keys) + (lun->pr_key_count * sizeof(struct scsi_per_res_key)))){ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); printf("%s: reservation length changed, retrying\n", __func__); goto retry; } scsi_ulto4b(lun->pr_generation, res_keys->header.generation); scsi_ulto4b(sizeof(struct scsi_per_res_key) * lun->pr_key_count, res_keys->header.length); for (i = 0, key_count = 0; i < CTL_MAX_INITIATORS; i++) { if ((key = ctl_get_prkey(lun, i)) == 0) continue; /* * We used lun->pr_key_count to calculate the * size to allocate. If it turns out the number of * initiators with the registered flag set is * larger than that (i.e. they haven't been kept in * sync), we've got a problem. */ if (key_count >= lun->pr_key_count) { key_count++; continue; } scsi_u64to8b(key, res_keys->keys[key_count].key); key_count++; } break; } case SPRI_RR: { // read reservation struct scsi_per_res_in_rsrv *res; int tmp_len, header_only; res = (struct scsi_per_res_in_rsrv *)ctsio->kern_data_ptr; scsi_ulto4b(lun->pr_generation, res->header.generation); if (lun->flags & CTL_LUN_PR_RESERVED) { tmp_len = sizeof(struct scsi_per_res_in_rsrv); scsi_ulto4b(sizeof(struct scsi_per_res_in_rsrv_data), res->header.length); header_only = 0; } else { tmp_len = sizeof(struct scsi_per_res_in_header); scsi_ulto4b(0, res->header.length); header_only = 1; } /* * We had to drop the lock to allocate our buffer, which * leaves time for someone to come in with another * persistent reservation. (That is unlikely, though, * since this should be the only persistent reservation * command active right now.) */ if (tmp_len != total_len) { mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); printf("%s: reservation status changed, retrying\n", __func__); goto retry; } /* * No reservation held, so we're done. */ if (header_only != 0) break; /* * If the registration is an All Registrants type, the key * is 0, since it doesn't really matter. */ if (lun->pr_res_idx != CTL_PR_ALL_REGISTRANTS) { scsi_u64to8b(ctl_get_prkey(lun, lun->pr_res_idx), res->data.reservation); } res->data.scopetype = lun->pr_res_type; break; } case SPRI_RC: //report capabilities { struct scsi_per_res_cap *res_cap; uint16_t type_mask; res_cap = (struct scsi_per_res_cap *)ctsio->kern_data_ptr; scsi_ulto2b(sizeof(*res_cap), res_cap->length); res_cap->flags1 = SPRI_CRH; res_cap->flags2 = SPRI_TMV | SPRI_ALLOW_5; type_mask = SPRI_TM_WR_EX_AR | SPRI_TM_EX_AC_RO | SPRI_TM_WR_EX_RO | SPRI_TM_EX_AC | SPRI_TM_WR_EX | SPRI_TM_EX_AC_AR; scsi_ulto2b(type_mask, res_cap->type_mask); break; } case SPRI_RS: { // read full status struct scsi_per_res_in_full *res_status; struct scsi_per_res_in_full_desc *res_desc; struct ctl_port *port; int i, len; res_status = (struct scsi_per_res_in_full*)ctsio->kern_data_ptr; /* * We had to drop the lock to allocate our buffer, which * leaves time for someone to come in with another * persistent reservation. (That is unlikely, though, * since this should be the only persistent reservation * command active right now.) */ if (total_len < (sizeof(struct scsi_per_res_in_header) + (sizeof(struct scsi_per_res_in_full_desc) + 256) * lun->pr_key_count)){ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); printf("%s: reservation length changed, retrying\n", __func__); goto retry; } scsi_ulto4b(lun->pr_generation, res_status->header.generation); res_desc = &res_status->desc[0]; for (i = 0; i < CTL_MAX_INITIATORS; i++) { if ((key = ctl_get_prkey(lun, i)) == 0) continue; scsi_u64to8b(key, res_desc->res_key.key); if ((lun->flags & CTL_LUN_PR_RESERVED) && (lun->pr_res_idx == i || lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS)) { res_desc->flags = SPRI_FULL_R_HOLDER; res_desc->scopetype = lun->pr_res_type; } scsi_ulto2b(i / CTL_MAX_INIT_PER_PORT, res_desc->rel_trgt_port_id); len = 0; port = softc->ctl_ports[i / CTL_MAX_INIT_PER_PORT]; if (port != NULL) len = ctl_create_iid(port, i % CTL_MAX_INIT_PER_PORT, res_desc->transport_id); scsi_ulto4b(len, res_desc->additional_length); res_desc = (struct scsi_per_res_in_full_desc *) &res_desc->transport_id[len]; } scsi_ulto4b((uint8_t *)res_desc - (uint8_t *)&res_status->desc[0], res_status->header.length); break; } default: panic("%s: Invalid PR type %#x", __func__, cdb->action); } mtx_unlock(&lun->lun_lock); ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * Returns 0 if ctl_persistent_reserve_out() should continue, non-zero if * it should return. */ static int ctl_pro_preempt(struct ctl_softc *softc, struct ctl_lun *lun, uint64_t res_key, uint64_t sa_res_key, uint8_t type, uint32_t residx, struct ctl_scsiio *ctsio, struct scsi_per_res_out *cdb, struct scsi_per_res_out_parms* param) { union ctl_ha_msg persis_io; int i; mtx_lock(&lun->lun_lock); if (sa_res_key == 0) { if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS) { /* validate scope and type */ if ((cdb->scope_type & SPR_SCOPE_MASK) != SPR_LU_SCOPE) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 4); ctl_done((union ctl_io *)ctsio); return (1); } if (type>8 || type==2 || type==4 || type==0) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (1); } /* * Unregister everybody else and build UA for * them */ for(i = 0; i < CTL_MAX_INITIATORS; i++) { if (i == residx || ctl_get_prkey(lun, i) == 0) continue; ctl_clr_prkey(lun, i); ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT); } lun->pr_key_count = 1; lun->pr_res_type = type; if (lun->pr_res_type != SPR_TYPE_WR_EX_AR && lun->pr_res_type != SPR_TYPE_EX_AC_AR) lun->pr_res_idx = residx; lun->pr_generation++; mtx_unlock(&lun->lun_lock); /* send msg to other side */ persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_PREEMPT; persis_io.pr.pr_info.residx = lun->pr_res_idx; persis_io.pr.pr_info.res_type = type; memcpy(persis_io.pr.pr_info.sa_res_key, param->serv_act_res_key, sizeof(param->serv_act_res_key)); ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io.pr), M_WAITOK); } else { /* not all registrants */ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 8, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (1); } } else if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS || !(lun->flags & CTL_LUN_PR_RESERVED)) { int found = 0; if (res_key == sa_res_key) { /* special case */ /* * The spec implies this is not good but doesn't * say what to do. There are two choices either * generate a res conflict or check condition * with illegal field in parameter data. Since * that is what is done when the sa_res_key is * zero I'll take that approach since this has * to do with the sa_res_key. */ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 8, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (1); } for (i = 0; i < CTL_MAX_INITIATORS; i++) { if (ctl_get_prkey(lun, i) != sa_res_key) continue; found = 1; ctl_clr_prkey(lun, i); lun->pr_key_count--; ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT); } if (!found) { mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } lun->pr_generation++; mtx_unlock(&lun->lun_lock); /* send msg to other side */ persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_PREEMPT; persis_io.pr.pr_info.residx = lun->pr_res_idx; persis_io.pr.pr_info.res_type = type; memcpy(persis_io.pr.pr_info.sa_res_key, param->serv_act_res_key, sizeof(param->serv_act_res_key)); ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io.pr), M_WAITOK); } else { /* Reserved but not all registrants */ /* sa_res_key is res holder */ if (sa_res_key == ctl_get_prkey(lun, lun->pr_res_idx)) { /* validate scope and type */ if ((cdb->scope_type & SPR_SCOPE_MASK) != SPR_LU_SCOPE) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 4); ctl_done((union ctl_io *)ctsio); return (1); } if (type>8 || type==2 || type==4 || type==0) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (1); } /* * Do the following: * if sa_res_key != res_key remove all * registrants w/sa_res_key and generate UA * for these registrants(Registrations * Preempted) if it wasn't an exclusive * reservation generate UA(Reservations * Preempted) for all other registered nexuses * if the type has changed. Establish the new * reservation and holder. If res_key and * sa_res_key are the same do the above * except don't unregister the res holder. */ for(i = 0; i < CTL_MAX_INITIATORS; i++) { if (i == residx || ctl_get_prkey(lun, i) == 0) continue; if (sa_res_key == ctl_get_prkey(lun, i)) { ctl_clr_prkey(lun, i); lun->pr_key_count--; ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT); } else if (type != lun->pr_res_type && (lun->pr_res_type == SPR_TYPE_WR_EX_RO || lun->pr_res_type == SPR_TYPE_EX_AC_RO)) { ctl_est_ua(lun, i, CTL_UA_RES_RELEASE); } } lun->pr_res_type = type; if (lun->pr_res_type != SPR_TYPE_WR_EX_AR && lun->pr_res_type != SPR_TYPE_EX_AC_AR) lun->pr_res_idx = residx; else lun->pr_res_idx = CTL_PR_ALL_REGISTRANTS; lun->pr_generation++; mtx_unlock(&lun->lun_lock); persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_PREEMPT; persis_io.pr.pr_info.residx = lun->pr_res_idx; persis_io.pr.pr_info.res_type = type; memcpy(persis_io.pr.pr_info.sa_res_key, param->serv_act_res_key, sizeof(param->serv_act_res_key)); ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io.pr), M_WAITOK); } else { /* * sa_res_key is not the res holder just * remove registrants */ int found=0; for (i = 0; i < CTL_MAX_INITIATORS; i++) { if (sa_res_key != ctl_get_prkey(lun, i)) continue; found = 1; ctl_clr_prkey(lun, i); lun->pr_key_count--; ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT); } if (!found) { mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (1); } lun->pr_generation++; mtx_unlock(&lun->lun_lock); persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_PREEMPT; persis_io.pr.pr_info.residx = lun->pr_res_idx; persis_io.pr.pr_info.res_type = type; memcpy(persis_io.pr.pr_info.sa_res_key, param->serv_act_res_key, sizeof(param->serv_act_res_key)); ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io.pr), M_WAITOK); } } return (0); } static void ctl_pro_preempt_other(struct ctl_lun *lun, union ctl_ha_msg *msg) { uint64_t sa_res_key; int i; sa_res_key = scsi_8btou64(msg->pr.pr_info.sa_res_key); if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS || lun->pr_res_idx == CTL_PR_NO_RESERVATION || sa_res_key != ctl_get_prkey(lun, lun->pr_res_idx)) { if (sa_res_key == 0) { /* * Unregister everybody else and build UA for * them */ for(i = 0; i < CTL_MAX_INITIATORS; i++) { if (i == msg->pr.pr_info.residx || ctl_get_prkey(lun, i) == 0) continue; ctl_clr_prkey(lun, i); ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT); } lun->pr_key_count = 1; lun->pr_res_type = msg->pr.pr_info.res_type; if (lun->pr_res_type != SPR_TYPE_WR_EX_AR && lun->pr_res_type != SPR_TYPE_EX_AC_AR) lun->pr_res_idx = msg->pr.pr_info.residx; } else { for (i = 0; i < CTL_MAX_INITIATORS; i++) { if (sa_res_key == ctl_get_prkey(lun, i)) continue; ctl_clr_prkey(lun, i); lun->pr_key_count--; ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT); } } } else { for (i = 0; i < CTL_MAX_INITIATORS; i++) { if (i == msg->pr.pr_info.residx || ctl_get_prkey(lun, i) == 0) continue; if (sa_res_key == ctl_get_prkey(lun, i)) { ctl_clr_prkey(lun, i); lun->pr_key_count--; ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT); } else if (msg->pr.pr_info.res_type != lun->pr_res_type && (lun->pr_res_type == SPR_TYPE_WR_EX_RO || lun->pr_res_type == SPR_TYPE_EX_AC_RO)) { ctl_est_ua(lun, i, CTL_UA_RES_RELEASE); } } lun->pr_res_type = msg->pr.pr_info.res_type; if (lun->pr_res_type != SPR_TYPE_WR_EX_AR && lun->pr_res_type != SPR_TYPE_EX_AC_AR) lun->pr_res_idx = msg->pr.pr_info.residx; else lun->pr_res_idx = CTL_PR_ALL_REGISTRANTS; } lun->pr_generation++; } int ctl_persistent_reserve_out(struct ctl_scsiio *ctsio) { struct ctl_softc *softc = CTL_SOFTC(ctsio); struct ctl_lun *lun = CTL_LUN(ctsio); int retval; u_int32_t param_len; struct scsi_per_res_out *cdb; struct scsi_per_res_out_parms* param; uint32_t residx; uint64_t res_key, sa_res_key, key; uint8_t type; union ctl_ha_msg persis_io; int i; CTL_DEBUG_PRINT(("ctl_persistent_reserve_out\n")); cdb = (struct scsi_per_res_out *)ctsio->cdb; retval = CTL_RETVAL_COMPLETE; /* * We only support whole-LUN scope. The scope & type are ignored for * register, register and ignore existing key and clear. * We sometimes ignore scope and type on preempts too!! * Verify reservation type here as well. */ type = cdb->scope_type & SPR_TYPE_MASK; if ((cdb->action == SPRO_RESERVE) || (cdb->action == SPRO_RELEASE)) { if ((cdb->scope_type & SPR_SCOPE_MASK) != SPR_LU_SCOPE) { ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 4); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } if (type>8 || type==2 || type==4 || type==0) { ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 1, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } } param_len = scsi_4btoul(cdb->length); if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) { ctsio->kern_data_ptr = malloc(param_len, M_CTL, M_WAITOK); ctsio->kern_data_len = param_len; ctsio->kern_total_len = param_len; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } param = (struct scsi_per_res_out_parms *)ctsio->kern_data_ptr; residx = ctl_get_initindex(&ctsio->io_hdr.nexus); res_key = scsi_8btou64(param->res_key.key); sa_res_key = scsi_8btou64(param->serv_act_res_key); /* * Validate the reservation key here except for SPRO_REG_IGNO * This must be done for all other service actions */ if ((cdb->action & SPRO_ACTION_MASK) != SPRO_REG_IGNO) { mtx_lock(&lun->lun_lock); if ((key = ctl_get_prkey(lun, residx)) != 0) { if (res_key != key) { /* * The current key passed in doesn't match * the one the initiator previously * registered. */ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } } else if ((cdb->action & SPRO_ACTION_MASK) != SPRO_REGISTER) { /* * We are not registered */ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } else if (res_key != 0) { /* * We are not registered and trying to register but * the register key isn't zero. */ mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } mtx_unlock(&lun->lun_lock); } switch (cdb->action & SPRO_ACTION_MASK) { case SPRO_REGISTER: case SPRO_REG_IGNO: { /* * We don't support any of these options, as we report in * the read capabilities request (see * ctl_persistent_reserve_in(), above). */ if ((param->flags & SPR_SPEC_I_PT) || (param->flags & SPR_ALL_TG_PT) || (param->flags & SPR_APTPL)) { int bit_ptr; if (param->flags & SPR_APTPL) bit_ptr = 0; else if (param->flags & SPR_ALL_TG_PT) bit_ptr = 2; else /* SPR_SPEC_I_PT */ bit_ptr = 3; free(ctsio->kern_data_ptr, M_CTL); ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 0, /*field*/ 20, /*bit_valid*/ 1, /*bit*/ bit_ptr); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } mtx_lock(&lun->lun_lock); /* * The initiator wants to clear the * key/unregister. */ if (sa_res_key == 0) { if ((res_key == 0 && (cdb->action & SPRO_ACTION_MASK) == SPRO_REGISTER) || ((cdb->action & SPRO_ACTION_MASK) == SPRO_REG_IGNO && ctl_get_prkey(lun, residx) == 0)) { mtx_unlock(&lun->lun_lock); goto done; } ctl_clr_prkey(lun, residx); lun->pr_key_count--; if (residx == lun->pr_res_idx) { lun->flags &= ~CTL_LUN_PR_RESERVED; lun->pr_res_idx = CTL_PR_NO_RESERVATION; if ((lun->pr_res_type == SPR_TYPE_WR_EX_RO || lun->pr_res_type == SPR_TYPE_EX_AC_RO) && lun->pr_key_count) { /* * If the reservation is a registrants * only type we need to generate a UA * for other registered inits. The * sense code should be RESERVATIONS * RELEASED */ for (i = softc->init_min; i < softc->init_max; i++){ if (ctl_get_prkey(lun, i) == 0) continue; ctl_est_ua(lun, i, CTL_UA_RES_RELEASE); } } lun->pr_res_type = 0; } else if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS) { if (lun->pr_key_count==0) { lun->flags &= ~CTL_LUN_PR_RESERVED; lun->pr_res_type = 0; lun->pr_res_idx = CTL_PR_NO_RESERVATION; } } lun->pr_generation++; mtx_unlock(&lun->lun_lock); persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_UNREG_KEY; persis_io.pr.pr_info.residx = residx; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io.pr), M_WAITOK); } else /* sa_res_key != 0 */ { /* * If we aren't registered currently then increment * the key count and set the registered flag. */ ctl_alloc_prkey(lun, residx); if (ctl_get_prkey(lun, residx) == 0) lun->pr_key_count++; ctl_set_prkey(lun, residx, sa_res_key); lun->pr_generation++; mtx_unlock(&lun->lun_lock); persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_REG_KEY; persis_io.pr.pr_info.residx = residx; memcpy(persis_io.pr.pr_info.sa_res_key, param->serv_act_res_key, sizeof(param->serv_act_res_key)); ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io.pr), M_WAITOK); } break; } case SPRO_RESERVE: mtx_lock(&lun->lun_lock); if (lun->flags & CTL_LUN_PR_RESERVED) { /* * if this isn't the reservation holder and it's * not a "all registrants" type or if the type is * different then we have a conflict */ if ((lun->pr_res_idx != residx && lun->pr_res_idx != CTL_PR_ALL_REGISTRANTS) || lun->pr_res_type != type) { mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_reservation_conflict(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } mtx_unlock(&lun->lun_lock); } else /* create a reservation */ { /* * If it's not an "all registrants" type record * reservation holder */ if (type != SPR_TYPE_WR_EX_AR && type != SPR_TYPE_EX_AC_AR) lun->pr_res_idx = residx; /* Res holder */ else lun->pr_res_idx = CTL_PR_ALL_REGISTRANTS; lun->flags |= CTL_LUN_PR_RESERVED; lun->pr_res_type = type; mtx_unlock(&lun->lun_lock); /* send msg to other side */ persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_RESERVE; persis_io.pr.pr_info.residx = lun->pr_res_idx; persis_io.pr.pr_info.res_type = type; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io.pr), M_WAITOK); } break; case SPRO_RELEASE: mtx_lock(&lun->lun_lock); if ((lun->flags & CTL_LUN_PR_RESERVED) == 0) { /* No reservation exists return good status */ mtx_unlock(&lun->lun_lock); goto done; } /* * Is this nexus a reservation holder? */ if (lun->pr_res_idx != residx && lun->pr_res_idx != CTL_PR_ALL_REGISTRANTS) { /* * not a res holder return good status but * do nothing */ mtx_unlock(&lun->lun_lock); goto done; } if (lun->pr_res_type != type) { mtx_unlock(&lun->lun_lock); free(ctsio->kern_data_ptr, M_CTL); ctl_set_illegal_pr_release(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* okay to release */ lun->flags &= ~CTL_LUN_PR_RESERVED; lun->pr_res_idx = CTL_PR_NO_RESERVATION; lun->pr_res_type = 0; /* * If this isn't an exclusive access reservation and NUAR * is not set, generate UA for all other registrants. */ if (type != SPR_TYPE_EX_AC && type != SPR_TYPE_WR_EX && (lun->MODE_CTRL.queue_flags & SCP_NUAR) == 0) { for (i = softc->init_min; i < softc->init_max; i++) { if (i == residx || ctl_get_prkey(lun, i) == 0) continue; ctl_est_ua(lun, i, CTL_UA_RES_RELEASE); } } mtx_unlock(&lun->lun_lock); /* Send msg to other side */ persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_RELEASE; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io.pr), M_WAITOK); break; case SPRO_CLEAR: /* send msg to other side */ mtx_lock(&lun->lun_lock); lun->flags &= ~CTL_LUN_PR_RESERVED; lun->pr_res_type = 0; lun->pr_key_count = 0; lun->pr_res_idx = CTL_PR_NO_RESERVATION; ctl_clr_prkey(lun, residx); for (i = 0; i < CTL_MAX_INITIATORS; i++) if (ctl_get_prkey(lun, i) != 0) { ctl_clr_prkey(lun, i); ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT); } lun->pr_generation++; mtx_unlock(&lun->lun_lock); persis_io.hdr.nexus = ctsio->io_hdr.nexus; persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION; persis_io.pr.pr_info.action = CTL_PR_CLEAR; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io, sizeof(persis_io.pr), M_WAITOK); break; case SPRO_PREEMPT: case SPRO_PRE_ABO: { int nretval; nretval = ctl_pro_preempt(softc, lun, res_key, sa_res_key, type, residx, ctsio, cdb, param); if (nretval != 0) return (CTL_RETVAL_COMPLETE); break; } default: panic("%s: Invalid PR type %#x", __func__, cdb->action); } done: free(ctsio->kern_data_ptr, M_CTL); ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (retval); } /* * This routine is for handling a message from the other SC pertaining to * persistent reserve out. All the error checking will have been done * so only perorming the action need be done here to keep the two * in sync. */ static void ctl_hndl_per_res_out_on_other_sc(union ctl_io *io) { struct ctl_softc *softc = CTL_SOFTC(io); union ctl_ha_msg *msg = (union ctl_ha_msg *)&io->presio.pr_msg; struct ctl_lun *lun; int i; uint32_t residx, targ_lun; targ_lun = msg->hdr.nexus.targ_mapped_lun; mtx_lock(&softc->ctl_lock); if (targ_lun >= ctl_max_luns || (lun = softc->ctl_luns[targ_lun]) == NULL) { mtx_unlock(&softc->ctl_lock); return; } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); if (lun->flags & CTL_LUN_DISABLED) { mtx_unlock(&lun->lun_lock); return; } residx = ctl_get_initindex(&msg->hdr.nexus); switch(msg->pr.pr_info.action) { case CTL_PR_REG_KEY: ctl_alloc_prkey(lun, msg->pr.pr_info.residx); if (ctl_get_prkey(lun, msg->pr.pr_info.residx) == 0) lun->pr_key_count++; ctl_set_prkey(lun, msg->pr.pr_info.residx, scsi_8btou64(msg->pr.pr_info.sa_res_key)); lun->pr_generation++; break; case CTL_PR_UNREG_KEY: ctl_clr_prkey(lun, msg->pr.pr_info.residx); lun->pr_key_count--; /* XXX Need to see if the reservation has been released */ /* if so do we need to generate UA? */ if (msg->pr.pr_info.residx == lun->pr_res_idx) { lun->flags &= ~CTL_LUN_PR_RESERVED; lun->pr_res_idx = CTL_PR_NO_RESERVATION; if ((lun->pr_res_type == SPR_TYPE_WR_EX_RO || lun->pr_res_type == SPR_TYPE_EX_AC_RO) && lun->pr_key_count) { /* * If the reservation is a registrants * only type we need to generate a UA * for other registered inits. The * sense code should be RESERVATIONS * RELEASED */ for (i = softc->init_min; i < softc->init_max; i++) { if (ctl_get_prkey(lun, i) == 0) continue; ctl_est_ua(lun, i, CTL_UA_RES_RELEASE); } } lun->pr_res_type = 0; } else if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS) { if (lun->pr_key_count==0) { lun->flags &= ~CTL_LUN_PR_RESERVED; lun->pr_res_type = 0; lun->pr_res_idx = CTL_PR_NO_RESERVATION; } } lun->pr_generation++; break; case CTL_PR_RESERVE: lun->flags |= CTL_LUN_PR_RESERVED; lun->pr_res_type = msg->pr.pr_info.res_type; lun->pr_res_idx = msg->pr.pr_info.residx; break; case CTL_PR_RELEASE: /* * If this isn't an exclusive access reservation and NUAR * is not set, generate UA for all other registrants. */ if (lun->pr_res_type != SPR_TYPE_EX_AC && lun->pr_res_type != SPR_TYPE_WR_EX && (lun->MODE_CTRL.queue_flags & SCP_NUAR) == 0) { for (i = softc->init_min; i < softc->init_max; i++) { if (i == residx || ctl_get_prkey(lun, i) == 0) continue; ctl_est_ua(lun, i, CTL_UA_RES_RELEASE); } } lun->flags &= ~CTL_LUN_PR_RESERVED; lun->pr_res_idx = CTL_PR_NO_RESERVATION; lun->pr_res_type = 0; break; case CTL_PR_PREEMPT: ctl_pro_preempt_other(lun, msg); break; case CTL_PR_CLEAR: lun->flags &= ~CTL_LUN_PR_RESERVED; lun->pr_res_type = 0; lun->pr_key_count = 0; lun->pr_res_idx = CTL_PR_NO_RESERVATION; for (i=0; i < CTL_MAX_INITIATORS; i++) { if (ctl_get_prkey(lun, i) == 0) continue; ctl_clr_prkey(lun, i); ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT); } lun->pr_generation++; break; } mtx_unlock(&lun->lun_lock); } int ctl_read_write(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); struct ctl_lba_len_flags *lbalen; uint64_t lba; uint32_t num_blocks; int flags, retval; int isread; CTL_DEBUG_PRINT(("ctl_read_write: command: %#x\n", ctsio->cdb[0])); flags = 0; isread = ctsio->cdb[0] == READ_6 || ctsio->cdb[0] == READ_10 || ctsio->cdb[0] == READ_12 || ctsio->cdb[0] == READ_16; switch (ctsio->cdb[0]) { case READ_6: case WRITE_6: { struct scsi_rw_6 *cdb; cdb = (struct scsi_rw_6 *)ctsio->cdb; lba = scsi_3btoul(cdb->addr); /* only 5 bits are valid in the most significant address byte */ lba &= 0x1fffff; num_blocks = cdb->length; /* * This is correct according to SBC-2. */ if (num_blocks == 0) num_blocks = 256; break; } case READ_10: case WRITE_10: { struct scsi_rw_10 *cdb; cdb = (struct scsi_rw_10 *)ctsio->cdb; if (cdb->byte2 & SRW10_FUA) flags |= CTL_LLF_FUA; if (cdb->byte2 & SRW10_DPO) flags |= CTL_LLF_DPO; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_2btoul(cdb->length); break; } case WRITE_VERIFY_10: { struct scsi_write_verify_10 *cdb; cdb = (struct scsi_write_verify_10 *)ctsio->cdb; flags |= CTL_LLF_FUA; if (cdb->byte2 & SWV_DPO) flags |= CTL_LLF_DPO; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_2btoul(cdb->length); break; } case READ_12: case WRITE_12: { struct scsi_rw_12 *cdb; cdb = (struct scsi_rw_12 *)ctsio->cdb; if (cdb->byte2 & SRW12_FUA) flags |= CTL_LLF_FUA; if (cdb->byte2 & SRW12_DPO) flags |= CTL_LLF_DPO; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_4btoul(cdb->length); break; } case WRITE_VERIFY_12: { struct scsi_write_verify_12 *cdb; cdb = (struct scsi_write_verify_12 *)ctsio->cdb; flags |= CTL_LLF_FUA; if (cdb->byte2 & SWV_DPO) flags |= CTL_LLF_DPO; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_4btoul(cdb->length); break; } case READ_16: case WRITE_16: { struct scsi_rw_16 *cdb; cdb = (struct scsi_rw_16 *)ctsio->cdb; if (cdb->byte2 & SRW12_FUA) flags |= CTL_LLF_FUA; if (cdb->byte2 & SRW12_DPO) flags |= CTL_LLF_DPO; lba = scsi_8btou64(cdb->addr); num_blocks = scsi_4btoul(cdb->length); break; } case WRITE_ATOMIC_16: { struct scsi_write_atomic_16 *cdb; if (lun->be_lun->atomicblock == 0) { ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } cdb = (struct scsi_write_atomic_16 *)ctsio->cdb; if (cdb->byte2 & SRW12_FUA) flags |= CTL_LLF_FUA; if (cdb->byte2 & SRW12_DPO) flags |= CTL_LLF_DPO; lba = scsi_8btou64(cdb->addr); num_blocks = scsi_2btoul(cdb->length); if (num_blocks > lun->be_lun->atomicblock) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 12, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } break; } case WRITE_VERIFY_16: { struct scsi_write_verify_16 *cdb; cdb = (struct scsi_write_verify_16 *)ctsio->cdb; flags |= CTL_LLF_FUA; if (cdb->byte2 & SWV_DPO) flags |= CTL_LLF_DPO; lba = scsi_8btou64(cdb->addr); num_blocks = scsi_4btoul(cdb->length); break; } default: /* * We got a command we don't support. This shouldn't * happen, commands should be filtered out above us. */ ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); break; /* NOTREACHED */ } /* * The first check is to make sure we're in bounds, the second * check is to catch wrap-around problems. If the lba + num blocks * is less than the lba, then we've wrapped around and the block * range is invalid anyway. */ if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) || ((lba + num_blocks) < lba)) { ctl_set_lba_out_of_range(ctsio, MAX(lba, lun->be_lun->maxlba + 1)); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * According to SBC-3, a transfer length of 0 is not an error. * Note that this cannot happen with WRITE(6) or READ(6), since 0 * translates to 256 blocks for those commands. */ if (num_blocks == 0) { ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* Set FUA and/or DPO if caches are disabled. */ if (isread) { if ((lun->MODE_CACHING.flags1 & SCP_RCD) != 0) flags |= CTL_LLF_FUA | CTL_LLF_DPO; } else { if ((lun->MODE_CACHING.flags1 & SCP_WCE) == 0) flags |= CTL_LLF_FUA; } lbalen = (struct ctl_lba_len_flags *) &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->lba = lba; lbalen->len = num_blocks; lbalen->flags = (isread ? CTL_LLF_READ : CTL_LLF_WRITE) | flags; ctsio->kern_total_len = num_blocks * lun->be_lun->blocksize; ctsio->kern_rel_offset = 0; CTL_DEBUG_PRINT(("ctl_read_write: calling data_submit()\n")); retval = lun->backend->data_submit((union ctl_io *)ctsio); return (retval); } static int ctl_cnw_cont(union ctl_io *io) { struct ctl_lun *lun = CTL_LUN(io); struct ctl_scsiio *ctsio; struct ctl_lba_len_flags *lbalen; int retval; ctsio = &io->scsiio; ctsio->io_hdr.status = CTL_STATUS_NONE; ctsio->io_hdr.flags &= ~CTL_FLAG_IO_CONT; lbalen = (struct ctl_lba_len_flags *) &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->flags &= ~CTL_LLF_COMPARE; lbalen->flags |= CTL_LLF_WRITE; CTL_DEBUG_PRINT(("ctl_cnw_cont: calling data_submit()\n")); retval = lun->backend->data_submit((union ctl_io *)ctsio); return (retval); } int ctl_cnw(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); struct ctl_lba_len_flags *lbalen; uint64_t lba; uint32_t num_blocks; int flags, retval; CTL_DEBUG_PRINT(("ctl_cnw: command: %#x\n", ctsio->cdb[0])); flags = 0; switch (ctsio->cdb[0]) { case COMPARE_AND_WRITE: { struct scsi_compare_and_write *cdb; cdb = (struct scsi_compare_and_write *)ctsio->cdb; if (cdb->byte2 & SRW10_FUA) flags |= CTL_LLF_FUA; if (cdb->byte2 & SRW10_DPO) flags |= CTL_LLF_DPO; lba = scsi_8btou64(cdb->addr); num_blocks = cdb->length; break; } default: /* * We got a command we don't support. This shouldn't * happen, commands should be filtered out above us. */ ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); break; /* NOTREACHED */ } /* * The first check is to make sure we're in bounds, the second * check is to catch wrap-around problems. If the lba + num blocks * is less than the lba, then we've wrapped around and the block * range is invalid anyway. */ if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) || ((lba + num_blocks) < lba)) { ctl_set_lba_out_of_range(ctsio, MAX(lba, lun->be_lun->maxlba + 1)); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * According to SBC-3, a transfer length of 0 is not an error. */ if (num_blocks == 0) { ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* Set FUA if write cache is disabled. */ if ((lun->MODE_CACHING.flags1 & SCP_WCE) == 0) flags |= CTL_LLF_FUA; ctsio->kern_total_len = 2 * num_blocks * lun->be_lun->blocksize; ctsio->kern_rel_offset = 0; /* * Set the IO_CONT flag, so that if this I/O gets passed to * ctl_data_submit_done(), it'll get passed back to * ctl_ctl_cnw_cont() for further processing. */ ctsio->io_hdr.flags |= CTL_FLAG_IO_CONT; ctsio->io_cont = ctl_cnw_cont; lbalen = (struct ctl_lba_len_flags *) &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->lba = lba; lbalen->len = num_blocks; lbalen->flags = CTL_LLF_COMPARE | flags; CTL_DEBUG_PRINT(("ctl_cnw: calling data_submit()\n")); retval = lun->backend->data_submit((union ctl_io *)ctsio); return (retval); } int ctl_verify(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); struct ctl_lba_len_flags *lbalen; uint64_t lba; uint32_t num_blocks; int bytchk, flags; int retval; CTL_DEBUG_PRINT(("ctl_verify: command: %#x\n", ctsio->cdb[0])); bytchk = 0; flags = CTL_LLF_FUA; switch (ctsio->cdb[0]) { case VERIFY_10: { struct scsi_verify_10 *cdb; cdb = (struct scsi_verify_10 *)ctsio->cdb; if (cdb->byte2 & SVFY_BYTCHK) bytchk = 1; if (cdb->byte2 & SVFY_DPO) flags |= CTL_LLF_DPO; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_2btoul(cdb->length); break; } case VERIFY_12: { struct scsi_verify_12 *cdb; cdb = (struct scsi_verify_12 *)ctsio->cdb; if (cdb->byte2 & SVFY_BYTCHK) bytchk = 1; if (cdb->byte2 & SVFY_DPO) flags |= CTL_LLF_DPO; lba = scsi_4btoul(cdb->addr); num_blocks = scsi_4btoul(cdb->length); break; } case VERIFY_16: { struct scsi_rw_16 *cdb; cdb = (struct scsi_rw_16 *)ctsio->cdb; if (cdb->byte2 & SVFY_BYTCHK) bytchk = 1; if (cdb->byte2 & SVFY_DPO) flags |= CTL_LLF_DPO; lba = scsi_8btou64(cdb->addr); num_blocks = scsi_4btoul(cdb->length); break; } default: /* * We got a command we don't support. This shouldn't * happen, commands should be filtered out above us. */ ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * The first check is to make sure we're in bounds, the second * check is to catch wrap-around problems. If the lba + num blocks * is less than the lba, then we've wrapped around and the block * range is invalid anyway. */ if (((lba + num_blocks) > (lun->be_lun->maxlba + 1)) || ((lba + num_blocks) < lba)) { ctl_set_lba_out_of_range(ctsio, MAX(lba, lun->be_lun->maxlba + 1)); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * According to SBC-3, a transfer length of 0 is not an error. */ if (num_blocks == 0) { ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } lbalen = (struct ctl_lba_len_flags *) &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; lbalen->lba = lba; lbalen->len = num_blocks; if (bytchk) { lbalen->flags = CTL_LLF_COMPARE | flags; ctsio->kern_total_len = num_blocks * lun->be_lun->blocksize; } else { lbalen->flags = CTL_LLF_VERIFY | flags; ctsio->kern_total_len = 0; } ctsio->kern_rel_offset = 0; CTL_DEBUG_PRINT(("ctl_verify: calling data_submit()\n")); retval = lun->backend->data_submit((union ctl_io *)ctsio); return (retval); } int ctl_report_luns(struct ctl_scsiio *ctsio) { struct ctl_softc *softc = CTL_SOFTC(ctsio); struct ctl_port *port = CTL_PORT(ctsio); struct ctl_lun *lun, *request_lun = CTL_LUN(ctsio); struct scsi_report_luns *cdb; struct scsi_report_luns_data *lun_data; int num_filled, num_luns, num_port_luns, retval; uint32_t alloc_len, lun_datalen; uint32_t initidx, targ_lun_id, lun_id; retval = CTL_RETVAL_COMPLETE; cdb = (struct scsi_report_luns *)ctsio->cdb; CTL_DEBUG_PRINT(("ctl_report_luns\n")); num_luns = 0; num_port_luns = port->lun_map ? port->lun_map_size : ctl_max_luns; mtx_lock(&softc->ctl_lock); for (targ_lun_id = 0; targ_lun_id < num_port_luns; targ_lun_id++) { if (ctl_lun_map_from_port(port, targ_lun_id) != UINT32_MAX) num_luns++; } mtx_unlock(&softc->ctl_lock); switch (cdb->select_report) { case RPL_REPORT_DEFAULT: case RPL_REPORT_ALL: case RPL_REPORT_NONSUBSID: break; case RPL_REPORT_WELLKNOWN: case RPL_REPORT_ADMIN: case RPL_REPORT_CONGLOM: num_luns = 0; break; default: ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (retval); break; /* NOTREACHED */ } alloc_len = scsi_4btoul(cdb->length); /* * The initiator has to allocate at least 16 bytes for this request, * so he can at least get the header and the first LUN. Otherwise * we reject the request (per SPC-3 rev 14, section 6.21). */ if (alloc_len < (sizeof(struct scsi_report_luns_data) + sizeof(struct scsi_report_luns_lundata))) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 6, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (retval); } lun_datalen = sizeof(*lun_data) + (num_luns * sizeof(struct scsi_report_luns_lundata)); ctsio->kern_data_ptr = malloc(lun_datalen, M_CTL, M_WAITOK | M_ZERO); lun_data = (struct scsi_report_luns_data *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; initidx = ctl_get_initindex(&ctsio->io_hdr.nexus); mtx_lock(&softc->ctl_lock); for (targ_lun_id = 0, num_filled = 0; targ_lun_id < num_port_luns && num_filled < num_luns; targ_lun_id++) { lun_id = ctl_lun_map_from_port(port, targ_lun_id); if (lun_id == UINT32_MAX) continue; lun = softc->ctl_luns[lun_id]; if (lun == NULL) continue; be64enc(lun_data->luns[num_filled++].lundata, ctl_encode_lun(targ_lun_id)); /* * According to SPC-3, rev 14 section 6.21: * * "The execution of a REPORT LUNS command to any valid and * installed logical unit shall clear the REPORTED LUNS DATA * HAS CHANGED unit attention condition for all logical * units of that target with respect to the requesting * initiator. A valid and installed logical unit is one * having a PERIPHERAL QUALIFIER of 000b in the standard * INQUIRY data (see 6.4.2)." * * If request_lun is NULL, the LUN this report luns command * was issued to is either disabled or doesn't exist. In that * case, we shouldn't clear any pending lun change unit * attention. */ if (request_lun != NULL) { mtx_lock(&lun->lun_lock); ctl_clr_ua(lun, initidx, CTL_UA_LUN_CHANGE); mtx_unlock(&lun->lun_lock); } } mtx_unlock(&softc->ctl_lock); /* * It's quite possible that we've returned fewer LUNs than we allocated * space for. Trim it. */ lun_datalen = sizeof(*lun_data) + (num_filled * sizeof(struct scsi_report_luns_lundata)); ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->kern_data_len = min(lun_datalen, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; /* * We set this to the actual data length, regardless of how much * space we actually have to return results. If the user looks at * this value, he'll know whether or not he allocated enough space * and reissue the command if necessary. We don't support well * known logical units, so if the user asks for that, return none. */ scsi_ulto4b(lun_datalen - 8, lun_data->length); /* * We can only return SCSI_STATUS_CHECK_COND when we can't satisfy * this request. */ ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (retval); } int ctl_request_sense(struct ctl_scsiio *ctsio) { struct ctl_softc *softc = CTL_SOFTC(ctsio); struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_request_sense *cdb; struct scsi_sense_data *sense_ptr, *ps; uint32_t initidx; int have_error; u_int sense_len = SSD_FULL_SIZE; scsi_sense_data_type sense_format; ctl_ua_type ua_type; uint8_t asc = 0, ascq = 0; cdb = (struct scsi_request_sense *)ctsio->cdb; CTL_DEBUG_PRINT(("ctl_request_sense\n")); /* * Determine which sense format the user wants. */ if (cdb->byte2 & SRS_DESC) sense_format = SSD_TYPE_DESC; else sense_format = SSD_TYPE_FIXED; ctsio->kern_data_ptr = malloc(sizeof(*sense_ptr), M_CTL, M_WAITOK); sense_ptr = (struct scsi_sense_data *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; /* * struct scsi_sense_data, which is currently set to 256 bytes, is * larger than the largest allowed value for the length field in the * REQUEST SENSE CDB, which is 252 bytes as of SPC-4. */ ctsio->kern_data_len = cdb->length; ctsio->kern_total_len = cdb->length; /* * If we don't have a LUN, we don't have any pending sense. */ if (lun == NULL || ((lun->flags & CTL_LUN_PRIMARY_SC) == 0 && softc->ha_link < CTL_HA_LINK_UNKNOWN)) { /* "Logical unit not supported" */ ctl_set_sense_data(sense_ptr, &sense_len, NULL, sense_format, /*current_error*/ 1, /*sense_key*/ SSD_KEY_ILLEGAL_REQUEST, /*asc*/ 0x25, /*ascq*/ 0x00, SSD_ELEM_NONE); goto send; } have_error = 0; initidx = ctl_get_initindex(&ctsio->io_hdr.nexus); /* * Check for pending sense, and then for pending unit attentions. * Pending sense gets returned first, then pending unit attentions. */ mtx_lock(&lun->lun_lock); ps = lun->pending_sense[initidx / CTL_MAX_INIT_PER_PORT]; if (ps != NULL) ps += initidx % CTL_MAX_INIT_PER_PORT; if (ps != NULL && ps->error_code != 0) { scsi_sense_data_type stored_format; /* * Check to see which sense format was used for the stored * sense data. */ stored_format = scsi_sense_type(ps); /* * If the user requested a different sense format than the * one we stored, then we need to convert it to the other * format. If we're going from descriptor to fixed format * sense data, we may lose things in translation, depending * on what options were used. * * If the stored format is SSD_TYPE_NONE (i.e. invalid), * for some reason we'll just copy it out as-is. */ if ((stored_format == SSD_TYPE_FIXED) && (sense_format == SSD_TYPE_DESC)) ctl_sense_to_desc((struct scsi_sense_data_fixed *) ps, (struct scsi_sense_data_desc *)sense_ptr); else if ((stored_format == SSD_TYPE_DESC) && (sense_format == SSD_TYPE_FIXED)) ctl_sense_to_fixed((struct scsi_sense_data_desc *) ps, (struct scsi_sense_data_fixed *)sense_ptr); else memcpy(sense_ptr, ps, sizeof(*sense_ptr)); ps->error_code = 0; have_error = 1; } else { ua_type = ctl_build_ua(lun, initidx, sense_ptr, &sense_len, sense_format); if (ua_type != CTL_UA_NONE) have_error = 1; } if (have_error == 0) { /* * Report informational exception if have one and allowed. */ if (lun->MODE_IE.mrie != SIEP_MRIE_NO) { asc = lun->ie_asc; ascq = lun->ie_ascq; } ctl_set_sense_data(sense_ptr, &sense_len, lun, sense_format, /*current_error*/ 1, /*sense_key*/ SSD_KEY_NO_SENSE, /*asc*/ asc, /*ascq*/ ascq, SSD_ELEM_NONE); } mtx_unlock(&lun->lun_lock); send: /* * We report the SCSI status as OK, since the status of the command * itself is OK. We're reporting sense as parameter data. */ ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_tur(struct ctl_scsiio *ctsio) { CTL_DEBUG_PRINT(("ctl_tur\n")); ctl_set_success(ctsio); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * SCSI VPD page 0x00, the Supported VPD Pages page. */ static int ctl_inquiry_evpd_supported(struct ctl_scsiio *ctsio, int alloc_len) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_vpd_supported_pages *pages; int sup_page_size; int p; sup_page_size = sizeof(struct scsi_vpd_supported_pages) * SCSI_EVPD_NUM_SUPPORTED_PAGES; ctsio->kern_data_ptr = malloc(sup_page_size, M_CTL, M_WAITOK | M_ZERO); pages = (struct scsi_vpd_supported_pages *)ctsio->kern_data_ptr; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->kern_data_len = min(sup_page_size, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. Need to change this * to figure out whether the disk device is actually online or not. */ if (lun != NULL) pages->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else pages->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; p = 0; /* Supported VPD pages */ pages->page_list[p++] = SVPD_SUPPORTED_PAGES; /* Serial Number */ pages->page_list[p++] = SVPD_UNIT_SERIAL_NUMBER; /* Device Identification */ pages->page_list[p++] = SVPD_DEVICE_ID; /* Extended INQUIRY Data */ pages->page_list[p++] = SVPD_EXTENDED_INQUIRY_DATA; /* Mode Page Policy */ pages->page_list[p++] = SVPD_MODE_PAGE_POLICY; /* SCSI Ports */ pages->page_list[p++] = SVPD_SCSI_PORTS; /* Third-party Copy */ pages->page_list[p++] = SVPD_SCSI_TPC; + /* SCSI Feature Sets */ + pages->page_list[p++] = SVPD_SCSI_SFS; if (lun != NULL && lun->be_lun->lun_type == T_DIRECT) { /* Block limits */ pages->page_list[p++] = SVPD_BLOCK_LIMITS; /* Block Device Characteristics */ pages->page_list[p++] = SVPD_BDC; /* Logical Block Provisioning */ pages->page_list[p++] = SVPD_LBP; } pages->length = p; ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * SCSI VPD page 0x80, the Unit Serial Number page. */ static int ctl_inquiry_evpd_serial(struct ctl_scsiio *ctsio, int alloc_len) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_vpd_unit_serial_number *sn_ptr; int data_len; data_len = 4 + CTL_SN_LEN; ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); sn_ptr = (struct scsi_vpd_unit_serial_number *)ctsio->kern_data_ptr; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->kern_data_len = min(data_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. Need to change this * to figure out whether the disk device is actually online or not. */ if (lun != NULL) sn_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else sn_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; sn_ptr->page_code = SVPD_UNIT_SERIAL_NUMBER; sn_ptr->length = CTL_SN_LEN; /* * If we don't have a LUN, we just leave the serial number as * all spaces. */ if (lun != NULL) { strncpy((char *)sn_ptr->serial_num, (char *)lun->be_lun->serial_num, CTL_SN_LEN); } else memset(sn_ptr->serial_num, 0x20, CTL_SN_LEN); ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * SCSI VPD page 0x86, the Extended INQUIRY Data page. */ static int ctl_inquiry_evpd_eid(struct ctl_scsiio *ctsio, int alloc_len) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_vpd_extended_inquiry_data *eid_ptr; int data_len; data_len = sizeof(struct scsi_vpd_extended_inquiry_data); ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); eid_ptr = (struct scsi_vpd_extended_inquiry_data *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; ctsio->kern_data_len = min(data_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. */ if (lun != NULL) eid_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else eid_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; eid_ptr->page_code = SVPD_EXTENDED_INQUIRY_DATA; scsi_ulto2b(data_len - 4, eid_ptr->page_length); /* * We support head of queue, ordered and simple tags. */ eid_ptr->flags2 = SVPD_EID_HEADSUP | SVPD_EID_ORDSUP | SVPD_EID_SIMPSUP; /* * Volatile cache supported. */ eid_ptr->flags3 = SVPD_EID_V_SUP; /* * This means that we clear the REPORTED LUNS DATA HAS CHANGED unit * attention for a particular IT nexus on all LUNs once we report * it to that nexus once. This bit is required as of SPC-4. */ eid_ptr->flags4 = SVPD_EID_LUICLR; /* * We support revert to defaults (RTD) bit in MODE SELECT. */ eid_ptr->flags5 = SVPD_EID_RTD_SUP; /* * XXX KDM in order to correctly answer this, we would need * information from the SIM to determine how much sense data it * can send. So this would really be a path inquiry field, most * likely. This can be set to a maximum of 252 according to SPC-4, * but the hardware may or may not be able to support that much. * 0 just means that the maximum sense data length is not reported. */ eid_ptr->max_sense_length = 0; ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static int ctl_inquiry_evpd_mpp(struct ctl_scsiio *ctsio, int alloc_len) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_vpd_mode_page_policy *mpp_ptr; int data_len; data_len = sizeof(struct scsi_vpd_mode_page_policy) + sizeof(struct scsi_vpd_mode_page_policy_descr); ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); mpp_ptr = (struct scsi_vpd_mode_page_policy *)ctsio->kern_data_ptr; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->kern_data_len = min(data_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. */ if (lun != NULL) mpp_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else mpp_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; mpp_ptr->page_code = SVPD_MODE_PAGE_POLICY; scsi_ulto2b(data_len - 4, mpp_ptr->page_length); mpp_ptr->descr[0].page_code = 0x3f; mpp_ptr->descr[0].subpage_code = 0xff; mpp_ptr->descr[0].policy = SVPD_MPP_SHARED; ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * SCSI VPD page 0x83, the Device Identification page. */ static int ctl_inquiry_evpd_devid(struct ctl_scsiio *ctsio, int alloc_len) { struct ctl_softc *softc = CTL_SOFTC(ctsio); struct ctl_port *port = CTL_PORT(ctsio); struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_vpd_device_id *devid_ptr; struct scsi_vpd_id_descriptor *desc; int data_len, g; uint8_t proto; data_len = sizeof(struct scsi_vpd_device_id) + sizeof(struct scsi_vpd_id_descriptor) + sizeof(struct scsi_vpd_id_rel_trgt_port_id) + sizeof(struct scsi_vpd_id_descriptor) + sizeof(struct scsi_vpd_id_trgt_port_grp_id); if (lun && lun->lun_devid) data_len += lun->lun_devid->len; if (port && port->port_devid) data_len += port->port_devid->len; if (port && port->target_devid) data_len += port->target_devid->len; ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); devid_ptr = (struct scsi_vpd_device_id *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->kern_data_len = min(data_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. */ if (lun != NULL) devid_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else devid_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; devid_ptr->page_code = SVPD_DEVICE_ID; scsi_ulto2b(data_len - 4, devid_ptr->length); if (port && port->port_type == CTL_PORT_FC) proto = SCSI_PROTO_FC << 4; else if (port && port->port_type == CTL_PORT_SAS) proto = SCSI_PROTO_SAS << 4; else if (port && port->port_type == CTL_PORT_ISCSI) proto = SCSI_PROTO_ISCSI << 4; else proto = SCSI_PROTO_SPI << 4; desc = (struct scsi_vpd_id_descriptor *)devid_ptr->desc_list; /* * We're using a LUN association here. i.e., this device ID is a * per-LUN identifier. */ if (lun && lun->lun_devid) { memcpy(desc, lun->lun_devid->data, lun->lun_devid->len); desc = (struct scsi_vpd_id_descriptor *)((uint8_t *)desc + lun->lun_devid->len); } /* * This is for the WWPN which is a port association. */ if (port && port->port_devid) { memcpy(desc, port->port_devid->data, port->port_devid->len); desc = (struct scsi_vpd_id_descriptor *)((uint8_t *)desc + port->port_devid->len); } /* * This is for the Relative Target Port(type 4h) identifier */ desc->proto_codeset = proto | SVPD_ID_CODESET_BINARY; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_PORT | SVPD_ID_TYPE_RELTARG; desc->length = 4; scsi_ulto2b(ctsio->io_hdr.nexus.targ_port, &desc->identifier[2]); desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] + sizeof(struct scsi_vpd_id_rel_trgt_port_id)); /* * This is for the Target Port Group(type 5h) identifier */ desc->proto_codeset = proto | SVPD_ID_CODESET_BINARY; desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_PORT | SVPD_ID_TYPE_TPORTGRP; desc->length = 4; if (softc->is_single || (port && port->status & CTL_PORT_STATUS_HA_SHARED)) g = 1; else g = 2 + ctsio->io_hdr.nexus.targ_port / softc->port_cnt; scsi_ulto2b(g, &desc->identifier[2]); desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] + sizeof(struct scsi_vpd_id_trgt_port_grp_id)); /* * This is for the Target identifier */ if (port && port->target_devid) { memcpy(desc, port->target_devid->data, port->target_devid->len); } ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static int ctl_inquiry_evpd_scsi_ports(struct ctl_scsiio *ctsio, int alloc_len) { struct ctl_softc *softc = CTL_SOFTC(ctsio); struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_vpd_scsi_ports *sp; struct scsi_vpd_port_designation *pd; struct scsi_vpd_port_designation_cont *pdc; struct ctl_port *port; int data_len, num_target_ports, iid_len, id_len; num_target_ports = 0; iid_len = 0; id_len = 0; mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(port, &softc->port_list, links) { if ((port->status & CTL_PORT_STATUS_ONLINE) == 0) continue; if (lun != NULL && ctl_lun_map_to_port(port, lun->lun) == UINT32_MAX) continue; num_target_ports++; if (port->init_devid) iid_len += port->init_devid->len; if (port->port_devid) id_len += port->port_devid->len; } mtx_unlock(&softc->ctl_lock); data_len = sizeof(struct scsi_vpd_scsi_ports) + num_target_ports * (sizeof(struct scsi_vpd_port_designation) + sizeof(struct scsi_vpd_port_designation_cont)) + iid_len + id_len; ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); sp = (struct scsi_vpd_scsi_ports *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->kern_data_len = min(data_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. Need to change this * to figure out whether the disk device is actually online or not. */ if (lun != NULL) sp->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else sp->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; sp->page_code = SVPD_SCSI_PORTS; scsi_ulto2b(data_len - sizeof(struct scsi_vpd_scsi_ports), sp->page_length); pd = &sp->design[0]; mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(port, &softc->port_list, links) { if ((port->status & CTL_PORT_STATUS_ONLINE) == 0) continue; if (lun != NULL && ctl_lun_map_to_port(port, lun->lun) == UINT32_MAX) continue; scsi_ulto2b(port->targ_port, pd->relative_port_id); if (port->init_devid) { iid_len = port->init_devid->len; memcpy(pd->initiator_transportid, port->init_devid->data, port->init_devid->len); } else iid_len = 0; scsi_ulto2b(iid_len, pd->initiator_transportid_length); pdc = (struct scsi_vpd_port_designation_cont *) (&pd->initiator_transportid[iid_len]); if (port->port_devid) { id_len = port->port_devid->len; memcpy(pdc->target_port_descriptors, port->port_devid->data, port->port_devid->len); } else id_len = 0; scsi_ulto2b(id_len, pdc->target_port_descriptors_length); pd = (struct scsi_vpd_port_designation *) ((uint8_t *)pdc->target_port_descriptors + id_len); } mtx_unlock(&softc->ctl_lock); ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static int +ctl_inquiry_evpd_sfs(struct ctl_scsiio *ctsio, int alloc_len) +{ + struct ctl_lun *lun = CTL_LUN(ctsio); + struct scsi_vpd_sfs *sfs_ptr; + int sfs_page_size, n; + + sfs_page_size = sizeof(*sfs_ptr) + 5 * 2; + ctsio->kern_data_ptr = malloc(sfs_page_size, M_CTL, M_WAITOK | M_ZERO); + sfs_ptr = (struct scsi_vpd_sfs *)ctsio->kern_data_ptr; + ctsio->kern_sg_entries = 0; + ctsio->kern_rel_offset = 0; + ctsio->kern_sg_entries = 0; + ctsio->kern_data_len = min(sfs_page_size, alloc_len); + ctsio->kern_total_len = ctsio->kern_data_len; + + /* + * The control device is always connected. The disk device, on the + * other hand, may not be online all the time. Need to change this + * to figure out whether the disk device is actually online or not. + */ + if (lun != NULL) + sfs_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | + lun->be_lun->lun_type; + else + sfs_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; + + sfs_ptr->page_code = SVPD_SCSI_SFS; + n = 0; + /* Discovery 2016 */ + scsi_ulto2b(0x0001, &sfs_ptr->codes[2 * n++]); + if (lun != NULL && lun->be_lun->lun_type == T_DIRECT) { + /* SBC Base 2016 */ + scsi_ulto2b(0x0101, &sfs_ptr->codes[2 * n++]); + /* SBC Base 2010 */ + scsi_ulto2b(0x0102, &sfs_ptr->codes[2 * n++]); + if (lun->be_lun->flags & CTL_LUN_FLAG_UNMAP) { + /* Basic Provisioning 2016 */ + scsi_ulto2b(0x0103, &sfs_ptr->codes[2 * n++]); + } + /* Drive Maintenance 2016 */ + //scsi_ulto2b(0x0104, &sfs_ptr->codes[2 * n++]); + } + scsi_ulto2b(4 + 2 * n, sfs_ptr->page_length); + + ctl_set_success(ctsio); + ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; + ctsio->be_move_done = ctl_config_move_done; + ctl_datamove((union ctl_io *)ctsio); + return (CTL_RETVAL_COMPLETE); +} + +static int ctl_inquiry_evpd_block_limits(struct ctl_scsiio *ctsio, int alloc_len) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_vpd_block_limits *bl_ptr; const char *val; uint64_t ival; ctsio->kern_data_ptr = malloc(sizeof(*bl_ptr), M_CTL, M_WAITOK | M_ZERO); bl_ptr = (struct scsi_vpd_block_limits *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; ctsio->kern_sg_entries = 0; ctsio->kern_data_len = min(sizeof(*bl_ptr), alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. Need to change this * to figure out whether the disk device is actually online or not. */ if (lun != NULL) bl_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else bl_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; bl_ptr->page_code = SVPD_BLOCK_LIMITS; scsi_ulto2b(sizeof(*bl_ptr) - 4, bl_ptr->page_length); bl_ptr->max_cmp_write_len = 0xff; scsi_ulto4b(0xffffffff, bl_ptr->max_txfer_len); if (lun != NULL) { scsi_ulto4b(lun->be_lun->opttxferlen, bl_ptr->opt_txfer_len); if (lun->be_lun->flags & CTL_LUN_FLAG_UNMAP) { ival = 0xffffffff; val = dnvlist_get_string(lun->be_lun->options, "unmap_max_lba", NULL); if (val != NULL) ctl_expand_number(val, &ival); scsi_ulto4b(ival, bl_ptr->max_unmap_lba_cnt); ival = 0xffffffff; val = dnvlist_get_string(lun->be_lun->options, "unmap_max_descr", NULL); if (val != NULL) ctl_expand_number(val, &ival); scsi_ulto4b(ival, bl_ptr->max_unmap_blk_cnt); if (lun->be_lun->ublockexp != 0) { scsi_ulto4b((1 << lun->be_lun->ublockexp), bl_ptr->opt_unmap_grain); scsi_ulto4b(0x80000000 | lun->be_lun->ublockoff, bl_ptr->unmap_grain_align); } } scsi_ulto4b(lun->be_lun->atomicblock, bl_ptr->max_atomic_transfer_length); scsi_ulto4b(0, bl_ptr->atomic_alignment); scsi_ulto4b(0, bl_ptr->atomic_transfer_length_granularity); scsi_ulto4b(0, bl_ptr->max_atomic_transfer_length_with_atomic_boundary); scsi_ulto4b(0, bl_ptr->max_atomic_boundary_size); ival = UINT64_MAX; val = dnvlist_get_string(lun->be_lun->options, "write_same_max_lba", NULL); if (val != NULL) ctl_expand_number(val, &ival); scsi_u64to8b(ival, bl_ptr->max_write_same_length); } ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static int ctl_inquiry_evpd_bdc(struct ctl_scsiio *ctsio, int alloc_len) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_vpd_block_device_characteristics *bdc_ptr; const char *value; u_int i; ctsio->kern_data_ptr = malloc(sizeof(*bdc_ptr), M_CTL, M_WAITOK | M_ZERO); bdc_ptr = (struct scsi_vpd_block_device_characteristics *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; ctsio->kern_data_len = min(sizeof(*bdc_ptr), alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. Need to change this * to figure out whether the disk device is actually online or not. */ if (lun != NULL) bdc_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else bdc_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; bdc_ptr->page_code = SVPD_BDC; scsi_ulto2b(sizeof(*bdc_ptr) - 4, bdc_ptr->page_length); if (lun != NULL && (value = dnvlist_get_string(lun->be_lun->options, "rpm", NULL)) != NULL) i = strtol(value, NULL, 0); else i = CTL_DEFAULT_ROTATION_RATE; scsi_ulto2b(i, bdc_ptr->medium_rotation_rate); if (lun != NULL && (value = dnvlist_get_string(lun->be_lun->options, "formfactor", NULL)) != NULL) i = strtol(value, NULL, 0); else i = 0; bdc_ptr->wab_wac_ff = (i & 0x0f); - bdc_ptr->flags = SVPD_FUAB | SVPD_VBULS; + bdc_ptr->flags = SVPD_RBWZ | SVPD_FUAB | SVPD_VBULS; ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static int ctl_inquiry_evpd_lbp(struct ctl_scsiio *ctsio, int alloc_len) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_vpd_logical_block_prov *lbp_ptr; const char *value; ctsio->kern_data_ptr = malloc(sizeof(*lbp_ptr), M_CTL, M_WAITOK | M_ZERO); lbp_ptr = (struct scsi_vpd_logical_block_prov *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; ctsio->kern_data_len = min(sizeof(*lbp_ptr), alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; /* * The control device is always connected. The disk device, on the * other hand, may not be online all the time. Need to change this * to figure out whether the disk device is actually online or not. */ if (lun != NULL) lbp_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; else lbp_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT; lbp_ptr->page_code = SVPD_LBP; scsi_ulto2b(sizeof(*lbp_ptr) - 4, lbp_ptr->page_length); lbp_ptr->threshold_exponent = CTL_LBP_EXPONENT; if (lun != NULL && lun->be_lun->flags & CTL_LUN_FLAG_UNMAP) { lbp_ptr->flags = SVPD_LBP_UNMAP | SVPD_LBP_WS16 | SVPD_LBP_WS10 | SVPD_LBP_RZ | SVPD_LBP_ANC_SUP; value = dnvlist_get_string(lun->be_lun->options, "provisioning_type", NULL); if (value != NULL) { if (strcmp(value, "resource") == 0) lbp_ptr->prov_type = SVPD_LBP_RESOURCE; else if (strcmp(value, "thin") == 0) lbp_ptr->prov_type = SVPD_LBP_THIN; } else lbp_ptr->prov_type = SVPD_LBP_THIN; } ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * INQUIRY with the EVPD bit set. */ static int ctl_inquiry_evpd(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_inquiry *cdb; int alloc_len, retval; cdb = (struct scsi_inquiry *)ctsio->cdb; alloc_len = scsi_2btoul(cdb->length); switch (cdb->page_code) { case SVPD_SUPPORTED_PAGES: retval = ctl_inquiry_evpd_supported(ctsio, alloc_len); break; case SVPD_UNIT_SERIAL_NUMBER: retval = ctl_inquiry_evpd_serial(ctsio, alloc_len); break; case SVPD_DEVICE_ID: retval = ctl_inquiry_evpd_devid(ctsio, alloc_len); break; case SVPD_EXTENDED_INQUIRY_DATA: retval = ctl_inquiry_evpd_eid(ctsio, alloc_len); break; case SVPD_MODE_PAGE_POLICY: retval = ctl_inquiry_evpd_mpp(ctsio, alloc_len); break; case SVPD_SCSI_PORTS: retval = ctl_inquiry_evpd_scsi_ports(ctsio, alloc_len); break; case SVPD_SCSI_TPC: retval = ctl_inquiry_evpd_tpc(ctsio, alloc_len); + break; + case SVPD_SCSI_SFS: + retval = ctl_inquiry_evpd_sfs(ctsio, alloc_len); break; case SVPD_BLOCK_LIMITS: if (lun == NULL || lun->be_lun->lun_type != T_DIRECT) goto err; retval = ctl_inquiry_evpd_block_limits(ctsio, alloc_len); break; case SVPD_BDC: if (lun == NULL || lun->be_lun->lun_type != T_DIRECT) goto err; retval = ctl_inquiry_evpd_bdc(ctsio, alloc_len); break; case SVPD_LBP: if (lun == NULL || lun->be_lun->lun_type != T_DIRECT) goto err; retval = ctl_inquiry_evpd_lbp(ctsio, alloc_len); break; default: err: ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); retval = CTL_RETVAL_COMPLETE; break; } return (retval); } /* * Standard INQUIRY data. */ static int ctl_inquiry_std(struct ctl_scsiio *ctsio) { struct ctl_softc *softc = CTL_SOFTC(ctsio); struct ctl_port *port = CTL_PORT(ctsio); struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_inquiry_data *inq_ptr; struct scsi_inquiry *cdb; const char *val; uint32_t alloc_len, data_len; ctl_port_type port_type; port_type = port->port_type; if (port_type == CTL_PORT_IOCTL || port_type == CTL_PORT_INTERNAL) port_type = CTL_PORT_SCSI; cdb = (struct scsi_inquiry *)ctsio->cdb; alloc_len = scsi_2btoul(cdb->length); /* * We malloc the full inquiry data size here and fill it * in. If the user only asks for less, we'll give him * that much. */ data_len = offsetof(struct scsi_inquiry_data, vendor_specific1); ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); inq_ptr = (struct scsi_inquiry_data *)ctsio->kern_data_ptr; ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; ctsio->kern_data_len = min(data_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; if (lun != NULL) { if ((lun->flags & CTL_LUN_PRIMARY_SC) || softc->ha_link >= CTL_HA_LINK_UNKNOWN) { inq_ptr->device = (SID_QUAL_LU_CONNECTED << 5) | lun->be_lun->lun_type; } else { inq_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | lun->be_lun->lun_type; } if (lun->flags & CTL_LUN_REMOVABLE) inq_ptr->dev_qual2 |= SID_RMB; } else inq_ptr->device = (SID_QUAL_BAD_LU << 5) | T_NODEVICE; /* RMB in byte 2 is 0 */ inq_ptr->version = SCSI_REV_SPC5; /* * According to SAM-3, even if a device only supports a single * level of LUN addressing, it should still set the HISUP bit: * * 4.9.1 Logical unit numbers overview * * All logical unit number formats described in this standard are * hierarchical in structure even when only a single level in that * hierarchy is used. The HISUP bit shall be set to one in the * standard INQUIRY data (see SPC-2) when any logical unit number * format described in this standard is used. Non-hierarchical * formats are outside the scope of this standard. * * Therefore we set the HiSup bit here. * * The response format is 2, per SPC-3. */ inq_ptr->response_format = SID_HiSup | 2; inq_ptr->additional_length = data_len - (offsetof(struct scsi_inquiry_data, additional_length) + 1); CTL_DEBUG_PRINT(("additional_length = %d\n", inq_ptr->additional_length)); inq_ptr->spc3_flags = SPC3_SID_3PC | SPC3_SID_TPGS_IMPLICIT; if (port_type == CTL_PORT_SCSI) inq_ptr->spc2_flags = SPC2_SID_ADDR16; inq_ptr->spc2_flags |= SPC2_SID_MultiP; inq_ptr->flags = SID_CmdQue; if (port_type == CTL_PORT_SCSI) inq_ptr->flags |= SID_WBus16 | SID_Sync; /* * Per SPC-3, unused bytes in ASCII strings are filled with spaces. * We have 8 bytes for the vendor name, and 16 bytes for the device * name and 4 bytes for the revision. */ if (lun == NULL || (val = dnvlist_get_string(lun->be_lun->options, "vendor", NULL)) == NULL) { strncpy(inq_ptr->vendor, CTL_VENDOR, sizeof(inq_ptr->vendor)); } else { memset(inq_ptr->vendor, ' ', sizeof(inq_ptr->vendor)); strncpy(inq_ptr->vendor, val, min(sizeof(inq_ptr->vendor), strlen(val))); } if (lun == NULL) { strncpy(inq_ptr->product, CTL_DIRECT_PRODUCT, sizeof(inq_ptr->product)); } else if ((val = dnvlist_get_string(lun->be_lun->options, "product", NULL)) == NULL) { switch (lun->be_lun->lun_type) { case T_DIRECT: strncpy(inq_ptr->product, CTL_DIRECT_PRODUCT, sizeof(inq_ptr->product)); break; case T_PROCESSOR: strncpy(inq_ptr->product, CTL_PROCESSOR_PRODUCT, sizeof(inq_ptr->product)); break; case T_CDROM: strncpy(inq_ptr->product, CTL_CDROM_PRODUCT, sizeof(inq_ptr->product)); break; default: strncpy(inq_ptr->product, CTL_UNKNOWN_PRODUCT, sizeof(inq_ptr->product)); break; } } else { memset(inq_ptr->product, ' ', sizeof(inq_ptr->product)); strncpy(inq_ptr->product, val, min(sizeof(inq_ptr->product), strlen(val))); } /* * XXX make this a macro somewhere so it automatically gets * incremented when we make changes. */ if (lun == NULL || (val = dnvlist_get_string(lun->be_lun->options, "revision", NULL)) == NULL) { strncpy(inq_ptr->revision, "0001", sizeof(inq_ptr->revision)); } else { memset(inq_ptr->revision, ' ', sizeof(inq_ptr->revision)); strncpy(inq_ptr->revision, val, min(sizeof(inq_ptr->revision), strlen(val))); } /* * For parallel SCSI, we support double transition and single * transition clocking. We also support QAS (Quick Arbitration * and Selection) and Information Unit transfers on both the * control and array devices. */ if (port_type == CTL_PORT_SCSI) inq_ptr->spi3data = SID_SPI_CLOCK_DT_ST | SID_SPI_QAS | SID_SPI_IUS; /* SAM-6 (no version claimed) */ scsi_ulto2b(0x00C0, inq_ptr->version1); /* SPC-5 (no version claimed) */ scsi_ulto2b(0x05C0, inq_ptr->version2); if (port_type == CTL_PORT_FC) { /* FCP-2 ANSI INCITS.350:2003 */ scsi_ulto2b(0x0917, inq_ptr->version3); } else if (port_type == CTL_PORT_SCSI) { /* SPI-4 ANSI INCITS.362:200x */ scsi_ulto2b(0x0B56, inq_ptr->version3); } else if (port_type == CTL_PORT_ISCSI) { /* iSCSI (no version claimed) */ scsi_ulto2b(0x0960, inq_ptr->version3); } else if (port_type == CTL_PORT_SAS) { /* SAS (no version claimed) */ scsi_ulto2b(0x0BE0, inq_ptr->version3); } else if (port_type == CTL_PORT_UMASS) { /* USB Mass Storage Class Bulk-Only Transport, Revision 1.0 */ scsi_ulto2b(0x1730, inq_ptr->version3); } if (lun == NULL) { /* SBC-4 (no version claimed) */ scsi_ulto2b(0x0600, inq_ptr->version4); } else { switch (lun->be_lun->lun_type) { case T_DIRECT: /* SBC-4 (no version claimed) */ scsi_ulto2b(0x0600, inq_ptr->version4); break; case T_PROCESSOR: break; case T_CDROM: /* MMC-6 (no version claimed) */ scsi_ulto2b(0x04E0, inq_ptr->version4); break; default: break; } } ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_inquiry(struct ctl_scsiio *ctsio) { struct scsi_inquiry *cdb; int retval; CTL_DEBUG_PRINT(("ctl_inquiry\n")); cdb = (struct scsi_inquiry *)ctsio->cdb; if (cdb->byte2 & SI_EVPD) retval = ctl_inquiry_evpd(ctsio); else if (cdb->page_code == 0) retval = ctl_inquiry_std(ctsio); else { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } return (retval); } int ctl_get_config(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_get_config_header *hdr; struct scsi_get_config_feature *feature; struct scsi_get_config *cdb; uint32_t alloc_len, data_len; int rt, starting; cdb = (struct scsi_get_config *)ctsio->cdb; rt = (cdb->rt & SGC_RT_MASK); starting = scsi_2btoul(cdb->starting_feature); alloc_len = scsi_2btoul(cdb->length); data_len = sizeof(struct scsi_get_config_header) + sizeof(struct scsi_get_config_feature) + 8 + sizeof(struct scsi_get_config_feature) + 8 + sizeof(struct scsi_get_config_feature) + 4 + sizeof(struct scsi_get_config_feature) + 4 + sizeof(struct scsi_get_config_feature) + 8 + sizeof(struct scsi_get_config_feature) + sizeof(struct scsi_get_config_feature) + 4 + sizeof(struct scsi_get_config_feature) + 4 + sizeof(struct scsi_get_config_feature) + 4 + sizeof(struct scsi_get_config_feature) + 4 + sizeof(struct scsi_get_config_feature) + 4 + sizeof(struct scsi_get_config_feature) + 4; ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; hdr = (struct scsi_get_config_header *)ctsio->kern_data_ptr; if (lun->flags & CTL_LUN_NO_MEDIA) scsi_ulto2b(0x0000, hdr->current_profile); else scsi_ulto2b(0x0010, hdr->current_profile); feature = (struct scsi_get_config_feature *)(hdr + 1); if (starting > 0x003b) goto done; if (starting > 0x003a) goto f3b; if (starting > 0x002b) goto f3a; if (starting > 0x002a) goto f2b; if (starting > 0x001f) goto f2a; if (starting > 0x001e) goto f1f; if (starting > 0x001d) goto f1e; if (starting > 0x0010) goto f1d; if (starting > 0x0003) goto f10; if (starting > 0x0002) goto f3; if (starting > 0x0001) goto f2; if (starting > 0x0000) goto f1; /* Profile List */ scsi_ulto2b(0x0000, feature->feature_code); feature->flags = SGC_F_PERSISTENT | SGC_F_CURRENT; feature->add_length = 8; scsi_ulto2b(0x0008, &feature->feature_data[0]); /* CD-ROM */ feature->feature_data[2] = 0x00; scsi_ulto2b(0x0010, &feature->feature_data[4]); /* DVD-ROM */ feature->feature_data[6] = 0x01; feature = (struct scsi_get_config_feature *) &feature->feature_data[feature->add_length]; f1: /* Core */ scsi_ulto2b(0x0001, feature->feature_code); feature->flags = 0x08 | SGC_F_PERSISTENT | SGC_F_CURRENT; feature->add_length = 8; scsi_ulto4b(0x00000000, &feature->feature_data[0]); feature->feature_data[4] = 0x03; feature = (struct scsi_get_config_feature *) &feature->feature_data[feature->add_length]; f2: /* Morphing */ scsi_ulto2b(0x0002, feature->feature_code); feature->flags = 0x04 | SGC_F_PERSISTENT | SGC_F_CURRENT; feature->add_length = 4; feature->feature_data[0] = 0x02; feature = (struct scsi_get_config_feature *) &feature->feature_data[feature->add_length]; f3: /* Removable Medium */ scsi_ulto2b(0x0003, feature->feature_code); feature->flags = 0x04 | SGC_F_PERSISTENT | SGC_F_CURRENT; feature->add_length = 4; feature->feature_data[0] = 0x39; feature = (struct scsi_get_config_feature *) &feature->feature_data[feature->add_length]; if (rt == SGC_RT_CURRENT && (lun->flags & CTL_LUN_NO_MEDIA)) goto done; f10: /* Random Read */ scsi_ulto2b(0x0010, feature->feature_code); feature->flags = 0x00; if ((lun->flags & CTL_LUN_NO_MEDIA) == 0) feature->flags |= SGC_F_CURRENT; feature->add_length = 8; scsi_ulto4b(lun->be_lun->blocksize, &feature->feature_data[0]); scsi_ulto2b(1, &feature->feature_data[4]); feature->feature_data[6] = 0x00; feature = (struct scsi_get_config_feature *) &feature->feature_data[feature->add_length]; f1d: /* Multi-Read */ scsi_ulto2b(0x001D, feature->feature_code); feature->flags = 0x00; if ((lun->flags & CTL_LUN_NO_MEDIA) == 0) feature->flags |= SGC_F_CURRENT; feature->add_length = 0; feature = (struct scsi_get_config_feature *) &feature->feature_data[feature->add_length]; f1e: /* CD Read */ scsi_ulto2b(0x001E, feature->feature_code); feature->flags = 0x00; if ((lun->flags & CTL_LUN_NO_MEDIA) == 0) feature->flags |= SGC_F_CURRENT; feature->add_length = 4; feature->feature_data[0] = 0x00; feature = (struct scsi_get_config_feature *) &feature->feature_data[feature->add_length]; f1f: /* DVD Read */ scsi_ulto2b(0x001F, feature->feature_code); feature->flags = 0x08; if ((lun->flags & CTL_LUN_NO_MEDIA) == 0) feature->flags |= SGC_F_CURRENT; feature->add_length = 4; feature->feature_data[0] = 0x01; feature->feature_data[2] = 0x03; feature = (struct scsi_get_config_feature *) &feature->feature_data[feature->add_length]; f2a: /* DVD+RW */ scsi_ulto2b(0x002A, feature->feature_code); feature->flags = 0x04; if ((lun->flags & CTL_LUN_NO_MEDIA) == 0) feature->flags |= SGC_F_CURRENT; feature->add_length = 4; feature->feature_data[0] = 0x00; feature->feature_data[1] = 0x00; feature = (struct scsi_get_config_feature *) &feature->feature_data[feature->add_length]; f2b: /* DVD+R */ scsi_ulto2b(0x002B, feature->feature_code); feature->flags = 0x00; if ((lun->flags & CTL_LUN_NO_MEDIA) == 0) feature->flags |= SGC_F_CURRENT; feature->add_length = 4; feature->feature_data[0] = 0x00; feature = (struct scsi_get_config_feature *) &feature->feature_data[feature->add_length]; f3a: /* DVD+RW Dual Layer */ scsi_ulto2b(0x003A, feature->feature_code); feature->flags = 0x00; if ((lun->flags & CTL_LUN_NO_MEDIA) == 0) feature->flags |= SGC_F_CURRENT; feature->add_length = 4; feature->feature_data[0] = 0x00; feature->feature_data[1] = 0x00; feature = (struct scsi_get_config_feature *) &feature->feature_data[feature->add_length]; f3b: /* DVD+R Dual Layer */ scsi_ulto2b(0x003B, feature->feature_code); feature->flags = 0x00; if ((lun->flags & CTL_LUN_NO_MEDIA) == 0) feature->flags |= SGC_F_CURRENT; feature->add_length = 4; feature->feature_data[0] = 0x00; feature = (struct scsi_get_config_feature *) &feature->feature_data[feature->add_length]; done: data_len = (uint8_t *)feature - (uint8_t *)hdr; if (rt == SGC_RT_SPECIFIC && data_len > 4) { feature = (struct scsi_get_config_feature *)(hdr + 1); if (scsi_2btoul(feature->feature_code) == starting) feature = (struct scsi_get_config_feature *) &feature->feature_data[feature->add_length]; data_len = (uint8_t *)feature - (uint8_t *)hdr; } scsi_ulto4b(data_len - 4, hdr->data_length); ctsio->kern_data_len = min(data_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_get_event_status(struct ctl_scsiio *ctsio) { struct scsi_get_event_status_header *hdr; struct scsi_get_event_status *cdb; uint32_t alloc_len, data_len; cdb = (struct scsi_get_event_status *)ctsio->cdb; if ((cdb->byte2 & SGESN_POLLED) == 0) { ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 1, /*bit_valid*/ 1, /*bit*/ 0); ctl_done((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } alloc_len = scsi_2btoul(cdb->length); data_len = sizeof(struct scsi_get_event_status_header); ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; ctsio->kern_data_len = min(data_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; hdr = (struct scsi_get_event_status_header *)ctsio->kern_data_ptr; scsi_ulto2b(0, hdr->descr_length); hdr->nea_class = SGESN_NEA; hdr->supported_class = 0; ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } int ctl_mechanism_status(struct ctl_scsiio *ctsio) { struct scsi_mechanism_status_header *hdr; struct scsi_mechanism_status *cdb; uint32_t alloc_len, data_len; cdb = (struct scsi_mechanism_status *)ctsio->cdb; alloc_len = scsi_2btoul(cdb->length); data_len = sizeof(struct scsi_mechanism_status_header); ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; ctsio->kern_data_len = min(data_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; hdr = (struct scsi_mechanism_status_header *)ctsio->kern_data_ptr; hdr->state1 = 0x00; hdr->state2 = 0xe0; scsi_ulto3b(0, hdr->lba); hdr->slots_num = 0; scsi_ulto2b(0, hdr->slots_length); ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } static void ctl_ultomsf(uint32_t lba, uint8_t *buf) { lba += 150; buf[0] = 0; buf[1] = bin2bcd((lba / 75) / 60); buf[2] = bin2bcd((lba / 75) % 60); buf[3] = bin2bcd(lba % 75); } int ctl_read_toc(struct ctl_scsiio *ctsio) { struct ctl_lun *lun = CTL_LUN(ctsio); struct scsi_read_toc_hdr *hdr; struct scsi_read_toc_type01_descr *descr; struct scsi_read_toc *cdb; uint32_t alloc_len, data_len; int format, msf; cdb = (struct scsi_read_toc *)ctsio->cdb; msf = (cdb->byte2 & CD_MSF) != 0; format = cdb->format; alloc_len = scsi_2btoul(cdb->data_len); data_len = sizeof(struct scsi_read_toc_hdr); if (format == 0) data_len += 2 * sizeof(struct scsi_read_toc_type01_descr); else data_len += sizeof(struct scsi_read_toc_type01_descr); ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO); ctsio->kern_sg_entries = 0; ctsio->kern_rel_offset = 0; ctsio->kern_data_len = min(data_len, alloc_len); ctsio->kern_total_len = ctsio->kern_data_len; hdr = (struct scsi_read_toc_hdr *)ctsio->kern_data_ptr; if (format == 0) { scsi_ulto2b(0x12, hdr->data_length); hdr->first = 1; hdr->last = 1; descr = (struct scsi_read_toc_type01_descr *)(hdr + 1); descr->addr_ctl = 0x14; descr->track_number = 1; if (msf) ctl_ultomsf(0, descr->track_start); else scsi_ulto4b(0, descr->track_start); descr++; descr->addr_ctl = 0x14; descr->track_number = 0xaa; if (msf) ctl_ultomsf(lun->be_lun->maxlba+1, descr->track_start); else scsi_ulto4b(lun->be_lun->maxlba+1, descr->track_start); } else { scsi_ulto2b(0x0a, hdr->data_length); hdr->first = 1; hdr->last = 1; descr = (struct scsi_read_toc_type01_descr *)(hdr + 1); descr->addr_ctl = 0x14; descr->track_number = 1; if (msf) ctl_ultomsf(0, descr->track_start); else scsi_ulto4b(0, descr->track_start); } ctl_set_success(ctsio); ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED; ctsio->be_move_done = ctl_config_move_done; ctl_datamove((union ctl_io *)ctsio); return (CTL_RETVAL_COMPLETE); } /* * For known CDB types, parse the LBA and length. */ static int ctl_get_lba_len(union ctl_io *io, uint64_t *lba, uint64_t *len) { if (io->io_hdr.io_type != CTL_IO_SCSI) return (1); switch (io->scsiio.cdb[0]) { case COMPARE_AND_WRITE: { struct scsi_compare_and_write *cdb; cdb = (struct scsi_compare_and_write *)io->scsiio.cdb; *lba = scsi_8btou64(cdb->addr); *len = cdb->length; break; } case READ_6: case WRITE_6: { struct scsi_rw_6 *cdb; cdb = (struct scsi_rw_6 *)io->scsiio.cdb; *lba = scsi_3btoul(cdb->addr); /* only 5 bits are valid in the most significant address byte */ *lba &= 0x1fffff; *len = cdb->length; break; } case READ_10: case WRITE_10: { struct scsi_rw_10 *cdb; cdb = (struct scsi_rw_10 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_2btoul(cdb->length); break; } case WRITE_VERIFY_10: { struct scsi_write_verify_10 *cdb; cdb = (struct scsi_write_verify_10 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_2btoul(cdb->length); break; } case READ_12: case WRITE_12: { struct scsi_rw_12 *cdb; cdb = (struct scsi_rw_12 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case WRITE_VERIFY_12: { struct scsi_write_verify_12 *cdb; cdb = (struct scsi_write_verify_12 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case READ_16: case WRITE_16: { struct scsi_rw_16 *cdb; cdb = (struct scsi_rw_16 *)io->scsiio.cdb; *lba = scsi_8btou64(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case WRITE_ATOMIC_16: { struct scsi_write_atomic_16 *cdb; cdb = (struct scsi_write_atomic_16 *)io->scsiio.cdb; *lba = scsi_8btou64(cdb->addr); *len = scsi_2btoul(cdb->length); break; } case WRITE_VERIFY_16: { struct scsi_write_verify_16 *cdb; cdb = (struct scsi_write_verify_16 *)io->scsiio.cdb; *lba = scsi_8btou64(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case WRITE_SAME_10: { struct scsi_write_same_10 *cdb; cdb = (struct scsi_write_same_10 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_2btoul(cdb->length); break; } case WRITE_SAME_16: { struct scsi_write_same_16 *cdb; cdb = (struct scsi_write_same_16 *)io->scsiio.cdb; *lba = scsi_8btou64(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case VERIFY_10: { struct scsi_verify_10 *cdb; cdb = (struct scsi_verify_10 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_2btoul(cdb->length); break; } case VERIFY_12: { struct scsi_verify_12 *cdb; cdb = (struct scsi_verify_12 *)io->scsiio.cdb; *lba = scsi_4btoul(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case VERIFY_16: { struct scsi_verify_16 *cdb; cdb = (struct scsi_verify_16 *)io->scsiio.cdb; *lba = scsi_8btou64(cdb->addr); *len = scsi_4btoul(cdb->length); break; } case UNMAP: { *lba = 0; *len = UINT64_MAX; break; } case SERVICE_ACTION_IN: { /* GET LBA STATUS */ struct scsi_get_lba_status *cdb; cdb = (struct scsi_get_lba_status *)io->scsiio.cdb; *lba = scsi_8btou64(cdb->addr); *len = UINT32_MAX; break; } default: return (1); break; /* NOTREACHED */ } return (0); } static ctl_action ctl_extent_check_lba(uint64_t lba1, uint64_t len1, uint64_t lba2, uint64_t len2, bool seq) { uint64_t endlba1, endlba2; endlba1 = lba1 + len1 - (seq ? 0 : 1); endlba2 = lba2 + len2 - 1; if ((endlba1 < lba2) || (endlba2 < lba1)) return (CTL_ACTION_PASS); else return (CTL_ACTION_BLOCK); } static int ctl_extent_check_unmap(union ctl_io *io, uint64_t lba2, uint64_t len2) { struct ctl_ptr_len_flags *ptrlen; struct scsi_unmap_desc *buf, *end, *range; uint64_t lba; uint32_t len; /* If not UNMAP -- go other way. */ if (io->io_hdr.io_type != CTL_IO_SCSI || io->scsiio.cdb[0] != UNMAP) return (CTL_ACTION_ERROR); /* If UNMAP without data -- block and wait for data. */ ptrlen = (struct ctl_ptr_len_flags *) &io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN]; if ((io->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0 || ptrlen->ptr == NULL) return (CTL_ACTION_BLOCK); /* UNMAP with data -- check for collision. */ buf = (struct scsi_unmap_desc *)ptrlen->ptr; end = buf + ptrlen->len / sizeof(*buf); for (range = buf; range < end; range++) { lba = scsi_8btou64(range->lba); len = scsi_4btoul(range->length); if ((lba < lba2 + len2) && (lba + len > lba2)) return (CTL_ACTION_BLOCK); } return (CTL_ACTION_PASS); } static ctl_action ctl_extent_check(union ctl_io *io1, union ctl_io *io2, bool seq) { uint64_t lba1, lba2; uint64_t len1, len2; int retval; if (ctl_get_lba_len(io2, &lba2, &len2) != 0) return (CTL_ACTION_ERROR); retval = ctl_extent_check_unmap(io1, lba2, len2); if (retval != CTL_ACTION_ERROR) return (retval); if (ctl_get_lba_len(io1, &lba1, &len1) != 0) return (CTL_ACTION_ERROR); if (io1->io_hdr.flags & CTL_FLAG_SERSEQ_DONE) seq = FALSE; return (ctl_extent_check_lba(lba1, len1, lba2, len2, seq)); } static ctl_action ctl_extent_check_seq(union ctl_io *io1, union ctl_io *io2) { uint64_t lba1, lba2; uint64_t len1, len2; if (io1->io_hdr.flags & CTL_FLAG_SERSEQ_DONE) return (CTL_ACTION_PASS); if (ctl_get_lba_len(io1, &lba1, &len1) != 0) return (CTL_ACTION_ERROR); if (ctl_get_lba_len(io2, &lba2, &len2) != 0) return (CTL_ACTION_ERROR); if (lba1 + len1 == lba2) return (CTL_ACTION_BLOCK); return (CTL_ACTION_PASS); } static ctl_action ctl_check_for_blockage(struct ctl_lun *lun, union ctl_io *pending_io, union ctl_io *ooa_io) { const struct ctl_cmd_entry *pending_entry, *ooa_entry; const ctl_serialize_action *serialize_row; /* * Aborted commands are not going to be executed and may even * not report completion, so we don't care about their order. * Let them complete ASAP to clean the OOA queue. */ if (pending_io->io_hdr.flags & CTL_FLAG_ABORT) return (CTL_ACTION_SKIP); /* * The initiator attempted multiple untagged commands at the same * time. Can't do that. */ if ((pending_io->scsiio.tag_type == CTL_TAG_UNTAGGED) && (ooa_io->scsiio.tag_type == CTL_TAG_UNTAGGED) && ((pending_io->io_hdr.nexus.targ_port == ooa_io->io_hdr.nexus.targ_port) && (pending_io->io_hdr.nexus.initid == ooa_io->io_hdr.nexus.initid)) && ((ooa_io->io_hdr.flags & (CTL_FLAG_ABORT | CTL_FLAG_STATUS_SENT)) == 0)) return (CTL_ACTION_OVERLAP); /* * The initiator attempted to send multiple tagged commands with * the same ID. (It's fine if different initiators have the same * tag ID.) * * Even if all of those conditions are true, we don't kill the I/O * if the command ahead of us has been aborted. We won't end up * sending it to the FETD, and it's perfectly legal to resend a * command with the same tag number as long as the previous * instance of this tag number has been aborted somehow. */ if ((pending_io->scsiio.tag_type != CTL_TAG_UNTAGGED) && (ooa_io->scsiio.tag_type != CTL_TAG_UNTAGGED) && (pending_io->scsiio.tag_num == ooa_io->scsiio.tag_num) && ((pending_io->io_hdr.nexus.targ_port == ooa_io->io_hdr.nexus.targ_port) && (pending_io->io_hdr.nexus.initid == ooa_io->io_hdr.nexus.initid)) && ((ooa_io->io_hdr.flags & (CTL_FLAG_ABORT | CTL_FLAG_STATUS_SENT)) == 0)) return (CTL_ACTION_OVERLAP_TAG); /* * If we get a head of queue tag, SAM-3 says that we should * immediately execute it. * * What happens if this command would normally block for some other * reason? e.g. a request sense with a head of queue tag * immediately after a write. Normally that would block, but this * will result in its getting executed immediately... * * We currently return "pass" instead of "skip", so we'll end up * going through the rest of the queue to check for overlapped tags. * * XXX KDM check for other types of blockage first?? */ if (pending_io->scsiio.tag_type == CTL_TAG_HEAD_OF_QUEUE) return (CTL_ACTION_PASS); /* * Ordered tags have to block until all items ahead of them * have completed. If we get called with an ordered tag, we always * block, if something else is ahead of us in the queue. */ if (pending_io->scsiio.tag_type == CTL_TAG_ORDERED) return (CTL_ACTION_BLOCK); /* * Simple tags get blocked until all head of queue and ordered tags * ahead of them have completed. I'm lumping untagged commands in * with simple tags here. XXX KDM is that the right thing to do? */ if (((pending_io->scsiio.tag_type == CTL_TAG_UNTAGGED) || (pending_io->scsiio.tag_type == CTL_TAG_SIMPLE)) && ((ooa_io->scsiio.tag_type == CTL_TAG_HEAD_OF_QUEUE) || (ooa_io->scsiio.tag_type == CTL_TAG_ORDERED))) return (CTL_ACTION_BLOCK); pending_entry = ctl_get_cmd_entry(&pending_io->scsiio, NULL); KASSERT(pending_entry->seridx < CTL_SERIDX_COUNT, ("%s: Invalid seridx %d for pending CDB %02x %02x @ %p", __func__, pending_entry->seridx, pending_io->scsiio.cdb[0], pending_io->scsiio.cdb[1], pending_io)); ooa_entry = ctl_get_cmd_entry(&ooa_io->scsiio, NULL); if (ooa_entry->seridx == CTL_SERIDX_INVLD) return (CTL_ACTION_PASS); /* Unsupported command in OOA queue */ KASSERT(ooa_entry->seridx < CTL_SERIDX_COUNT, ("%s: Invalid seridx %d for ooa CDB %02x %02x @ %p", __func__, ooa_entry->seridx, ooa_io->scsiio.cdb[0], ooa_io->scsiio.cdb[1], ooa_io)); serialize_row = ctl_serialize_table[ooa_entry->seridx]; switch (serialize_row[pending_entry->seridx]) { case CTL_SER_BLOCK: return (CTL_ACTION_BLOCK); case CTL_SER_EXTENT: return (ctl_extent_check(ooa_io, pending_io, (lun->be_lun && lun->be_lun->serseq == CTL_LUN_SERSEQ_ON))); case CTL_SER_EXTENTOPT: if ((lun->MODE_CTRL.queue_flags & SCP_QUEUE_ALG_MASK) != SCP_QUEUE_ALG_UNRESTRICTED) return (ctl_extent_check(ooa_io, pending_io, (lun->be_lun && lun->be_lun->serseq == CTL_LUN_SERSEQ_ON))); return (CTL_ACTION_PASS); case CTL_SER_EXTENTSEQ: if (lun->be_lun && lun->be_lun->serseq != CTL_LUN_SERSEQ_OFF) return (ctl_extent_check_seq(ooa_io, pending_io)); return (CTL_ACTION_PASS); case CTL_SER_PASS: return (CTL_ACTION_PASS); case CTL_SER_BLOCKOPT: if ((lun->MODE_CTRL.queue_flags & SCP_QUEUE_ALG_MASK) != SCP_QUEUE_ALG_UNRESTRICTED) return (CTL_ACTION_BLOCK); return (CTL_ACTION_PASS); case CTL_SER_SKIP: return (CTL_ACTION_SKIP); default: panic("%s: Invalid serialization value %d for %d => %d", __func__, serialize_row[pending_entry->seridx], pending_entry->seridx, ooa_entry->seridx); } return (CTL_ACTION_ERROR); } /* * Check for blockage or overlaps against the OOA (Order Of Arrival) queue. * Assumptions: * - pending_io is generally either incoming, or on the blocked queue * - starting I/O is the I/O we want to start the check with. */ static ctl_action ctl_check_ooa(struct ctl_lun *lun, union ctl_io *pending_io, union ctl_io **starting_io) { union ctl_io *ooa_io; ctl_action action; mtx_assert(&lun->lun_lock, MA_OWNED); /* * Run back along the OOA queue, starting with the current * blocked I/O and going through every I/O before it on the * queue. If starting_io is NULL, we'll just end up returning * CTL_ACTION_PASS. */ for (ooa_io = *starting_io; ooa_io != NULL; ooa_io = (union ctl_io *)TAILQ_PREV(&ooa_io->io_hdr, ctl_ooaq, ooa_links)){ action = ctl_check_for_blockage(lun, pending_io, ooa_io); if (action != CTL_ACTION_PASS) { *starting_io = ooa_io; return (action); } } *starting_io = NULL; return (CTL_ACTION_PASS); } /* * Try to unblock the specified I/O. * * skip parameter allows explicitly skip present blocker of the I/O, * starting from the previous one on OOA queue. It can be used when * we know for sure that the blocker I/O does no longer count. */ static void ctl_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip) { struct ctl_softc *softc = lun->ctl_softc; union ctl_io *bio, *obio; const struct ctl_cmd_entry *entry; union ctl_ha_msg msg_info; ctl_action action; mtx_assert(&lun->lun_lock, MA_OWNED); if (io->io_hdr.blocker == NULL) return; obio = bio = io->io_hdr.blocker; if (skip) bio = (union ctl_io *)TAILQ_PREV(&bio->io_hdr, ctl_ooaq, ooa_links); action = ctl_check_ooa(lun, io, &bio); if (action == CTL_ACTION_BLOCK) { /* Still blocked, but may be by different I/O now. */ if (bio != obio) { TAILQ_REMOVE(&obio->io_hdr.blocked_queue, &io->io_hdr, blocked_links); TAILQ_INSERT_TAIL(&bio->io_hdr.blocked_queue, &io->io_hdr, blocked_links); io->io_hdr.blocker = bio; } return; } /* No longer blocked, one way or another. */ TAILQ_REMOVE(&obio->io_hdr.blocked_queue, &io->io_hdr, blocked_links); io->io_hdr.blocker = NULL; switch (action) { case CTL_ACTION_OVERLAP: ctl_set_overlapped_cmd(&io->scsiio); goto error; case CTL_ACTION_OVERLAP_TAG: ctl_set_overlapped_tag(&io->scsiio, io->scsiio.tag_num & 0xff); goto error; case CTL_ACTION_PASS: case CTL_ACTION_SKIP: /* Serializing commands from the other SC retire there. */ if ((io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) && (softc->ha_mode != CTL_HA_MODE_XFER)) { io->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE; msg_info.hdr.original_sc = io->io_hdr.remote_io; msg_info.hdr.serializing_sc = io; msg_info.hdr.msg_type = CTL_MSG_R2R; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.hdr), M_NOWAIT); break; } /* * Check this I/O for LUN state changes that may have happened * while this command was blocked. The LUN state may have been * changed by a command ahead of us in the queue. */ entry = ctl_get_cmd_entry(&io->scsiio, NULL); if (ctl_scsiio_lun_check(lun, entry, &io->scsiio) != 0) { ctl_done(io); break; } io->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; ctl_enqueue_rtr(io); break; case CTL_ACTION_ERROR: default: ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 0, /*retry_count*/ 0); error: /* Serializing commands from the other SC are done here. */ if ((io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) && (softc->ha_mode != CTL_HA_MODE_XFER)) { ctl_try_unblock_others(lun, io, TRUE); TAILQ_REMOVE(&lun->ooa_queue, &io->io_hdr, ooa_links); ctl_copy_sense_data_back(io, &msg_info); msg_info.hdr.original_sc = io->io_hdr.remote_io; msg_info.hdr.serializing_sc = NULL; msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.scsi), M_WAITOK); ctl_free_io(io); break; } ctl_done(io); break; } } /* * Try to unblock I/Os blocked by the specified I/O. * * skip parameter allows explicitly skip the specified I/O as blocker, * starting from the previous one on the OOA queue. It can be used when * we know for sure that the specified I/O does no longer count (done). * It has to be still on OOA queue though so that we know where to start. */ static void ctl_try_unblock_others(struct ctl_lun *lun, union ctl_io *bio, bool skip) { union ctl_io *io, *next_io; mtx_assert(&lun->lun_lock, MA_OWNED); for (io = (union ctl_io *)TAILQ_FIRST(&bio->io_hdr.blocked_queue); io != NULL; io = next_io) { next_io = (union ctl_io *)TAILQ_NEXT(&io->io_hdr, blocked_links); KASSERT(io->io_hdr.blocker != NULL, ("I/O %p on blocked list without blocker", io)); ctl_try_unblock_io(lun, io, skip); } KASSERT(!skip || TAILQ_EMPTY(&bio->io_hdr.blocked_queue), ("blocked_queue is not empty after skipping %p", bio)); } /* * This routine (with one exception) checks LUN flags that can be set by * commands ahead of us in the OOA queue. These flags have to be checked * when a command initially comes in, and when we pull a command off the * blocked queue and are preparing to execute it. The reason we have to * check these flags for commands on the blocked queue is that the LUN * state may have been changed by a command ahead of us while we're on the * blocked queue. * * Ordering is somewhat important with these checks, so please pay * careful attention to the placement of any new checks. */ static int ctl_scsiio_lun_check(struct ctl_lun *lun, const struct ctl_cmd_entry *entry, struct ctl_scsiio *ctsio) { struct ctl_softc *softc = lun->ctl_softc; int retval; uint32_t residx; retval = 0; mtx_assert(&lun->lun_lock, MA_OWNED); /* * If this shelf is a secondary shelf controller, we may have to * reject some commands disallowed by HA mode and link state. */ if ((lun->flags & CTL_LUN_PRIMARY_SC) == 0) { if (softc->ha_link == CTL_HA_LINK_OFFLINE && (entry->flags & CTL_CMD_FLAG_OK_ON_UNAVAIL) == 0) { ctl_set_lun_unavail(ctsio); retval = 1; goto bailout; } if ((lun->flags & CTL_LUN_PEER_SC_PRIMARY) == 0 && (entry->flags & CTL_CMD_FLAG_OK_ON_UNAVAIL) == 0) { ctl_set_lun_transit(ctsio); retval = 1; goto bailout; } if (softc->ha_mode == CTL_HA_MODE_ACT_STBY && (entry->flags & CTL_CMD_FLAG_OK_ON_STANDBY) == 0) { ctl_set_lun_standby(ctsio); retval = 1; goto bailout; } /* The rest of checks are only done on executing side */ if (softc->ha_mode == CTL_HA_MODE_XFER) goto bailout; } if (entry->pattern & CTL_LUN_PAT_WRITE) { if (lun->be_lun && lun->be_lun->flags & CTL_LUN_FLAG_READONLY) { ctl_set_hw_write_protected(ctsio); retval = 1; goto bailout; } if ((lun->MODE_CTRL.eca_and_aen & SCP_SWP) != 0) { ctl_set_sense(ctsio, /*current_error*/ 1, /*sense_key*/ SSD_KEY_DATA_PROTECT, /*asc*/ 0x27, /*ascq*/ 0x02, SSD_ELEM_NONE); retval = 1; goto bailout; } } /* * Check for a reservation conflict. If this command isn't allowed * even on reserved LUNs, and if this initiator isn't the one who * reserved us, reject the command with a reservation conflict. */ residx = ctl_get_initindex(&ctsio->io_hdr.nexus); if ((lun->flags & CTL_LUN_RESERVED) && ((entry->flags & CTL_CMD_FLAG_ALLOW_ON_RESV) == 0)) { if (lun->res_idx != residx) { ctl_set_reservation_conflict(ctsio); retval = 1; goto bailout; } } if ((lun->flags & CTL_LUN_PR_RESERVED) == 0 || (entry->flags & CTL_CMD_FLAG_ALLOW_ON_PR_RESV)) { /* No reservation or command is allowed. */; } else if ((entry->flags & CTL_CMD_FLAG_ALLOW_ON_PR_WRESV) && (lun->pr_res_type == SPR_TYPE_WR_EX || lun->pr_res_type == SPR_TYPE_WR_EX_RO || lun->pr_res_type == SPR_TYPE_WR_EX_AR)) { /* The command is allowed for Write Exclusive resv. */; } else { /* * if we aren't registered or it's a res holder type * reservation and this isn't the res holder then set a * conflict. */ if (ctl_get_prkey(lun, residx) == 0 || (residx != lun->pr_res_idx && lun->pr_res_type < 4)) { ctl_set_reservation_conflict(ctsio); retval = 1; goto bailout; } } if ((entry->flags & CTL_CMD_FLAG_OK_ON_NO_MEDIA) == 0) { if (lun->flags & CTL_LUN_EJECTED) ctl_set_lun_ejected(ctsio); else if (lun->flags & CTL_LUN_NO_MEDIA) { if (lun->flags & CTL_LUN_REMOVABLE) ctl_set_lun_no_media(ctsio); else ctl_set_lun_int_reqd(ctsio); } else if (lun->flags & CTL_LUN_STOPPED) ctl_set_lun_stopped(ctsio); else goto bailout; retval = 1; goto bailout; } bailout: return (retval); } static void ctl_failover_io(union ctl_io *io, int have_lock) { ctl_set_busy(&io->scsiio); ctl_done(io); } static void ctl_failover_lun(union ctl_io *rio) { struct ctl_softc *softc = CTL_SOFTC(rio); struct ctl_lun *lun; struct ctl_io_hdr *io, *next_io; uint32_t targ_lun; targ_lun = rio->io_hdr.nexus.targ_mapped_lun; CTL_DEBUG_PRINT(("FAILOVER for lun %ju\n", targ_lun)); /* Find and lock the LUN. */ mtx_lock(&softc->ctl_lock); if (targ_lun > ctl_max_luns || (lun = softc->ctl_luns[targ_lun]) == NULL) { mtx_unlock(&softc->ctl_lock); return; } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); if (lun->flags & CTL_LUN_DISABLED) { mtx_unlock(&lun->lun_lock); return; } if (softc->ha_mode == CTL_HA_MODE_XFER) { TAILQ_FOREACH_SAFE(io, &lun->ooa_queue, ooa_links, next_io) { /* We are master */ if (io->flags & CTL_FLAG_FROM_OTHER_SC) { if (io->flags & CTL_FLAG_IO_ACTIVE) { io->flags |= CTL_FLAG_ABORT; io->flags |= CTL_FLAG_FAILOVER; ctl_try_unblock_io(lun, (union ctl_io *)io, FALSE); } else { /* This can be only due to DATAMOVE */ io->msg_type = CTL_MSG_DATAMOVE_DONE; io->flags &= ~CTL_FLAG_DMA_INPROG; io->flags |= CTL_FLAG_IO_ACTIVE; io->port_status = 31340; ctl_enqueue_isc((union ctl_io *)io); } } else /* We are slave */ if (io->flags & CTL_FLAG_SENT_2OTHER_SC) { io->flags &= ~CTL_FLAG_SENT_2OTHER_SC; if (io->flags & CTL_FLAG_IO_ACTIVE) { io->flags |= CTL_FLAG_FAILOVER; } else { ctl_set_busy(&((union ctl_io *)io)-> scsiio); ctl_done((union ctl_io *)io); } } } } else { /* SERIALIZE modes */ TAILQ_FOREACH_SAFE(io, &lun->ooa_queue, ooa_links, next_io) { /* We are master */ if (io->flags & CTL_FLAG_FROM_OTHER_SC) { if (io->blocker != NULL) { TAILQ_REMOVE(&io->blocker->io_hdr.blocked_queue, io, blocked_links); io->blocker = NULL; } ctl_try_unblock_others(lun, (union ctl_io *)io, TRUE); TAILQ_REMOVE(&lun->ooa_queue, io, ooa_links); ctl_free_io((union ctl_io *)io); } else /* We are slave */ if (io->flags & CTL_FLAG_SENT_2OTHER_SC) { io->flags &= ~CTL_FLAG_SENT_2OTHER_SC; if (!(io->flags & CTL_FLAG_IO_ACTIVE)) { ctl_set_busy(&((union ctl_io *)io)-> scsiio); ctl_done((union ctl_io *)io); } } } } mtx_unlock(&lun->lun_lock); } static int ctl_scsiio_precheck(struct ctl_softc *softc, struct ctl_scsiio *ctsio) { struct ctl_lun *lun; const struct ctl_cmd_entry *entry; union ctl_io *bio; uint32_t initidx, targ_lun; int retval = 0; lun = NULL; targ_lun = ctsio->io_hdr.nexus.targ_mapped_lun; if (targ_lun < ctl_max_luns) lun = softc->ctl_luns[targ_lun]; if (lun) { /* * If the LUN is invalid, pretend that it doesn't exist. * It will go away as soon as all pending I/O has been * completed. */ mtx_lock(&lun->lun_lock); if (lun->flags & CTL_LUN_DISABLED) { mtx_unlock(&lun->lun_lock); lun = NULL; } } CTL_LUN(ctsio) = lun; if (lun) { CTL_BACKEND_LUN(ctsio) = lun->be_lun; /* * Every I/O goes into the OOA queue for a particular LUN, * and stays there until completion. */ #ifdef CTL_TIME_IO if (TAILQ_EMPTY(&lun->ooa_queue)) lun->idle_time += getsbinuptime() - lun->last_busy; #endif TAILQ_INSERT_TAIL(&lun->ooa_queue, &ctsio->io_hdr, ooa_links); } /* Get command entry and return error if it is unsuppotyed. */ entry = ctl_validate_command(ctsio); if (entry == NULL) { if (lun) mtx_unlock(&lun->lun_lock); return (retval); } ctsio->io_hdr.flags &= ~CTL_FLAG_DATA_MASK; ctsio->io_hdr.flags |= entry->flags & CTL_FLAG_DATA_MASK; /* * Check to see whether we can send this command to LUNs that don't * exist. This should pretty much only be the case for inquiry * and request sense. Further checks, below, really require having * a LUN, so we can't really check the command anymore. Just put * it on the rtr queue. */ if (lun == NULL) { if (entry->flags & CTL_CMD_FLAG_OK_ON_NO_LUN) { ctsio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; ctl_enqueue_rtr((union ctl_io *)ctsio); return (retval); } ctl_set_unsupported_lun(ctsio); ctl_done((union ctl_io *)ctsio); CTL_DEBUG_PRINT(("ctl_scsiio_precheck: bailing out due to invalid LUN\n")); return (retval); } else { /* * Make sure we support this particular command on this LUN. * e.g., we don't support writes to the control LUN. */ if (!ctl_cmd_applicable(lun->be_lun->lun_type, entry)) { mtx_unlock(&lun->lun_lock); ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (retval); } } initidx = ctl_get_initindex(&ctsio->io_hdr.nexus); /* * If we've got a request sense, it'll clear the contingent * allegiance condition. Otherwise, if we have a CA condition for * this initiator, clear it, because it sent down a command other * than request sense. */ if (ctsio->cdb[0] != REQUEST_SENSE) { struct scsi_sense_data *ps; ps = lun->pending_sense[initidx / CTL_MAX_INIT_PER_PORT]; if (ps != NULL) ps[initidx % CTL_MAX_INIT_PER_PORT].error_code = 0; } /* * If the command has this flag set, it handles its own unit * attention reporting, we shouldn't do anything. Otherwise we * check for any pending unit attentions, and send them back to the * initiator. We only do this when a command initially comes in, * not when we pull it off the blocked queue. * * According to SAM-3, section 5.3.2, the order that things get * presented back to the host is basically unit attentions caused * by some sort of reset event, busy status, reservation conflicts * or task set full, and finally any other status. * * One issue here is that some of the unit attentions we report * don't fall into the "reset" category (e.g. "reported luns data * has changed"). So reporting it here, before the reservation * check, may be technically wrong. I guess the only thing to do * would be to check for and report the reset events here, and then * check for the other unit attention types after we check for a * reservation conflict. * * XXX KDM need to fix this */ if ((entry->flags & CTL_CMD_FLAG_NO_SENSE) == 0) { ctl_ua_type ua_type; u_int sense_len = 0; ua_type = ctl_build_ua(lun, initidx, &ctsio->sense_data, &sense_len, SSD_TYPE_NONE); if (ua_type != CTL_UA_NONE) { mtx_unlock(&lun->lun_lock); ctsio->scsi_status = SCSI_STATUS_CHECK_COND; ctsio->io_hdr.status = CTL_SCSI_ERROR | CTL_AUTOSENSE; ctsio->sense_len = sense_len; ctl_done((union ctl_io *)ctsio); return (retval); } } if (ctl_scsiio_lun_check(lun, entry, ctsio) != 0) { mtx_unlock(&lun->lun_lock); ctl_done((union ctl_io *)ctsio); return (retval); } /* * XXX CHD this is where we want to send IO to other side if * this LUN is secondary on this SC. We will need to make a copy * of the IO and flag the IO on this side as SENT_2OTHER and the flag * the copy we send as FROM_OTHER. * We also need to stuff the address of the original IO so we can * find it easily. Something similar will need be done on the other * side so when we are done we can find the copy. */ if ((lun->flags & CTL_LUN_PRIMARY_SC) == 0 && (lun->flags & CTL_LUN_PEER_SC_PRIMARY) != 0 && (entry->flags & CTL_CMD_FLAG_RUN_HERE) == 0) { union ctl_ha_msg msg_info; int isc_retval; ctsio->io_hdr.flags |= CTL_FLAG_SENT_2OTHER_SC; ctsio->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE; mtx_unlock(&lun->lun_lock); msg_info.hdr.msg_type = CTL_MSG_SERIALIZE; msg_info.hdr.original_sc = (union ctl_io *)ctsio; msg_info.hdr.serializing_sc = NULL; msg_info.hdr.nexus = ctsio->io_hdr.nexus; msg_info.scsi.tag_num = ctsio->tag_num; msg_info.scsi.tag_type = ctsio->tag_type; msg_info.scsi.cdb_len = ctsio->cdb_len; memcpy(msg_info.scsi.cdb, ctsio->cdb, CTL_MAX_CDBLEN); if ((isc_retval = ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.scsi) - sizeof(msg_info.scsi.sense_data), M_WAITOK)) > CTL_HA_STATUS_SUCCESS) { ctl_set_busy(ctsio); ctl_done((union ctl_io *)ctsio); return (retval); } return (retval); } bio = (union ctl_io *)TAILQ_PREV(&ctsio->io_hdr, ctl_ooaq, ooa_links); switch (ctl_check_ooa(lun, (union ctl_io *)ctsio, &bio)) { case CTL_ACTION_BLOCK: ctsio->io_hdr.blocker = bio; TAILQ_INSERT_TAIL(&bio->io_hdr.blocked_queue, &ctsio->io_hdr, blocked_links); mtx_unlock(&lun->lun_lock); return (retval); case CTL_ACTION_PASS: case CTL_ACTION_SKIP: ctsio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; mtx_unlock(&lun->lun_lock); ctl_enqueue_rtr((union ctl_io *)ctsio); break; case CTL_ACTION_OVERLAP: mtx_unlock(&lun->lun_lock); ctl_set_overlapped_cmd(ctsio); ctl_done((union ctl_io *)ctsio); break; case CTL_ACTION_OVERLAP_TAG: mtx_unlock(&lun->lun_lock); ctl_set_overlapped_tag(ctsio, ctsio->tag_num & 0xff); ctl_done((union ctl_io *)ctsio); break; case CTL_ACTION_ERROR: default: mtx_unlock(&lun->lun_lock); ctl_set_internal_failure(ctsio, /*sks_valid*/ 0, /*retry_count*/ 0); ctl_done((union ctl_io *)ctsio); break; } return (retval); } const struct ctl_cmd_entry * ctl_get_cmd_entry(struct ctl_scsiio *ctsio, int *sa) { const struct ctl_cmd_entry *entry; int service_action; entry = &ctl_cmd_table[ctsio->cdb[0]]; if (sa) *sa = ((entry->flags & CTL_CMD_FLAG_SA5) != 0); if (entry->flags & CTL_CMD_FLAG_SA5) { service_action = ctsio->cdb[1] & SERVICE_ACTION_MASK; entry = &((const struct ctl_cmd_entry *) entry->execute)[service_action]; } return (entry); } const struct ctl_cmd_entry * ctl_validate_command(struct ctl_scsiio *ctsio) { const struct ctl_cmd_entry *entry; int i, sa; uint8_t diff; entry = ctl_get_cmd_entry(ctsio, &sa); if (entry->execute == NULL) { if (sa) ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ 1, /*bit_valid*/ 1, /*bit*/ 4); else ctl_set_invalid_opcode(ctsio); ctl_done((union ctl_io *)ctsio); return (NULL); } KASSERT(entry->length > 0, ("Not defined length for command 0x%02x/0x%02x", ctsio->cdb[0], ctsio->cdb[1])); for (i = 1; i < entry->length; i++) { diff = ctsio->cdb[i] & ~entry->usage[i - 1]; if (diff == 0) continue; ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1, /*field*/ i, /*bit_valid*/ 1, /*bit*/ fls(diff) - 1); ctl_done((union ctl_io *)ctsio); return (NULL); } return (entry); } static int ctl_cmd_applicable(uint8_t lun_type, const struct ctl_cmd_entry *entry) { switch (lun_type) { case T_DIRECT: if ((entry->flags & CTL_CMD_FLAG_OK_ON_DIRECT) == 0) return (0); break; case T_PROCESSOR: if ((entry->flags & CTL_CMD_FLAG_OK_ON_PROC) == 0) return (0); break; case T_CDROM: if ((entry->flags & CTL_CMD_FLAG_OK_ON_CDROM) == 0) return (0); break; default: return (0); } return (1); } static int ctl_scsiio(struct ctl_scsiio *ctsio) { int retval; const struct ctl_cmd_entry *entry; retval = CTL_RETVAL_COMPLETE; CTL_DEBUG_PRINT(("ctl_scsiio cdb[0]=%02X\n", ctsio->cdb[0])); entry = ctl_get_cmd_entry(ctsio, NULL); /* * If this I/O has been aborted, just send it straight to * ctl_done() without executing it. */ if (ctsio->io_hdr.flags & CTL_FLAG_ABORT) { ctl_done((union ctl_io *)ctsio); goto bailout; } /* * All the checks should have been handled by ctl_scsiio_precheck(). * We should be clear now to just execute the I/O. */ retval = entry->execute(ctsio); bailout: return (retval); } static int ctl_target_reset(union ctl_io *io) { struct ctl_softc *softc = CTL_SOFTC(io); struct ctl_port *port = CTL_PORT(io); struct ctl_lun *lun; uint32_t initidx; ctl_ua_type ua_type; if (!(io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)) { union ctl_ha_msg msg_info; msg_info.hdr.nexus = io->io_hdr.nexus; msg_info.task.task_action = io->taskio.task_action; msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS; msg_info.hdr.original_sc = NULL; msg_info.hdr.serializing_sc = NULL; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.task), M_WAITOK); } initidx = ctl_get_initindex(&io->io_hdr.nexus); if (io->taskio.task_action == CTL_TASK_TARGET_RESET) ua_type = CTL_UA_TARG_RESET; else ua_type = CTL_UA_BUS_RESET; mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(lun, &softc->lun_list, links) { if (port != NULL && ctl_lun_map_to_port(port, lun->lun) == UINT32_MAX) continue; ctl_do_lun_reset(lun, initidx, ua_type); } mtx_unlock(&softc->ctl_lock); io->taskio.task_status = CTL_TASK_FUNCTION_COMPLETE; return (0); } /* * The LUN should always be set. The I/O is optional, and is used to * distinguish between I/Os sent by this initiator, and by other * initiators. We set unit attention for initiators other than this one. * SAM-3 is vague on this point. It does say that a unit attention should * be established for other initiators when a LUN is reset (see section * 5.7.3), but it doesn't specifically say that the unit attention should * be established for this particular initiator when a LUN is reset. Here * is the relevant text, from SAM-3 rev 8: * * 5.7.2 When a SCSI initiator port aborts its own tasks * * When a SCSI initiator port causes its own task(s) to be aborted, no * notification that the task(s) have been aborted shall be returned to * the SCSI initiator port other than the completion response for the * command or task management function action that caused the task(s) to * be aborted and notification(s) associated with related effects of the * action (e.g., a reset unit attention condition). * * XXX KDM for now, we're setting unit attention for all initiators. */ static void ctl_do_lun_reset(struct ctl_lun *lun, uint32_t initidx, ctl_ua_type ua_type) { union ctl_io *xio; int i; mtx_lock(&lun->lun_lock); /* Abort tasks. */ for (xio = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); xio != NULL; xio = (union ctl_io *)TAILQ_NEXT(&xio->io_hdr, ooa_links)) { xio->io_hdr.flags |= CTL_FLAG_ABORT | CTL_FLAG_ABORT_STATUS; ctl_try_unblock_io(lun, xio, FALSE); } /* Clear CA. */ for (i = 0; i < ctl_max_ports; i++) { free(lun->pending_sense[i], M_CTL); lun->pending_sense[i] = NULL; } /* Clear reservation. */ lun->flags &= ~CTL_LUN_RESERVED; /* Clear prevent media removal. */ if (lun->prevent) { for (i = 0; i < CTL_MAX_INITIATORS; i++) ctl_clear_mask(lun->prevent, i); lun->prevent_count = 0; } /* Clear TPC status */ ctl_tpc_lun_clear(lun, -1); /* Establish UA. */ #if 0 ctl_est_ua_all(lun, initidx, ua_type); #else ctl_est_ua_all(lun, -1, ua_type); #endif mtx_unlock(&lun->lun_lock); } static int ctl_lun_reset(union ctl_io *io) { struct ctl_softc *softc = CTL_SOFTC(io); struct ctl_lun *lun; uint32_t targ_lun, initidx; targ_lun = io->io_hdr.nexus.targ_mapped_lun; initidx = ctl_get_initindex(&io->io_hdr.nexus); mtx_lock(&softc->ctl_lock); if (targ_lun >= ctl_max_luns || (lun = softc->ctl_luns[targ_lun]) == NULL) { mtx_unlock(&softc->ctl_lock); io->taskio.task_status = CTL_TASK_LUN_DOES_NOT_EXIST; return (1); } ctl_do_lun_reset(lun, initidx, CTL_UA_LUN_RESET); mtx_unlock(&softc->ctl_lock); io->taskio.task_status = CTL_TASK_FUNCTION_COMPLETE; if ((io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) == 0) { union ctl_ha_msg msg_info; msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS; msg_info.hdr.nexus = io->io_hdr.nexus; msg_info.task.task_action = CTL_TASK_LUN_RESET; msg_info.hdr.original_sc = NULL; msg_info.hdr.serializing_sc = NULL; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.task), M_WAITOK); } return (0); } static void ctl_abort_tasks_lun(struct ctl_lun *lun, uint32_t targ_port, uint32_t init_id, int other_sc) { union ctl_io *xio; mtx_assert(&lun->lun_lock, MA_OWNED); /* * Run through the OOA queue and attempt to find the given I/O. * The target port, initiator ID, tag type and tag number have to * match the values that we got from the initiator. If we have an * untagged command to abort, simply abort the first untagged command * we come to. We only allow one untagged command at a time of course. */ for (xio = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); xio != NULL; xio = (union ctl_io *)TAILQ_NEXT(&xio->io_hdr, ooa_links)) { if ((targ_port == UINT32_MAX || targ_port == xio->io_hdr.nexus.targ_port) && (init_id == UINT32_MAX || init_id == xio->io_hdr.nexus.initid)) { if (targ_port != xio->io_hdr.nexus.targ_port || init_id != xio->io_hdr.nexus.initid) xio->io_hdr.flags |= CTL_FLAG_ABORT_STATUS; xio->io_hdr.flags |= CTL_FLAG_ABORT; if (!other_sc && !(lun->flags & CTL_LUN_PRIMARY_SC)) { union ctl_ha_msg msg_info; msg_info.hdr.nexus = xio->io_hdr.nexus; msg_info.task.task_action = CTL_TASK_ABORT_TASK; msg_info.task.tag_num = xio->scsiio.tag_num; msg_info.task.tag_type = xio->scsiio.tag_type; msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS; msg_info.hdr.original_sc = NULL; msg_info.hdr.serializing_sc = NULL; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.task), M_NOWAIT); } ctl_try_unblock_io(lun, xio, FALSE); } } } static int ctl_abort_task_set(union ctl_io *io) { struct ctl_softc *softc = CTL_SOFTC(io); struct ctl_lun *lun; uint32_t targ_lun; /* * Look up the LUN. */ targ_lun = io->io_hdr.nexus.targ_mapped_lun; mtx_lock(&softc->ctl_lock); if (targ_lun >= ctl_max_luns || (lun = softc->ctl_luns[targ_lun]) == NULL) { mtx_unlock(&softc->ctl_lock); io->taskio.task_status = CTL_TASK_LUN_DOES_NOT_EXIST; return (1); } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); if (io->taskio.task_action == CTL_TASK_ABORT_TASK_SET) { ctl_abort_tasks_lun(lun, io->io_hdr.nexus.targ_port, io->io_hdr.nexus.initid, (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) != 0); } else { /* CTL_TASK_CLEAR_TASK_SET */ ctl_abort_tasks_lun(lun, UINT32_MAX, UINT32_MAX, (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) != 0); } mtx_unlock(&lun->lun_lock); io->taskio.task_status = CTL_TASK_FUNCTION_COMPLETE; return (0); } static void ctl_i_t_nexus_loss(struct ctl_softc *softc, uint32_t initidx, ctl_ua_type ua_type) { struct ctl_lun *lun; struct scsi_sense_data *ps; uint32_t p, i; p = initidx / CTL_MAX_INIT_PER_PORT; i = initidx % CTL_MAX_INIT_PER_PORT; mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(lun, &softc->lun_list, links) { mtx_lock(&lun->lun_lock); /* Abort tasks. */ ctl_abort_tasks_lun(lun, p, i, 1); /* Clear CA. */ ps = lun->pending_sense[p]; if (ps != NULL) ps[i].error_code = 0; /* Clear reservation. */ if ((lun->flags & CTL_LUN_RESERVED) && (lun->res_idx == initidx)) lun->flags &= ~CTL_LUN_RESERVED; /* Clear prevent media removal. */ if (lun->prevent && ctl_is_set(lun->prevent, initidx)) { ctl_clear_mask(lun->prevent, initidx); lun->prevent_count--; } /* Clear TPC status */ ctl_tpc_lun_clear(lun, initidx); /* Establish UA. */ ctl_est_ua(lun, initidx, ua_type); mtx_unlock(&lun->lun_lock); } mtx_unlock(&softc->ctl_lock); } static int ctl_i_t_nexus_reset(union ctl_io *io) { struct ctl_softc *softc = CTL_SOFTC(io); uint32_t initidx; if (!(io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)) { union ctl_ha_msg msg_info; msg_info.hdr.nexus = io->io_hdr.nexus; msg_info.task.task_action = CTL_TASK_I_T_NEXUS_RESET; msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS; msg_info.hdr.original_sc = NULL; msg_info.hdr.serializing_sc = NULL; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.task), M_WAITOK); } initidx = ctl_get_initindex(&io->io_hdr.nexus); ctl_i_t_nexus_loss(softc, initidx, CTL_UA_I_T_NEXUS_LOSS); io->taskio.task_status = CTL_TASK_FUNCTION_COMPLETE; return (0); } static int ctl_abort_task(union ctl_io *io) { struct ctl_softc *softc = CTL_SOFTC(io); union ctl_io *xio; struct ctl_lun *lun; uint32_t targ_lun; /* * Look up the LUN. */ targ_lun = io->io_hdr.nexus.targ_mapped_lun; mtx_lock(&softc->ctl_lock); if (targ_lun >= ctl_max_luns || (lun = softc->ctl_luns[targ_lun]) == NULL) { mtx_unlock(&softc->ctl_lock); io->taskio.task_status = CTL_TASK_LUN_DOES_NOT_EXIST; return (1); } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); /* * Run through the OOA queue and attempt to find the given I/O. * The target port, initiator ID, tag type and tag number have to * match the values that we got from the initiator. If we have an * untagged command to abort, simply abort the first untagged command * we come to. We only allow one untagged command at a time of course. */ for (xio = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); xio != NULL; xio = (union ctl_io *)TAILQ_NEXT(&xio->io_hdr, ooa_links)) { if ((xio->io_hdr.nexus.targ_port != io->io_hdr.nexus.targ_port) || (xio->io_hdr.nexus.initid != io->io_hdr.nexus.initid) || (xio->io_hdr.flags & CTL_FLAG_ABORT)) continue; /* * If the abort says that the task is untagged, the * task in the queue must be untagged. Otherwise, * we just check to see whether the tag numbers * match. This is because the QLogic firmware * doesn't pass back the tag type in an abort * request. */ #if 0 if (((xio->scsiio.tag_type == CTL_TAG_UNTAGGED) && (io->taskio.tag_type == CTL_TAG_UNTAGGED)) || (xio->scsiio.tag_num == io->taskio.tag_num)) { #else /* * XXX KDM we've got problems with FC, because it * doesn't send down a tag type with aborts. So we * can only really go by the tag number... * This may cause problems with parallel SCSI. * Need to figure that out!! */ if (xio->scsiio.tag_num == io->taskio.tag_num) { #endif xio->io_hdr.flags |= CTL_FLAG_ABORT; if ((io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) == 0 && !(lun->flags & CTL_LUN_PRIMARY_SC)) { union ctl_ha_msg msg_info; msg_info.hdr.nexus = io->io_hdr.nexus; msg_info.task.task_action = CTL_TASK_ABORT_TASK; msg_info.task.tag_num = io->taskio.tag_num; msg_info.task.tag_type = io->taskio.tag_type; msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS; msg_info.hdr.original_sc = NULL; msg_info.hdr.serializing_sc = NULL; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info, sizeof(msg_info.task), M_NOWAIT); } ctl_try_unblock_io(lun, xio, FALSE); } } mtx_unlock(&lun->lun_lock); io->taskio.task_status = CTL_TASK_FUNCTION_COMPLETE; return (0); } static int ctl_query_task(union ctl_io *io, int task_set) { struct ctl_softc *softc = CTL_SOFTC(io); union ctl_io *xio; struct ctl_lun *lun; int found = 0; uint32_t targ_lun; targ_lun = io->io_hdr.nexus.targ_mapped_lun; mtx_lock(&softc->ctl_lock); if (targ_lun >= ctl_max_luns || (lun = softc->ctl_luns[targ_lun]) == NULL) { mtx_unlock(&softc->ctl_lock); io->taskio.task_status = CTL_TASK_LUN_DOES_NOT_EXIST; return (1); } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); for (xio = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); xio != NULL; xio = (union ctl_io *)TAILQ_NEXT(&xio->io_hdr, ooa_links)) { if ((xio->io_hdr.nexus.targ_port != io->io_hdr.nexus.targ_port) || (xio->io_hdr.nexus.initid != io->io_hdr.nexus.initid) || (xio->io_hdr.flags & CTL_FLAG_ABORT)) continue; if (task_set || xio->scsiio.tag_num == io->taskio.tag_num) { found = 1; break; } } mtx_unlock(&lun->lun_lock); if (found) io->taskio.task_status = CTL_TASK_FUNCTION_SUCCEEDED; else io->taskio.task_status = CTL_TASK_FUNCTION_COMPLETE; return (0); } static int ctl_query_async_event(union ctl_io *io) { struct ctl_softc *softc = CTL_SOFTC(io); struct ctl_lun *lun; ctl_ua_type ua; uint32_t targ_lun, initidx; targ_lun = io->io_hdr.nexus.targ_mapped_lun; mtx_lock(&softc->ctl_lock); if (targ_lun >= ctl_max_luns || (lun = softc->ctl_luns[targ_lun]) == NULL) { mtx_unlock(&softc->ctl_lock); io->taskio.task_status = CTL_TASK_LUN_DOES_NOT_EXIST; return (1); } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); initidx = ctl_get_initindex(&io->io_hdr.nexus); ua = ctl_build_qae(lun, initidx, io->taskio.task_resp); mtx_unlock(&lun->lun_lock); if (ua != CTL_UA_NONE) io->taskio.task_status = CTL_TASK_FUNCTION_SUCCEEDED; else io->taskio.task_status = CTL_TASK_FUNCTION_COMPLETE; return (0); } static void ctl_run_task(union ctl_io *io) { int retval = 1; CTL_DEBUG_PRINT(("ctl_run_task\n")); KASSERT(io->io_hdr.io_type == CTL_IO_TASK, ("ctl_run_task: Unextected io_type %d\n", io->io_hdr.io_type)); io->taskio.task_status = CTL_TASK_FUNCTION_NOT_SUPPORTED; bzero(io->taskio.task_resp, sizeof(io->taskio.task_resp)); switch (io->taskio.task_action) { case CTL_TASK_ABORT_TASK: retval = ctl_abort_task(io); break; case CTL_TASK_ABORT_TASK_SET: case CTL_TASK_CLEAR_TASK_SET: retval = ctl_abort_task_set(io); break; case CTL_TASK_CLEAR_ACA: break; case CTL_TASK_I_T_NEXUS_RESET: retval = ctl_i_t_nexus_reset(io); break; case CTL_TASK_LUN_RESET: retval = ctl_lun_reset(io); break; case CTL_TASK_TARGET_RESET: case CTL_TASK_BUS_RESET: retval = ctl_target_reset(io); break; case CTL_TASK_PORT_LOGIN: break; case CTL_TASK_PORT_LOGOUT: break; case CTL_TASK_QUERY_TASK: retval = ctl_query_task(io, 0); break; case CTL_TASK_QUERY_TASK_SET: retval = ctl_query_task(io, 1); break; case CTL_TASK_QUERY_ASYNC_EVENT: retval = ctl_query_async_event(io); break; default: printf("%s: got unknown task management event %d\n", __func__, io->taskio.task_action); break; } if (retval == 0) io->io_hdr.status = CTL_SUCCESS; else io->io_hdr.status = CTL_ERROR; ctl_done(io); } /* * For HA operation. Handle commands that come in from the other * controller. */ static void ctl_handle_isc(union ctl_io *io) { struct ctl_softc *softc = CTL_SOFTC(io); struct ctl_lun *lun; const struct ctl_cmd_entry *entry; uint32_t targ_lun; targ_lun = io->io_hdr.nexus.targ_mapped_lun; switch (io->io_hdr.msg_type) { case CTL_MSG_SERIALIZE: ctl_serialize_other_sc_cmd(&io->scsiio); break; case CTL_MSG_R2R: /* Only used in SER_ONLY mode. */ entry = ctl_get_cmd_entry(&io->scsiio, NULL); if (targ_lun >= ctl_max_luns || (lun = softc->ctl_luns[targ_lun]) == NULL) { ctl_done(io); break; } mtx_lock(&lun->lun_lock); if (ctl_scsiio_lun_check(lun, entry, &io->scsiio) != 0) { mtx_unlock(&lun->lun_lock); ctl_done(io); break; } io->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR; mtx_unlock(&lun->lun_lock); ctl_enqueue_rtr(io); break; case CTL_MSG_FINISH_IO: if (softc->ha_mode == CTL_HA_MODE_XFER) { ctl_done(io); break; } if (targ_lun >= ctl_max_luns || (lun = softc->ctl_luns[targ_lun]) == NULL) { ctl_free_io(io); break; } mtx_lock(&lun->lun_lock); ctl_try_unblock_others(lun, io, TRUE); TAILQ_REMOVE(&lun->ooa_queue, &io->io_hdr, ooa_links); mtx_unlock(&lun->lun_lock); ctl_free_io(io); break; case CTL_MSG_PERS_ACTION: ctl_hndl_per_res_out_on_other_sc(io); ctl_free_io(io); break; case CTL_MSG_BAD_JUJU: ctl_done(io); break; case CTL_MSG_DATAMOVE: /* Only used in XFER mode */ ctl_datamove_remote(io); break; case CTL_MSG_DATAMOVE_DONE: /* Only used in XFER mode */ io->scsiio.be_move_done(io); break; case CTL_MSG_FAILOVER: ctl_failover_lun(io); ctl_free_io(io); break; default: printf("%s: Invalid message type %d\n", __func__, io->io_hdr.msg_type); ctl_free_io(io); break; } } /* * Returns the match type in the case of a match, or CTL_LUN_PAT_NONE if * there is no match. */ static ctl_lun_error_pattern ctl_cmd_pattern_match(struct ctl_scsiio *ctsio, struct ctl_error_desc *desc) { const struct ctl_cmd_entry *entry; ctl_lun_error_pattern filtered_pattern, pattern; pattern = desc->error_pattern; /* * XXX KDM we need more data passed into this function to match a * custom pattern, and we actually need to implement custom pattern * matching. */ if (pattern & CTL_LUN_PAT_CMD) return (CTL_LUN_PAT_CMD); if ((pattern & CTL_LUN_PAT_MASK) == CTL_LUN_PAT_ANY) return (CTL_LUN_PAT_ANY); entry = ctl_get_cmd_entry(ctsio, NULL); filtered_pattern = entry->pattern & pattern; /* * If the user requested specific flags in the pattern (e.g. * CTL_LUN_PAT_RANGE), make sure the command supports all of those * flags. * * If the user did not specify any flags, it doesn't matter whether * or not the command supports the flags. */ if ((filtered_pattern & ~CTL_LUN_PAT_MASK) != (pattern & ~CTL_LUN_PAT_MASK)) return (CTL_LUN_PAT_NONE); /* * If the user asked for a range check, see if the requested LBA * range overlaps with this command's LBA range. */ if (filtered_pattern & CTL_LUN_PAT_RANGE) { uint64_t lba1; uint64_t len1; ctl_action action; int retval; retval = ctl_get_lba_len((union ctl_io *)ctsio, &lba1, &len1); if (retval != 0) return (CTL_LUN_PAT_NONE); action = ctl_extent_check_lba(lba1, len1, desc->lba_range.lba, desc->lba_range.len, FALSE); /* * A "pass" means that the LBA ranges don't overlap, so * this doesn't match the user's range criteria. */ if (action == CTL_ACTION_PASS) return (CTL_LUN_PAT_NONE); } return (filtered_pattern); } static void ctl_inject_error(struct ctl_lun *lun, union ctl_io *io) { struct ctl_error_desc *desc, *desc2; mtx_assert(&lun->lun_lock, MA_OWNED); STAILQ_FOREACH_SAFE(desc, &lun->error_list, links, desc2) { ctl_lun_error_pattern pattern; /* * Check to see whether this particular command matches * the pattern in the descriptor. */ pattern = ctl_cmd_pattern_match(&io->scsiio, desc); if ((pattern & CTL_LUN_PAT_MASK) == CTL_LUN_PAT_NONE) continue; switch (desc->lun_error & CTL_LUN_INJ_TYPE) { case CTL_LUN_INJ_ABORTED: ctl_set_aborted(&io->scsiio); break; case CTL_LUN_INJ_MEDIUM_ERR: ctl_set_medium_error(&io->scsiio, (io->io_hdr.flags & CTL_FLAG_DATA_MASK) != CTL_FLAG_DATA_OUT); break; case CTL_LUN_INJ_UA: /* 29h/00h POWER ON, RESET, OR BUS DEVICE RESET * OCCURRED */ ctl_set_ua(&io->scsiio, 0x29, 0x00); break; case CTL_LUN_INJ_CUSTOM: /* * We're assuming the user knows what he is doing. * Just copy the sense information without doing * checks. */ bcopy(&desc->custom_sense, &io->scsiio.sense_data, MIN(sizeof(desc->custom_sense), sizeof(io->scsiio.sense_data))); io->scsiio.scsi_status = SCSI_STATUS_CHECK_COND; io->scsiio.sense_len = SSD_FULL_SIZE; io->io_hdr.status = CTL_SCSI_ERROR | CTL_AUTOSENSE; break; case CTL_LUN_INJ_NONE: default: /* * If this is an error injection type we don't know * about, clear the continuous flag (if it is set) * so it will get deleted below. */ desc->lun_error &= ~CTL_LUN_INJ_CONTINUOUS; break; } /* * By default, each error injection action is a one-shot */ if (desc->lun_error & CTL_LUN_INJ_CONTINUOUS) continue; STAILQ_REMOVE(&lun->error_list, desc, ctl_error_desc, links); free(desc, M_CTL); } } #ifdef CTL_IO_DELAY static void ctl_datamove_timer_wakeup(void *arg) { union ctl_io *io; io = (union ctl_io *)arg; ctl_datamove(io); } #endif /* CTL_IO_DELAY */ void ctl_datamove(union ctl_io *io) { void (*fe_datamove)(union ctl_io *io); mtx_assert(&((struct ctl_softc *)CTL_SOFTC(io))->ctl_lock, MA_NOTOWNED); CTL_DEBUG_PRINT(("ctl_datamove\n")); /* No data transferred yet. Frontend must update this when done. */ io->scsiio.kern_data_resid = io->scsiio.kern_data_len; #ifdef CTL_TIME_IO if ((time_uptime - io->io_hdr.start_time) > ctl_time_io_secs) { char str[256]; char path_str[64]; struct sbuf sb; ctl_scsi_path_string(io, path_str, sizeof(path_str)); sbuf_new(&sb, str, sizeof(str), SBUF_FIXEDLEN); sbuf_cat(&sb, path_str); switch (io->io_hdr.io_type) { case CTL_IO_SCSI: ctl_scsi_command_string(&io->scsiio, NULL, &sb); sbuf_printf(&sb, "\n"); sbuf_cat(&sb, path_str); sbuf_printf(&sb, "Tag: 0x%04x, type %d\n", io->scsiio.tag_num, io->scsiio.tag_type); break; case CTL_IO_TASK: sbuf_printf(&sb, "Task I/O type: %d, Tag: 0x%04x, " "Tag Type: %d\n", io->taskio.task_action, io->taskio.tag_num, io->taskio.tag_type); break; default: panic("%s: Invalid CTL I/O type %d\n", __func__, io->io_hdr.io_type); } sbuf_cat(&sb, path_str); sbuf_printf(&sb, "ctl_datamove: %jd seconds\n", (intmax_t)time_uptime - io->io_hdr.start_time); sbuf_finish(&sb); printf("%s", sbuf_data(&sb)); } #endif /* CTL_TIME_IO */ #ifdef CTL_IO_DELAY if (io->io_hdr.flags & CTL_FLAG_DELAY_DONE) { io->io_hdr.flags &= ~CTL_FLAG_DELAY_DONE; } else { struct ctl_lun *lun; lun = CTL_LUN(io); if ((lun != NULL) && (lun->delay_info.datamove_delay > 0)) { callout_init(&io->io_hdr.delay_callout, /*mpsafe*/ 1); io->io_hdr.flags |= CTL_FLAG_DELAY_DONE; callout_reset(&io->io_hdr.delay_callout, lun->delay_info.datamove_delay * hz, ctl_datamove_timer_wakeup, io); if (lun->delay_info.datamove_type == CTL_DELAY_TYPE_ONESHOT) lun->delay_info.datamove_delay = 0; return; } } #endif /* * This command has been aborted. Set the port status, so we fail * the data move. */ if (io->io_hdr.flags & CTL_FLAG_ABORT) { printf("ctl_datamove: tag 0x%04x on (%u:%u:%u) aborted\n", io->scsiio.tag_num, io->io_hdr.nexus.initid, io->io_hdr.nexus.targ_port, io->io_hdr.nexus.targ_lun); io->io_hdr.port_status = 31337; /* * Note that the backend, in this case, will get the * callback in its context. In other cases it may get * called in the frontend's interrupt thread context. */ io->scsiio.be_move_done(io); return; } /* Don't confuse frontend with zero length data move. */ if (io->scsiio.kern_data_len == 0) { io->scsiio.be_move_done(io); return; } fe_datamove = CTL_PORT(io)->fe_datamove; fe_datamove(io); } static void ctl_send_datamove_done(union ctl_io *io, int have_lock) { union ctl_ha_msg msg; #ifdef CTL_TIME_IO struct bintime cur_bt; #endif memset(&msg, 0, sizeof(msg)); msg.hdr.msg_type = CTL_MSG_DATAMOVE_DONE; msg.hdr.original_sc = io; msg.hdr.serializing_sc = io->io_hdr.remote_io; msg.hdr.nexus = io->io_hdr.nexus; msg.hdr.status = io->io_hdr.status; msg.scsi.kern_data_resid = io->scsiio.kern_data_resid; msg.scsi.tag_num = io->scsiio.tag_num; msg.scsi.tag_type = io->scsiio.tag_type; msg.scsi.scsi_status = io->scsiio.scsi_status; memcpy(&msg.scsi.sense_data, &io->scsiio.sense_data, io->scsiio.sense_len); msg.scsi.sense_len = io->scsiio.sense_len; msg.scsi.port_status = io->io_hdr.port_status; io->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE; if (io->io_hdr.flags & CTL_FLAG_FAILOVER) { ctl_failover_io(io, /*have_lock*/ have_lock); return; } ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg.scsi) - sizeof(msg.scsi.sense_data) + msg.scsi.sense_len, M_WAITOK); #ifdef CTL_TIME_IO getbinuptime(&cur_bt); bintime_sub(&cur_bt, &io->io_hdr.dma_start_bt); bintime_add(&io->io_hdr.dma_bt, &cur_bt); #endif io->io_hdr.num_dmas++; } /* * The DMA to the remote side is done, now we need to tell the other side * we're done so it can continue with its data movement. */ static void ctl_datamove_remote_write_cb(struct ctl_ha_dt_req *rq) { union ctl_io *io; uint32_t i; io = rq->context; if (rq->ret != CTL_HA_STATUS_SUCCESS) { printf("%s: ISC DMA write failed with error %d", __func__, rq->ret); ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1, /*retry_count*/ rq->ret); } ctl_dt_req_free(rq); for (i = 0; i < io->scsiio.kern_sg_entries; i++) free(CTL_LSGLT(io)[i].addr, M_CTL); free(CTL_RSGL(io), M_CTL); CTL_RSGL(io) = NULL; CTL_LSGL(io) = NULL; /* * The data is in local and remote memory, so now we need to send * status (good or back) back to the other side. */ ctl_send_datamove_done(io, /*have_lock*/ 0); } /* * We've moved the data from the host/controller into local memory. Now we * need to push it over to the remote controller's memory. */ static int ctl_datamove_remote_dm_write_cb(union ctl_io *io) { int retval; retval = ctl_datamove_remote_xfer(io, CTL_HA_DT_CMD_WRITE, ctl_datamove_remote_write_cb); return (retval); } static void ctl_datamove_remote_write(union ctl_io *io) { int retval; void (*fe_datamove)(union ctl_io *io); /* * - Get the data from the host/HBA into local memory. * - DMA memory from the local controller to the remote controller. * - Send status back to the remote controller. */ retval = ctl_datamove_remote_sgl_setup(io); if (retval != 0) return; /* Switch the pointer over so the FETD knows what to do */ io->scsiio.kern_data_ptr = (uint8_t *)CTL_LSGL(io); /* * Use a custom move done callback, since we need to send completion * back to the other controller, not to the backend on this side. */ io->scsiio.be_move_done = ctl_datamove_remote_dm_write_cb; fe_datamove = CTL_PORT(io)->fe_datamove; fe_datamove(io); } static int ctl_datamove_remote_dm_read_cb(union ctl_io *io) { uint32_t i; for (i = 0; i < io->scsiio.kern_sg_entries; i++) free(CTL_LSGLT(io)[i].addr, M_CTL); free(CTL_RSGL(io), M_CTL); CTL_RSGL(io) = NULL; CTL_LSGL(io) = NULL; /* * The read is done, now we need to send status (good or bad) back * to the other side. */ ctl_send_datamove_done(io, /*have_lock*/ 0); return (0); } static void ctl_datamove_remote_read_cb(struct ctl_ha_dt_req *rq) { union ctl_io *io; void (*fe_datamove)(union ctl_io *io); io = rq->context; if (rq->ret != CTL_HA_STATUS_SUCCESS) { printf("%s: ISC DMA read failed with error %d\n", __func__, rq->ret); ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1, /*retry_count*/ rq->ret); } ctl_dt_req_free(rq); /* Switch the pointer over so the FETD knows what to do */ io->scsiio.kern_data_ptr = (uint8_t *)CTL_LSGL(io); /* * Use a custom move done callback, since we need to send completion * back to the other controller, not to the backend on this side. */ io->scsiio.be_move_done = ctl_datamove_remote_dm_read_cb; /* XXX KDM add checks like the ones in ctl_datamove? */ fe_datamove = CTL_PORT(io)->fe_datamove; fe_datamove(io); } static int ctl_datamove_remote_sgl_setup(union ctl_io *io) { struct ctl_sg_entry *local_sglist; uint32_t len_to_go; int retval; int i; retval = 0; local_sglist = CTL_LSGL(io); len_to_go = io->scsiio.kern_data_len; /* * The difficult thing here is that the size of the various * S/G segments may be different than the size from the * remote controller. That'll make it harder when DMAing * the data back to the other side. */ for (i = 0; len_to_go > 0; i++) { local_sglist[i].len = MIN(len_to_go, CTL_HA_DATAMOVE_SEGMENT); local_sglist[i].addr = malloc(local_sglist[i].len, M_CTL, M_WAITOK); len_to_go -= local_sglist[i].len; } /* * Reset the number of S/G entries accordingly. The original * number of S/G entries is available in rem_sg_entries. */ io->scsiio.kern_sg_entries = i; return (retval); } static int ctl_datamove_remote_xfer(union ctl_io *io, unsigned command, ctl_ha_dt_cb callback) { struct ctl_ha_dt_req *rq; struct ctl_sg_entry *remote_sglist, *local_sglist; uint32_t local_used, remote_used, total_used; int i, j, isc_ret; rq = ctl_dt_req_alloc(); /* * If we failed to allocate the request, and if the DMA didn't fail * anyway, set busy status. This is just a resource allocation * failure. */ if ((rq == NULL) && ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE && (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) ctl_set_busy(&io->scsiio); if ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE && (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS) { if (rq != NULL) ctl_dt_req_free(rq); /* * The data move failed. We need to return status back * to the other controller. No point in trying to DMA * data to the remote controller. */ ctl_send_datamove_done(io, /*have_lock*/ 0); return (1); } local_sglist = CTL_LSGL(io); remote_sglist = CTL_RSGL(io); local_used = 0; remote_used = 0; total_used = 0; /* * Pull/push the data over the wire from/to the other controller. * This takes into account the possibility that the local and * remote sglists may not be identical in terms of the size of * the elements and the number of elements. * * One fundamental assumption here is that the length allocated for * both the local and remote sglists is identical. Otherwise, we've * essentially got a coding error of some sort. */ isc_ret = CTL_HA_STATUS_SUCCESS; for (i = 0, j = 0; total_used < io->scsiio.kern_data_len; ) { uint32_t cur_len; uint8_t *tmp_ptr; rq->command = command; rq->context = io; /* * Both pointers should be aligned. But it is possible * that the allocation length is not. They should both * also have enough slack left over at the end, though, * to round up to the next 8 byte boundary. */ cur_len = MIN(local_sglist[i].len - local_used, remote_sglist[j].len - remote_used); rq->size = cur_len; tmp_ptr = (uint8_t *)local_sglist[i].addr; tmp_ptr += local_used; #if 0 /* Use physical addresses when talking to ISC hardware */ if ((io->io_hdr.flags & CTL_FLAG_BUS_ADDR) == 0) { /* XXX KDM use busdma */ rq->local = vtophys(tmp_ptr); } else rq->local = tmp_ptr; #else KASSERT((io->io_hdr.flags & CTL_FLAG_BUS_ADDR) == 0, ("HA does not support BUS_ADDR")); rq->local = tmp_ptr; #endif tmp_ptr = (uint8_t *)remote_sglist[j].addr; tmp_ptr += remote_used; rq->remote = tmp_ptr; rq->callback = NULL; local_used += cur_len; if (local_used >= local_sglist[i].len) { i++; local_used = 0; } remote_used += cur_len; if (remote_used >= remote_sglist[j].len) { j++; remote_used = 0; } total_used += cur_len; if (total_used >= io->scsiio.kern_data_len) rq->callback = callback; isc_ret = ctl_dt_single(rq); if (isc_ret > CTL_HA_STATUS_SUCCESS) break; } if (isc_ret != CTL_HA_STATUS_WAIT) { rq->ret = isc_ret; callback(rq); } return (0); } static void ctl_datamove_remote_read(union ctl_io *io) { int retval; uint32_t i; /* * This will send an error to the other controller in the case of a * failure. */ retval = ctl_datamove_remote_sgl_setup(io); if (retval != 0) return; retval = ctl_datamove_remote_xfer(io, CTL_HA_DT_CMD_READ, ctl_datamove_remote_read_cb); if (retval != 0) { /* * Make sure we free memory if there was an error.. The * ctl_datamove_remote_xfer() function will send the * datamove done message, or call the callback with an * error if there is a problem. */ for (i = 0; i < io->scsiio.kern_sg_entries; i++) free(CTL_LSGLT(io)[i].addr, M_CTL); free(CTL_RSGL(io), M_CTL); CTL_RSGL(io) = NULL; CTL_LSGL(io) = NULL; } } /* * Process a datamove request from the other controller. This is used for * XFER mode only, not SER_ONLY mode. For writes, we DMA into local memory * first. Once that is complete, the data gets DMAed into the remote * controller's memory. For reads, we DMA from the remote controller's * memory into our memory first, and then move it out to the FETD. */ static void ctl_datamove_remote(union ctl_io *io) { mtx_assert(&((struct ctl_softc *)CTL_SOFTC(io))->ctl_lock, MA_NOTOWNED); if (io->io_hdr.flags & CTL_FLAG_FAILOVER) { ctl_failover_io(io, /*have_lock*/ 0); return; } /* * Note that we look for an aborted I/O here, but don't do some of * the other checks that ctl_datamove() normally does. * We don't need to run the datamove delay code, since that should * have been done if need be on the other controller. */ if (io->io_hdr.flags & CTL_FLAG_ABORT) { printf("%s: tag 0x%04x on (%u:%u:%u) aborted\n", __func__, io->scsiio.tag_num, io->io_hdr.nexus.initid, io->io_hdr.nexus.targ_port, io->io_hdr.nexus.targ_lun); io->io_hdr.port_status = 31338; ctl_send_datamove_done(io, /*have_lock*/ 0); return; } if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_OUT) ctl_datamove_remote_write(io); else if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) ctl_datamove_remote_read(io); else { io->io_hdr.port_status = 31339; ctl_send_datamove_done(io, /*have_lock*/ 0); } } static void ctl_process_done(union ctl_io *io) { struct ctl_softc *softc = CTL_SOFTC(io); struct ctl_port *port = CTL_PORT(io); struct ctl_lun *lun = CTL_LUN(io); void (*fe_done)(union ctl_io *io); union ctl_ha_msg msg; CTL_DEBUG_PRINT(("ctl_process_done\n")); fe_done = port->fe_done; #ifdef CTL_TIME_IO if ((time_uptime - io->io_hdr.start_time) > ctl_time_io_secs) { char str[256]; char path_str[64]; struct sbuf sb; ctl_scsi_path_string(io, path_str, sizeof(path_str)); sbuf_new(&sb, str, sizeof(str), SBUF_FIXEDLEN); sbuf_cat(&sb, path_str); switch (io->io_hdr.io_type) { case CTL_IO_SCSI: ctl_scsi_command_string(&io->scsiio, NULL, &sb); sbuf_printf(&sb, "\n"); sbuf_cat(&sb, path_str); sbuf_printf(&sb, "Tag: 0x%04x, type %d\n", io->scsiio.tag_num, io->scsiio.tag_type); break; case CTL_IO_TASK: sbuf_printf(&sb, "Task I/O type: %d, Tag: 0x%04x, " "Tag Type: %d\n", io->taskio.task_action, io->taskio.tag_num, io->taskio.tag_type); break; default: panic("%s: Invalid CTL I/O type %d\n", __func__, io->io_hdr.io_type); } sbuf_cat(&sb, path_str); sbuf_printf(&sb, "ctl_process_done: %jd seconds\n", (intmax_t)time_uptime - io->io_hdr.start_time); sbuf_finish(&sb); printf("%s", sbuf_data(&sb)); } #endif /* CTL_TIME_IO */ switch (io->io_hdr.io_type) { case CTL_IO_SCSI: break; case CTL_IO_TASK: if (ctl_debug & CTL_DEBUG_INFO) ctl_io_error_print(io, NULL); fe_done(io); return; default: panic("%s: Invalid CTL I/O type %d\n", __func__, io->io_hdr.io_type); } if (lun == NULL) { CTL_DEBUG_PRINT(("NULL LUN for lun %d\n", io->io_hdr.nexus.targ_mapped_lun)); goto bailout; } mtx_lock(&lun->lun_lock); /* * Check to see if we have any informational exception and status * of this command can be modified to report it in form of either * RECOVERED ERROR or NO SENSE, depending on MRIE mode page field. */ if (lun->ie_reported == 0 && lun->ie_asc != 0 && io->io_hdr.status == CTL_SUCCESS && (io->io_hdr.flags & CTL_FLAG_STATUS_SENT) == 0) { uint8_t mrie = lun->MODE_IE.mrie; uint8_t per = ((lun->MODE_RWER.byte3 & SMS_RWER_PER) || (lun->MODE_VER.byte3 & SMS_VER_PER)); if (((mrie == SIEP_MRIE_REC_COND && per) || mrie == SIEP_MRIE_REC_UNCOND || mrie == SIEP_MRIE_NO_SENSE) && (ctl_get_cmd_entry(&io->scsiio, NULL)->flags & CTL_CMD_FLAG_NO_SENSE) == 0) { ctl_set_sense(&io->scsiio, /*current_error*/ 1, /*sense_key*/ (mrie == SIEP_MRIE_NO_SENSE) ? SSD_KEY_NO_SENSE : SSD_KEY_RECOVERED_ERROR, /*asc*/ lun->ie_asc, /*ascq*/ lun->ie_ascq, SSD_ELEM_NONE); lun->ie_reported = 1; } } else if (lun->ie_reported < 0) lun->ie_reported = 0; /* * Check to see if we have any errors to inject here. We only * inject errors for commands that don't already have errors set. */ if (!STAILQ_EMPTY(&lun->error_list) && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS) && ((io->io_hdr.flags & CTL_FLAG_STATUS_SENT) == 0)) ctl_inject_error(lun, io); /* * XXX KDM how do we treat commands that aren't completed * successfully? * * XXX KDM should we also track I/O latency? */ if ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS && io->io_hdr.io_type == CTL_IO_SCSI) { int type; #ifdef CTL_TIME_IO struct bintime bt; getbinuptime(&bt); bintime_sub(&bt, &io->io_hdr.start_bt); #endif if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) type = CTL_STATS_READ; else if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_OUT) type = CTL_STATS_WRITE; else type = CTL_STATS_NO_IO; lun->stats.bytes[type] += io->scsiio.kern_total_len; lun->stats.operations[type] ++; lun->stats.dmas[type] += io->io_hdr.num_dmas; #ifdef CTL_TIME_IO bintime_add(&lun->stats.dma_time[type], &io->io_hdr.dma_bt); bintime_add(&lun->stats.time[type], &bt); #endif mtx_lock(&port->port_lock); port->stats.bytes[type] += io->scsiio.kern_total_len; port->stats.operations[type] ++; port->stats.dmas[type] += io->io_hdr.num_dmas; #ifdef CTL_TIME_IO bintime_add(&port->stats.dma_time[type], &io->io_hdr.dma_bt); bintime_add(&port->stats.time[type], &bt); #endif mtx_unlock(&port->port_lock); } /* * Run through the blocked queue of this I/O and see if anything * can be unblocked, now that this I/O is done and will be removed. * We need to do it before removal to have OOA position to start. */ ctl_try_unblock_others(lun, io, TRUE); /* * Remove this from the OOA queue. */ TAILQ_REMOVE(&lun->ooa_queue, &io->io_hdr, ooa_links); #ifdef CTL_TIME_IO if (TAILQ_EMPTY(&lun->ooa_queue)) lun->last_busy = getsbinuptime(); #endif /* * If the LUN has been invalidated, free it if there is nothing * left on its OOA queue. */ if ((lun->flags & CTL_LUN_INVALID) && TAILQ_EMPTY(&lun->ooa_queue)) { mtx_unlock(&lun->lun_lock); ctl_free_lun(lun); } else mtx_unlock(&lun->lun_lock); bailout: /* * If this command has been aborted, make sure we set the status * properly. The FETD is responsible for freeing the I/O and doing * whatever it needs to do to clean up its state. */ if (io->io_hdr.flags & CTL_FLAG_ABORT) ctl_set_task_aborted(&io->scsiio); /* * If enabled, print command error status. */ if ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS && (ctl_debug & CTL_DEBUG_INFO) != 0) ctl_io_error_print(io, NULL); /* * Tell the FETD or the other shelf controller we're done with this * command. Note that only SCSI commands get to this point. Task * management commands are completed above. */ if ((softc->ha_mode != CTL_HA_MODE_XFER) && (io->io_hdr.flags & CTL_FLAG_SENT_2OTHER_SC)) { memset(&msg, 0, sizeof(msg)); msg.hdr.msg_type = CTL_MSG_FINISH_IO; msg.hdr.serializing_sc = io->io_hdr.remote_io; msg.hdr.nexus = io->io_hdr.nexus; ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg.scsi) - sizeof(msg.scsi.sense_data), M_WAITOK); } fe_done(io); } /* * Front end should call this if it doesn't do autosense. When the request * sense comes back in from the initiator, we'll dequeue this and send it. */ int ctl_queue_sense(union ctl_io *io) { struct ctl_softc *softc = CTL_SOFTC(io); struct ctl_port *port = CTL_PORT(io); struct ctl_lun *lun; struct scsi_sense_data *ps; uint32_t initidx, p, targ_lun; CTL_DEBUG_PRINT(("ctl_queue_sense\n")); targ_lun = ctl_lun_map_from_port(port, io->io_hdr.nexus.targ_lun); /* * LUN lookup will likely move to the ctl_work_thread() once we * have our new queueing infrastructure (that doesn't put things on * a per-LUN queue initially). That is so that we can handle * things like an INQUIRY to a LUN that we don't have enabled. We * can't deal with that right now. * If we don't have a LUN for this, just toss the sense information. */ mtx_lock(&softc->ctl_lock); if (targ_lun >= ctl_max_luns || (lun = softc->ctl_luns[targ_lun]) == NULL) { mtx_unlock(&softc->ctl_lock); goto bailout; } mtx_lock(&lun->lun_lock); mtx_unlock(&softc->ctl_lock); initidx = ctl_get_initindex(&io->io_hdr.nexus); p = initidx / CTL_MAX_INIT_PER_PORT; if (lun->pending_sense[p] == NULL) { lun->pending_sense[p] = malloc(sizeof(*ps) * CTL_MAX_INIT_PER_PORT, M_CTL, M_NOWAIT | M_ZERO); } if ((ps = lun->pending_sense[p]) != NULL) { ps += initidx % CTL_MAX_INIT_PER_PORT; memset(ps, 0, sizeof(*ps)); memcpy(ps, &io->scsiio.sense_data, io->scsiio.sense_len); } mtx_unlock(&lun->lun_lock); bailout: ctl_free_io(io); return (CTL_RETVAL_COMPLETE); } /* * Primary command inlet from frontend ports. All SCSI and task I/O * requests must go through this function. */ int ctl_queue(union ctl_io *io) { struct ctl_port *port = CTL_PORT(io); CTL_DEBUG_PRINT(("ctl_queue cdb[0]=%02X\n", io->scsiio.cdb[0])); #ifdef CTL_TIME_IO io->io_hdr.start_time = time_uptime; getbinuptime(&io->io_hdr.start_bt); #endif /* CTL_TIME_IO */ /* Map FE-specific LUN ID into global one. */ io->io_hdr.nexus.targ_mapped_lun = ctl_lun_map_from_port(port, io->io_hdr.nexus.targ_lun); switch (io->io_hdr.io_type) { case CTL_IO_SCSI: case CTL_IO_TASK: if (ctl_debug & CTL_DEBUG_CDB) ctl_io_print(io); ctl_enqueue_incoming(io); break; default: printf("ctl_queue: unknown I/O type %d\n", io->io_hdr.io_type); return (EINVAL); } return (CTL_RETVAL_COMPLETE); } #ifdef CTL_IO_DELAY static void ctl_done_timer_wakeup(void *arg) { union ctl_io *io; io = (union ctl_io *)arg; ctl_done(io); } #endif /* CTL_IO_DELAY */ void ctl_serseq_done(union ctl_io *io) { struct ctl_lun *lun = CTL_LUN(io);; if (lun->be_lun == NULL || lun->be_lun->serseq == CTL_LUN_SERSEQ_OFF) return; mtx_lock(&lun->lun_lock); io->io_hdr.flags |= CTL_FLAG_SERSEQ_DONE; ctl_try_unblock_others(lun, io, FALSE); mtx_unlock(&lun->lun_lock); } void ctl_done(union ctl_io *io) { /* * Enable this to catch duplicate completion issues. */ #if 0 if (io->io_hdr.flags & CTL_FLAG_ALREADY_DONE) { printf("%s: type %d msg %d cdb %x iptl: " "%u:%u:%u tag 0x%04x " "flag %#x status %x\n", __func__, io->io_hdr.io_type, io->io_hdr.msg_type, io->scsiio.cdb[0], io->io_hdr.nexus.initid, io->io_hdr.nexus.targ_port, io->io_hdr.nexus.targ_lun, (io->io_hdr.io_type == CTL_IO_TASK) ? io->taskio.tag_num : io->scsiio.tag_num, io->io_hdr.flags, io->io_hdr.status); } else io->io_hdr.flags |= CTL_FLAG_ALREADY_DONE; #endif /* * This is an internal copy of an I/O, and should not go through * the normal done processing logic. */ if (io->io_hdr.flags & CTL_FLAG_INT_COPY) return; #ifdef CTL_IO_DELAY if (io->io_hdr.flags & CTL_FLAG_DELAY_DONE) { io->io_hdr.flags &= ~CTL_FLAG_DELAY_DONE; } else { struct ctl_lun *lun = CTL_LUN(io); if ((lun != NULL) && (lun->delay_info.done_delay > 0)) { callout_init(&io->io_hdr.delay_callout, /*mpsafe*/ 1); io->io_hdr.flags |= CTL_FLAG_DELAY_DONE; callout_reset(&io->io_hdr.delay_callout, lun->delay_info.done_delay * hz, ctl_done_timer_wakeup, io); if (lun->delay_info.done_type == CTL_DELAY_TYPE_ONESHOT) lun->delay_info.done_delay = 0; return; } } #endif /* CTL_IO_DELAY */ ctl_enqueue_done(io); } static void ctl_work_thread(void *arg) { struct ctl_thread *thr = (struct ctl_thread *)arg; struct ctl_softc *softc = thr->ctl_softc; union ctl_io *io; int retval; CTL_DEBUG_PRINT(("ctl_work_thread starting\n")); thread_lock(curthread); sched_prio(curthread, PUSER - 1); thread_unlock(curthread); while (!softc->shutdown) { /* * We handle the queues in this order: * - ISC * - done queue (to free up resources, unblock other commands) * - incoming queue * - RtR queue * * If those queues are empty, we break out of the loop and * go to sleep. */ mtx_lock(&thr->queue_lock); io = (union ctl_io *)STAILQ_FIRST(&thr->isc_queue); if (io != NULL) { STAILQ_REMOVE_HEAD(&thr->isc_queue, links); mtx_unlock(&thr->queue_lock); ctl_handle_isc(io); continue; } io = (union ctl_io *)STAILQ_FIRST(&thr->done_queue); if (io != NULL) { STAILQ_REMOVE_HEAD(&thr->done_queue, links); /* clear any blocked commands, call fe_done */ mtx_unlock(&thr->queue_lock); ctl_process_done(io); continue; } io = (union ctl_io *)STAILQ_FIRST(&thr->incoming_queue); if (io != NULL) { STAILQ_REMOVE_HEAD(&thr->incoming_queue, links); mtx_unlock(&thr->queue_lock); if (io->io_hdr.io_type == CTL_IO_TASK) ctl_run_task(io); else ctl_scsiio_precheck(softc, &io->scsiio); continue; } io = (union ctl_io *)STAILQ_FIRST(&thr->rtr_queue); if (io != NULL) { STAILQ_REMOVE_HEAD(&thr->rtr_queue, links); mtx_unlock(&thr->queue_lock); retval = ctl_scsiio(&io->scsiio); if (retval != CTL_RETVAL_COMPLETE) CTL_DEBUG_PRINT(("ctl_scsiio failed\n")); continue; } /* Sleep until we have something to do. */ mtx_sleep(thr, &thr->queue_lock, PDROP, "-", 0); } thr->thread = NULL; kthread_exit(); } static void ctl_lun_thread(void *arg) { struct ctl_softc *softc = (struct ctl_softc *)arg; struct ctl_be_lun *be_lun; CTL_DEBUG_PRINT(("ctl_lun_thread starting\n")); thread_lock(curthread); sched_prio(curthread, PUSER - 1); thread_unlock(curthread); while (!softc->shutdown) { mtx_lock(&softc->ctl_lock); be_lun = STAILQ_FIRST(&softc->pending_lun_queue); if (be_lun != NULL) { STAILQ_REMOVE_HEAD(&softc->pending_lun_queue, links); mtx_unlock(&softc->ctl_lock); ctl_create_lun(be_lun); continue; } /* Sleep until we have something to do. */ mtx_sleep(&softc->pending_lun_queue, &softc->ctl_lock, PDROP, "-", 0); } softc->lun_thread = NULL; kthread_exit(); } static void ctl_thresh_thread(void *arg) { struct ctl_softc *softc = (struct ctl_softc *)arg; struct ctl_lun *lun; struct ctl_logical_block_provisioning_page *page; const char *attr; union ctl_ha_msg msg; uint64_t thres, val; int i, e, set; CTL_DEBUG_PRINT(("ctl_thresh_thread starting\n")); thread_lock(curthread); sched_prio(curthread, PUSER - 1); thread_unlock(curthread); while (!softc->shutdown) { mtx_lock(&softc->ctl_lock); STAILQ_FOREACH(lun, &softc->lun_list, links) { if ((lun->flags & CTL_LUN_DISABLED) || (lun->flags & CTL_LUN_NO_MEDIA) || lun->backend->lun_attr == NULL) continue; if ((lun->flags & CTL_LUN_PRIMARY_SC) == 0 && softc->ha_mode == CTL_HA_MODE_XFER) continue; if ((lun->MODE_RWER.byte8 & SMS_RWER_LBPERE) == 0) continue; e = 0; page = &lun->MODE_LBP; for (i = 0; i < CTL_NUM_LBP_THRESH; i++) { if ((page->descr[i].flags & SLBPPD_ENABLED) == 0) continue; thres = scsi_4btoul(page->descr[i].count); thres <<= CTL_LBP_EXPONENT; switch (page->descr[i].resource) { case 0x01: attr = "blocksavail"; break; case 0x02: attr = "blocksused"; break; case 0xf1: attr = "poolblocksavail"; break; case 0xf2: attr = "poolblocksused"; break; default: continue; } mtx_unlock(&softc->ctl_lock); // XXX val = lun->backend->lun_attr( lun->be_lun->be_lun, attr); mtx_lock(&softc->ctl_lock); if (val == UINT64_MAX) continue; if ((page->descr[i].flags & SLBPPD_ARMING_MASK) == SLBPPD_ARMING_INC) e = (val >= thres); else e = (val <= thres); if (e) break; } mtx_lock(&lun->lun_lock); if (e) { scsi_u64to8b((uint8_t *)&page->descr[i] - (uint8_t *)page, lun->ua_tpt_info); if (lun->lasttpt == 0 || time_uptime - lun->lasttpt >= CTL_LBP_UA_PERIOD) { lun->lasttpt = time_uptime; ctl_est_ua_all(lun, -1, CTL_UA_THIN_PROV_THRES); set = 1; } else set = 0; } else { lun->lasttpt = 0; ctl_clr_ua_all(lun, -1, CTL_UA_THIN_PROV_THRES); set = -1; } mtx_unlock(&lun->lun_lock); if (set != 0 && lun->ctl_softc->ha_mode == CTL_HA_MODE_XFER) { /* Send msg to other side. */ bzero(&msg.ua, sizeof(msg.ua)); msg.hdr.msg_type = CTL_MSG_UA; msg.hdr.nexus.initid = -1; msg.hdr.nexus.targ_port = -1; msg.hdr.nexus.targ_lun = lun->lun; msg.hdr.nexus.targ_mapped_lun = lun->lun; msg.ua.ua_all = 1; msg.ua.ua_set = (set > 0); msg.ua.ua_type = CTL_UA_THIN_PROV_THRES; memcpy(msg.ua.ua_info, lun->ua_tpt_info, 8); mtx_unlock(&softc->ctl_lock); // XXX ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg.ua), M_WAITOK); mtx_lock(&softc->ctl_lock); } } mtx_sleep(&softc->thresh_thread, &softc->ctl_lock, PDROP, "-", CTL_LBP_PERIOD * hz); } softc->thresh_thread = NULL; kthread_exit(); } static void ctl_enqueue_incoming(union ctl_io *io) { struct ctl_softc *softc = CTL_SOFTC(io); struct ctl_thread *thr; u_int idx; idx = (io->io_hdr.nexus.targ_port * 127 + io->io_hdr.nexus.initid) % worker_threads; thr = &softc->threads[idx]; mtx_lock(&thr->queue_lock); STAILQ_INSERT_TAIL(&thr->incoming_queue, &io->io_hdr, links); mtx_unlock(&thr->queue_lock); wakeup(thr); } static void ctl_enqueue_rtr(union ctl_io *io) { struct ctl_softc *softc = CTL_SOFTC(io); struct ctl_thread *thr; thr = &softc->threads[io->io_hdr.nexus.targ_mapped_lun % worker_threads]; mtx_lock(&thr->queue_lock); STAILQ_INSERT_TAIL(&thr->rtr_queue, &io->io_hdr, links); mtx_unlock(&thr->queue_lock); wakeup(thr); } static void ctl_enqueue_done(union ctl_io *io) { struct ctl_softc *softc = CTL_SOFTC(io); struct ctl_thread *thr; thr = &softc->threads[io->io_hdr.nexus.targ_mapped_lun % worker_threads]; mtx_lock(&thr->queue_lock); STAILQ_INSERT_TAIL(&thr->done_queue, &io->io_hdr, links); mtx_unlock(&thr->queue_lock); wakeup(thr); } static void ctl_enqueue_isc(union ctl_io *io) { struct ctl_softc *softc = CTL_SOFTC(io); struct ctl_thread *thr; thr = &softc->threads[io->io_hdr.nexus.targ_mapped_lun % worker_threads]; mtx_lock(&thr->queue_lock); STAILQ_INSERT_TAIL(&thr->isc_queue, &io->io_hdr, links); mtx_unlock(&thr->queue_lock); wakeup(thr); } /* * vim: ts=8 */ Index: projects/nfsv42/sys/cam/ctl/ctl.h =================================================================== --- projects/nfsv42/sys/cam/ctl/ctl.h (revision 350367) +++ projects/nfsv42/sys/cam/ctl/ctl.h (revision 350368) @@ -1,207 +1,210 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003 Silicon Graphics International Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl.h#5 $ * $FreeBSD$ */ /* * Function definitions used both within CTL and potentially in various CTL * clients. * * Author: Ken Merry */ #ifndef _CTL_H_ #define _CTL_H_ #define CTL_RETVAL_COMPLETE 0 #define CTL_RETVAL_QUEUED 1 #define CTL_RETVAL_ALLOCATED 2 #define CTL_RETVAL_ERROR 3 typedef enum { CTL_PORT_NONE = 0x00, CTL_PORT_FC = 0x01, CTL_PORT_SCSI = 0x02, CTL_PORT_IOCTL = 0x04, CTL_PORT_INTERNAL = 0x08, CTL_PORT_ISCSI = 0x10, CTL_PORT_SAS = 0x20, CTL_PORT_UMASS = 0x40, CTL_PORT_ALL = 0xff, CTL_PORT_ISC = 0x100 // FC port for inter-shelf communication } ctl_port_type; struct ctl_port_entry { ctl_port_type port_type; char port_name[64]; int32_t targ_port; int physical_port; int virtual_port; u_int flags; #define CTL_PORT_WWNN_VALID 0x01 #define CTL_PORT_WWPN_VALID 0x02 uint64_t wwnn; uint64_t wwpn; int online; }; struct ctl_modepage_header { uint8_t page_code; uint8_t subpage; uint16_t len_used; uint16_t len_left; }; union ctl_modepage_info { struct ctl_modepage_header header; }; /* * Serial number length, for VPD page 0x80. */ #define CTL_SN_LEN 16 /* * Device ID length, for VPD page 0x83. */ #define CTL_DEVID_LEN 64 #define CTL_DEVID_MIN_LEN 16 /* * WWPN length, for VPD page 0x83. */ #define CTL_WWPN_LEN 8 #define CTL_DRIVER_NAME_LEN 32 /* * Unit attention types. ASC/ASCQ values for these should be placed in * ctl_build_ua. These are also listed in order of reporting priority. * i.e. a poweron UA is reported first, bus reset second, etc. */ typedef enum { CTL_UA_NONE = 0x0000, CTL_UA_POWERON = 0x0001, CTL_UA_BUS_RESET = 0x0002, CTL_UA_TARG_RESET = 0x0004, CTL_UA_I_T_NEXUS_LOSS = 0x0008, CTL_UA_LUN_RESET = 0x0010, CTL_UA_LUN_CHANGE = 0x0020, CTL_UA_MODE_CHANGE = 0x0040, CTL_UA_LOG_CHANGE = 0x0080, CTL_UA_INQ_CHANGE = 0x0100, CTL_UA_RES_PREEMPT = 0x0400, CTL_UA_RES_RELEASE = 0x0800, CTL_UA_REG_PREEMPT = 0x1000, CTL_UA_ASYM_ACC_CHANGE = 0x2000, CTL_UA_CAPACITY_CHANGE = 0x4000, CTL_UA_THIN_PROV_THRES = 0x8000, CTL_UA_MEDIUM_CHANGE = 0x10000, CTL_UA_IE = 0x20000 } ctl_ua_type; #ifdef _KERNEL MALLOC_DECLARE(M_CTL); struct ctl_page_index; #ifdef SYSCTL_DECL /* from sysctl.h */ SYSCTL_DECL(_kern_cam_ctl); #endif struct ctl_lun; struct ctl_port; struct ctl_softc; /* * Put a string into an sbuf, escaping characters that are illegal or not * recommended in XML. Note this doesn't escape everything, just > < and &. */ int ctl_sbuf_printf_esc(struct sbuf *sb, char *str, int size); int ctl_ffz(uint32_t *mask, uint32_t first, uint32_t last); int ctl_set_mask(uint32_t *mask, uint32_t bit); int ctl_clear_mask(uint32_t *mask, uint32_t bit); int ctl_is_set(uint32_t *mask, uint32_t bit); int ctl_default_page_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, uint8_t *page_ptr); int ctl_ie_page_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, uint8_t *page_ptr); +int ctl_temp_log_sense_handler(struct ctl_scsiio *ctsio, + struct ctl_page_index *page_index, + int pc); int ctl_lbp_log_sense_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, int pc); int ctl_sap_log_sense_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, int pc); int ctl_ie_log_sense_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, int pc); int ctl_config_move_done(union ctl_io *io); void ctl_datamove(union ctl_io *io); void ctl_serseq_done(union ctl_io *io); void ctl_done(union ctl_io *io); void ctl_data_submit_done(union ctl_io *io); void ctl_config_read_done(union ctl_io *io); void ctl_config_write_done(union ctl_io *io); void ctl_portDB_changed(int portnum); int ctl_ioctl_io(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td); void ctl_est_ua(struct ctl_lun *lun, uint32_t initidx, ctl_ua_type ua); void ctl_est_ua_port(struct ctl_lun *lun, int port, uint32_t except, ctl_ua_type ua); void ctl_est_ua_all(struct ctl_lun *lun, uint32_t except, ctl_ua_type ua); void ctl_clr_ua(struct ctl_lun *lun, uint32_t initidx, ctl_ua_type ua); void ctl_clr_ua_all(struct ctl_lun *lun, uint32_t except, ctl_ua_type ua); void ctl_clr_ua_allluns(struct ctl_softc *ctl_softc, uint32_t initidx, ctl_ua_type ua_type); uint32_t ctl_decode_lun(uint64_t encoded); uint64_t ctl_encode_lun(uint32_t decoded); void ctl_isc_announce_lun(struct ctl_lun *lun); void ctl_isc_announce_port(struct ctl_port *port); void ctl_isc_announce_iid(struct ctl_port *port, int iid); void ctl_isc_announce_mode(struct ctl_lun *lun, uint32_t initidx, uint8_t page, uint8_t subpage); int ctl_expand_number(const char *buf, uint64_t *num); #endif /* _KERNEL */ #endif /* _CTL_H_ */ /* * vim: ts=8 */ Index: projects/nfsv42/sys/cam/ctl/ctl_private.h =================================================================== --- projects/nfsv42/sys/cam/ctl/ctl_private.h (revision 350367) +++ projects/nfsv42/sys/cam/ctl/ctl_private.h (revision 350368) @@ -1,550 +1,553 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003, 2004, 2005, 2008 Silicon Graphics International Corp. * Copyright (c) 2014-2017 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_private.h#7 $ * $FreeBSD$ */ /* * CAM Target Layer driver private data structures/definitions. * * Author: Ken Merry */ #ifndef _CTL_PRIVATE_H_ #define _CTL_PRIVATE_H_ #include #include #include /* * SCSI vendor and product names. */ #define CTL_VENDOR "FREEBSD " #define CTL_DIRECT_PRODUCT "CTLDISK " #define CTL_PROCESSOR_PRODUCT "CTLPROCESSOR " #define CTL_CDROM_PRODUCT "CTLCDROM " #define CTL_UNKNOWN_PRODUCT "CTLDEVICE " #define CTL_POOL_ENTRIES_OTHER_SC 200 struct ctl_io_pool { char name[64]; uint32_t id; struct ctl_softc *ctl_softc; struct uma_zone *zone; }; typedef enum { CTL_SER_BLOCK, CTL_SER_BLOCKOPT, CTL_SER_EXTENT, CTL_SER_EXTENTOPT, CTL_SER_EXTENTSEQ, CTL_SER_PASS, CTL_SER_SKIP } ctl_serialize_action; typedef enum { CTL_ACTION_BLOCK, CTL_ACTION_OVERLAP, CTL_ACTION_OVERLAP_TAG, CTL_ACTION_PASS, CTL_ACTION_SKIP, CTL_ACTION_ERROR } ctl_action; /* * WARNING: Keep the bottom nibble here free, we OR in the data direction * flags for each command. * * Note: "OK_ON_NO_LUN" == we don't have to have a lun configured * "OK_ON_BOTH" == we have to have a lun configured * "SA5" == command has 5-bit service action at byte 1 */ typedef enum { CTL_CMD_FLAG_NONE = 0x0000, CTL_CMD_FLAG_NO_SENSE = 0x0010, CTL_CMD_FLAG_ALLOW_ON_RESV = 0x0020, CTL_CMD_FLAG_ALLOW_ON_PR_RESV = 0x0040, CTL_CMD_FLAG_ALLOW_ON_PR_WRESV = 0x0080, CTL_CMD_FLAG_OK_ON_PROC = 0x0100, CTL_CMD_FLAG_OK_ON_DIRECT = 0x0200, CTL_CMD_FLAG_OK_ON_CDROM = 0x0400, CTL_CMD_FLAG_OK_ON_BOTH = 0x0700, CTL_CMD_FLAG_OK_ON_NO_LUN = 0x0800, CTL_CMD_FLAG_OK_ON_NO_MEDIA = 0x1000, CTL_CMD_FLAG_OK_ON_STANDBY = 0x2000, CTL_CMD_FLAG_OK_ON_UNAVAIL = 0x4000, CTL_CMD_FLAG_SA5 = 0x8000, CTL_CMD_FLAG_RUN_HERE = 0x10000 } ctl_cmd_flags; typedef enum { CTL_SERIDX_TUR = 0, CTL_SERIDX_READ, CTL_SERIDX_WRITE, CTL_SERIDX_UNMAP, CTL_SERIDX_SYNC, CTL_SERIDX_MD_SNS, CTL_SERIDX_MD_SEL, CTL_SERIDX_RQ_SNS, CTL_SERIDX_INQ, CTL_SERIDX_RD_CAP, CTL_SERIDX_RES, CTL_SERIDX_LOG_SNS, CTL_SERIDX_FORMAT, CTL_SERIDX_START, /* TBD: others to be filled in as needed */ CTL_SERIDX_COUNT, /* LAST, not a normal code, provides # codes */ CTL_SERIDX_INVLD = CTL_SERIDX_COUNT } ctl_seridx; typedef int ctl_opfunc(struct ctl_scsiio *ctsio); struct ctl_cmd_entry { ctl_opfunc *execute; ctl_seridx seridx; ctl_cmd_flags flags; ctl_lun_error_pattern pattern; uint8_t length; /* CDB length */ uint8_t usage[15]; /* Mask of allowed CDB bits * after the opcode byte. */ }; typedef enum { CTL_LUN_NONE = 0x000, CTL_LUN_CONTROL = 0x001, CTL_LUN_RESERVED = 0x002, CTL_LUN_INVALID = 0x004, CTL_LUN_DISABLED = 0x008, CTL_LUN_MALLOCED = 0x010, CTL_LUN_STOPPED = 0x020, CTL_LUN_NO_MEDIA = 0x040, CTL_LUN_EJECTED = 0x080, CTL_LUN_PR_RESERVED = 0x100, CTL_LUN_PRIMARY_SC = 0x200, CTL_LUN_READONLY = 0x800, CTL_LUN_PEER_SC_PRIMARY = 0x1000, CTL_LUN_REMOVABLE = 0x2000 } ctl_lun_flags; typedef enum { CTLBLOCK_FLAG_NONE = 0x00, CTLBLOCK_FLAG_INVALID = 0x01 } ctlblock_flags; union ctl_softcs { struct ctl_softc *ctl_softc; struct ctlblock_softc *ctlblock_softc; }; /* * Mode page defaults. */ #if 0 /* * These values make Solaris trim off some of the capacity. */ #define CTL_DEFAULT_SECTORS_PER_TRACK 63 #define CTL_DEFAULT_HEADS 255 /* * These values seem to work okay. */ #define CTL_DEFAULT_SECTORS_PER_TRACK 63 #define CTL_DEFAULT_HEADS 16 /* * These values work reasonably well. */ #define CTL_DEFAULT_SECTORS_PER_TRACK 512 #define CTL_DEFAULT_HEADS 64 #endif /* * Solaris is somewhat picky about how many heads and sectors per track you * have defined in mode pages 3 and 4. These values seem to cause Solaris * to get the capacity more or less right when you run the format tool. * They still have problems when dealing with devices larger than 1TB, * but there isn't anything we can do about that. * * For smaller LUN sizes, this ends up causing the number of cylinders to * work out to 0. Solaris actually recognizes that and comes up with its * own bogus geometry to fit the actual capacity of the drive. They really * should just give up on geometry and stick to the read capacity * information alone for modern disk drives. * * One thing worth mentioning about Solaris' mkfs command is that it * doesn't like sectors per track values larger than 256. 512 seems to * work okay for format, but causes problems when you try to make a * filesystem. * * Another caveat about these values: the product of these two values * really should be a power of 2. This is because of the simplistic * shift-based calculation that we have to use on the i386 platform to * calculate the number of cylinders here. (If you use a divide, you end * up calling __udivdi3(), which is a hardware FP call on the PC. On the * XScale, it is done in software, so you can do that from inside the * kernel.) * * So for the current values (256 S/T, 128 H), we get 32768, which works * very nicely for calculating cylinders. * * If you want to change these values so that their product is no longer a * power of 2, re-visit the calculation in ctl_init_page_index(). You may * need to make it a bit more complicated to get the number of cylinders * right. */ #define CTL_DEFAULT_SECTORS_PER_TRACK 256 #define CTL_DEFAULT_HEADS 128 #define CTL_DEFAULT_ROTATION_RATE SVPD_NON_ROTATING struct ctl_page_index; typedef int ctl_modesen_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, int pc); typedef int ctl_modesel_handler(struct ctl_scsiio *ctsio, struct ctl_page_index *page_index, uint8_t *page_ptr); typedef enum { CTL_PAGE_FLAG_NONE = 0x00, CTL_PAGE_FLAG_DIRECT = 0x01, CTL_PAGE_FLAG_PROC = 0x02, CTL_PAGE_FLAG_CDROM = 0x04, CTL_PAGE_FLAG_ALL = 0x07 } ctl_page_flags; struct ctl_page_index { uint8_t page_code; uint8_t subpage; uint16_t page_len; uint8_t *page_data; ctl_page_flags page_flags; ctl_modesen_handler *sense_handler; ctl_modesel_handler *select_handler; }; #define CTL_PAGE_CURRENT 0x00 #define CTL_PAGE_CHANGEABLE 0x01 #define CTL_PAGE_DEFAULT 0x02 #define CTL_PAGE_SAVED 0x03 #define CTL_NUM_LBP_PARAMS 4 #define CTL_NUM_LBP_THRESH 4 #define CTL_LBP_EXPONENT 11 /* 2048 sectors */ #define CTL_LBP_PERIOD 10 /* 10 seconds */ #define CTL_LBP_UA_PERIOD 300 /* 5 minutes */ struct ctl_logical_block_provisioning_page { struct scsi_logical_block_provisioning_page main; struct scsi_logical_block_provisioning_page_descr descr[CTL_NUM_LBP_THRESH]; }; static const struct ctl_page_index page_index_template[] = { {SMS_RW_ERROR_RECOVERY_PAGE, 0, sizeof(struct scsi_da_rw_recovery_page), NULL, CTL_PAGE_FLAG_DIRECT | CTL_PAGE_FLAG_CDROM, NULL, ctl_default_page_handler}, {SMS_FORMAT_DEVICE_PAGE, 0, sizeof(struct scsi_format_page), NULL, CTL_PAGE_FLAG_DIRECT, NULL, NULL}, {SMS_RIGID_DISK_PAGE, 0, sizeof(struct scsi_rigid_disk_page), NULL, CTL_PAGE_FLAG_DIRECT, NULL, NULL}, {SMS_VERIFY_ERROR_RECOVERY_PAGE, 0, sizeof(struct scsi_da_verify_recovery_page), NULL, CTL_PAGE_FLAG_DIRECT | CTL_PAGE_FLAG_CDROM, NULL, ctl_default_page_handler}, {SMS_CACHING_PAGE, 0, sizeof(struct scsi_caching_page), NULL, CTL_PAGE_FLAG_DIRECT | CTL_PAGE_FLAG_CDROM, NULL, ctl_default_page_handler}, {SMS_CONTROL_MODE_PAGE, 0, sizeof(struct scsi_control_page), NULL, CTL_PAGE_FLAG_ALL, NULL, ctl_default_page_handler}, {SMS_CONTROL_MODE_PAGE | SMPH_SPF, 0x01, sizeof(struct scsi_control_ext_page), NULL, CTL_PAGE_FLAG_ALL, NULL, ctl_default_page_handler}, {SMS_INFO_EXCEPTIONS_PAGE, 0, sizeof(struct scsi_info_exceptions_page), NULL, CTL_PAGE_FLAG_ALL, NULL, ctl_ie_page_handler}, {SMS_INFO_EXCEPTIONS_PAGE | SMPH_SPF, 0x02, sizeof(struct ctl_logical_block_provisioning_page), NULL, CTL_PAGE_FLAG_DIRECT, NULL, ctl_default_page_handler}, {SMS_CDDVD_CAPS_PAGE, 0, sizeof(struct scsi_cddvd_capabilities_page), NULL, CTL_PAGE_FLAG_CDROM, NULL, NULL}, }; #define CTL_NUM_MODE_PAGES sizeof(page_index_template)/ \ sizeof(page_index_template[0]) struct ctl_mode_pages { struct scsi_da_rw_recovery_page rw_er_page[4]; struct scsi_format_page format_page[4]; struct scsi_rigid_disk_page rigid_disk_page[4]; struct scsi_da_verify_recovery_page verify_er_page[4]; struct scsi_caching_page caching_page[4]; struct scsi_control_page control_page[4]; struct scsi_control_ext_page control_ext_page[4]; struct scsi_info_exceptions_page ie_page[4]; struct ctl_logical_block_provisioning_page lbp_page[4]; struct scsi_cddvd_capabilities_page cddvd_page[4]; struct ctl_page_index index[CTL_NUM_MODE_PAGES]; }; #define MODE_RWER mode_pages.rw_er_page[CTL_PAGE_CURRENT] #define MODE_FMT mode_pages.format_page[CTL_PAGE_CURRENT] #define MODE_RDISK mode_pages.rigid_disk_page[CTL_PAGE_CURRENT] #define MODE_VER mode_pages.verify_er_page[CTL_PAGE_CURRENT] #define MODE_CACHING mode_pages.caching_page[CTL_PAGE_CURRENT] #define MODE_CTRL mode_pages.control_page[CTL_PAGE_CURRENT] #define MODE_CTRLE mode_pages.control_ext_page[CTL_PAGE_CURRENT] #define MODE_IE mode_pages.ie_page[CTL_PAGE_CURRENT] #define MODE_LBP mode_pages.lbp_page[CTL_PAGE_CURRENT] #define MODE_CDDVD mode_pages.cddvd_page[CTL_PAGE_CURRENT] static const struct ctl_page_index log_page_index_template[] = { {SLS_SUPPORTED_PAGES_PAGE, 0, 0, NULL, CTL_PAGE_FLAG_ALL, NULL, NULL}, {SLS_SUPPORTED_PAGES_PAGE, SLS_SUPPORTED_SUBPAGES_SUBPAGE, 0, NULL, CTL_PAGE_FLAG_ALL, NULL, NULL}, + {SLS_TEMPERATURE, 0, 0, NULL, + CTL_PAGE_FLAG_DIRECT, ctl_temp_log_sense_handler, NULL}, {SLS_LOGICAL_BLOCK_PROVISIONING, 0, 0, NULL, CTL_PAGE_FLAG_DIRECT, ctl_lbp_log_sense_handler, NULL}, {SLS_STAT_AND_PERF, 0, 0, NULL, CTL_PAGE_FLAG_ALL, ctl_sap_log_sense_handler, NULL}, {SLS_IE_PAGE, 0, 0, NULL, CTL_PAGE_FLAG_ALL, ctl_ie_log_sense_handler, NULL}, }; #define CTL_NUM_LOG_PAGES sizeof(log_page_index_template)/ \ sizeof(log_page_index_template[0]) struct ctl_log_pages { uint8_t pages_page[CTL_NUM_LOG_PAGES]; uint8_t subpages_page[CTL_NUM_LOG_PAGES * 2]; uint8_t lbp_page[12*CTL_NUM_LBP_PARAMS]; struct stat_page { struct scsi_log_stat_and_perf sap; struct scsi_log_idle_time it; struct scsi_log_time_interval ti; } stat_page; + struct scsi_log_temperature temp_page[2]; struct scsi_log_informational_exceptions ie_page; struct ctl_page_index index[CTL_NUM_LOG_PAGES]; }; struct ctl_lun_delay_info { ctl_delay_type datamove_type; uint32_t datamove_delay; ctl_delay_type done_type; uint32_t done_delay; }; #define CTL_PR_ALL_REGISTRANTS 0xFFFFFFFF #define CTL_PR_NO_RESERVATION 0xFFFFFFF0 struct ctl_devid { int len; uint8_t data[]; }; #define NUM_HA_SHELVES 2 #define CTL_WRITE_BUFFER_SIZE 262144 struct tpc_list; struct ctl_lun { struct mtx lun_lock; uint64_t lun; ctl_lun_flags flags; STAILQ_HEAD(,ctl_error_desc) error_list; uint64_t error_serial; struct ctl_softc *ctl_softc; struct ctl_be_lun *be_lun; struct ctl_backend_driver *backend; struct ctl_lun_delay_info delay_info; #ifdef CTL_TIME_IO sbintime_t idle_time; sbintime_t last_busy; #endif TAILQ_HEAD(ctl_ooaq, ctl_io_hdr) ooa_queue; STAILQ_ENTRY(ctl_lun) links; struct scsi_sense_data **pending_sense; ctl_ua_type **pending_ua; uint8_t ua_tpt_info[8]; time_t lasttpt; uint8_t ie_asc; /* Informational exceptions */ uint8_t ie_ascq; int ie_reported; /* Already reported */ uint32_t ie_reportcnt; /* REPORT COUNT */ struct callout ie_callout; /* INTERVAL TIMER */ struct ctl_mode_pages mode_pages; struct ctl_log_pages log_pages; struct ctl_io_stats stats; uint32_t res_idx; uint32_t pr_generation; uint64_t **pr_keys; int pr_key_count; uint32_t pr_res_idx; uint8_t pr_res_type; int prevent_count; uint32_t *prevent; uint8_t *write_buffer; struct ctl_devid *lun_devid; TAILQ_HEAD(tpc_lists, tpc_list) tpc_lists; }; typedef enum { CTL_FLAG_ACTIVE_SHELF = 0x04 } ctl_gen_flags; #define CTL_MAX_THREADS 16 struct ctl_thread { struct mtx_padalign queue_lock; struct ctl_softc *ctl_softc; struct thread *thread; STAILQ_HEAD(, ctl_io_hdr) incoming_queue; STAILQ_HEAD(, ctl_io_hdr) rtr_queue; STAILQ_HEAD(, ctl_io_hdr) done_queue; STAILQ_HEAD(, ctl_io_hdr) isc_queue; }; struct tpc_token; struct ctl_softc { struct mtx ctl_lock; struct cdev *dev; int num_luns; ctl_gen_flags flags; ctl_ha_mode ha_mode; int ha_id; int is_single; ctl_ha_link_state ha_link; int port_min; int port_max; int port_cnt; int init_min; int init_max; struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; void *othersc_pool; struct proc *ctl_proc; uint32_t *ctl_lun_mask; struct ctl_lun **ctl_luns; uint32_t *ctl_port_mask; STAILQ_HEAD(, ctl_lun) lun_list; STAILQ_HEAD(, ctl_be_lun) pending_lun_queue; uint32_t num_frontends; STAILQ_HEAD(, ctl_frontend) fe_list; uint32_t num_ports; STAILQ_HEAD(, ctl_port) port_list; struct ctl_port **ctl_ports; uint32_t num_backends; STAILQ_HEAD(, ctl_backend_driver) be_list; struct uma_zone *io_zone; uint32_t cur_pool_id; int shutdown; struct ctl_thread threads[CTL_MAX_THREADS]; struct thread *lun_thread; struct thread *thresh_thread; TAILQ_HEAD(tpc_tokens, tpc_token) tpc_tokens; struct callout tpc_timeout; struct mtx tpc_lock; }; #ifdef _KERNEL extern const struct ctl_cmd_entry ctl_cmd_table[256]; uint32_t ctl_get_initindex(struct ctl_nexus *nexus); int ctl_lun_map_init(struct ctl_port *port); int ctl_lun_map_deinit(struct ctl_port *port); int ctl_lun_map_set(struct ctl_port *port, uint32_t plun, uint32_t glun); int ctl_lun_map_unset(struct ctl_port *port, uint32_t plun); uint32_t ctl_lun_map_from_port(struct ctl_port *port, uint32_t plun); uint32_t ctl_lun_map_to_port(struct ctl_port *port, uint32_t glun); int ctl_pool_create(struct ctl_softc *ctl_softc, const char *pool_name, uint32_t total_ctl_io, void **npool); void ctl_pool_free(struct ctl_io_pool *pool); int ctl_scsi_release(struct ctl_scsiio *ctsio); int ctl_scsi_reserve(struct ctl_scsiio *ctsio); int ctl_start_stop(struct ctl_scsiio *ctsio); int ctl_prevent_allow(struct ctl_scsiio *ctsio); int ctl_sync_cache(struct ctl_scsiio *ctsio); int ctl_format(struct ctl_scsiio *ctsio); int ctl_read_buffer(struct ctl_scsiio *ctsio); int ctl_write_buffer(struct ctl_scsiio *ctsio); int ctl_write_same(struct ctl_scsiio *ctsio); int ctl_unmap(struct ctl_scsiio *ctsio); int ctl_mode_select(struct ctl_scsiio *ctsio); int ctl_mode_sense(struct ctl_scsiio *ctsio); int ctl_log_sense(struct ctl_scsiio *ctsio); int ctl_read_capacity(struct ctl_scsiio *ctsio); int ctl_read_capacity_16(struct ctl_scsiio *ctsio); int ctl_read_defect(struct ctl_scsiio *ctsio); int ctl_read_toc(struct ctl_scsiio *ctsio); int ctl_read_write(struct ctl_scsiio *ctsio); int ctl_cnw(struct ctl_scsiio *ctsio); int ctl_report_luns(struct ctl_scsiio *ctsio); int ctl_request_sense(struct ctl_scsiio *ctsio); int ctl_tur(struct ctl_scsiio *ctsio); int ctl_verify(struct ctl_scsiio *ctsio); int ctl_inquiry(struct ctl_scsiio *ctsio); int ctl_get_config(struct ctl_scsiio *ctsio); int ctl_get_event_status(struct ctl_scsiio *ctsio); int ctl_mechanism_status(struct ctl_scsiio *ctsio); int ctl_persistent_reserve_in(struct ctl_scsiio *ctsio); int ctl_persistent_reserve_out(struct ctl_scsiio *ctsio); int ctl_report_tagret_port_groups(struct ctl_scsiio *ctsio); int ctl_report_supported_opcodes(struct ctl_scsiio *ctsio); int ctl_report_supported_tmf(struct ctl_scsiio *ctsio); int ctl_report_timestamp(struct ctl_scsiio *ctsio); int ctl_get_lba_status(struct ctl_scsiio *ctsio); void ctl_tpc_init(struct ctl_softc *softc); void ctl_tpc_shutdown(struct ctl_softc *softc); void ctl_tpc_lun_init(struct ctl_lun *lun); void ctl_tpc_lun_clear(struct ctl_lun *lun, uint32_t initidx); void ctl_tpc_lun_shutdown(struct ctl_lun *lun); int ctl_inquiry_evpd_tpc(struct ctl_scsiio *ctsio, int alloc_len); int ctl_receive_copy_status_lid1(struct ctl_scsiio *ctsio); int ctl_receive_copy_failure_details(struct ctl_scsiio *ctsio); int ctl_receive_copy_status_lid4(struct ctl_scsiio *ctsio); int ctl_receive_copy_operating_parameters(struct ctl_scsiio *ctsio); int ctl_extended_copy_lid1(struct ctl_scsiio *ctsio); int ctl_extended_copy_lid4(struct ctl_scsiio *ctsio); int ctl_copy_operation_abort(struct ctl_scsiio *ctsio); int ctl_populate_token(struct ctl_scsiio *ctsio); int ctl_write_using_token(struct ctl_scsiio *ctsio); int ctl_receive_rod_token_information(struct ctl_scsiio *ctsio); int ctl_report_all_rod_tokens(struct ctl_scsiio *ctsio); #endif /* _KERNEL */ #endif /* _CTL_PRIVATE_H_ */ /* * vim: ts=8 */ Index: projects/nfsv42/sys/cam/scsi/scsi_all.c =================================================================== --- projects/nfsv42/sys/cam/scsi/scsi_all.c (revision 350367) +++ projects/nfsv42/sys/cam/scsi/scsi_all.c (revision 350368) @@ -1,9251 +1,9252 @@ /*- * Implementation of Utility functions for all SCSI device types. * * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997, 1998, 1999 Justin T. Gibbs. * Copyright (c) 1997, 1998, 2003 Kenneth D. Merry. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #ifdef _KERNEL #include "opt_scsi.h" #include #include #include #include #include #include #include #include #else #include #include #include #include #include #endif #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #else #include #include #ifndef FALSE #define FALSE 0 #endif /* FALSE */ #ifndef TRUE #define TRUE 1 #endif /* TRUE */ #define ERESTART -1 /* restart syscall */ #define EJUSTRETURN -2 /* don't modify regs, just return */ #endif /* !_KERNEL */ /* * This is the default number of milliseconds we wait for devices to settle * after a SCSI bus reset. */ #ifndef SCSI_DELAY #define SCSI_DELAY 2000 #endif /* * All devices need _some_ sort of bus settle delay, so we'll set it to * a minimum value of 100ms. Note that this is pertinent only for SPI- * not transport like Fibre Channel or iSCSI where 'delay' is completely * meaningless. */ #ifndef SCSI_MIN_DELAY #define SCSI_MIN_DELAY 100 #endif /* * Make sure the user isn't using seconds instead of milliseconds. */ #if (SCSI_DELAY < SCSI_MIN_DELAY && SCSI_DELAY != 0) #error "SCSI_DELAY is in milliseconds, not seconds! Please use a larger value" #endif int scsi_delay; static int ascentrycomp(const void *key, const void *member); static int senseentrycomp(const void *key, const void *member); static void fetchtableentries(int sense_key, int asc, int ascq, struct scsi_inquiry_data *, const struct sense_key_table_entry **, const struct asc_table_entry **); #ifdef _KERNEL static void init_scsi_delay(void); static int sysctl_scsi_delay(SYSCTL_HANDLER_ARGS); static int set_scsi_delay(int delay); #endif #if !defined(SCSI_NO_OP_STRINGS) #define D (1 << T_DIRECT) #define T (1 << T_SEQUENTIAL) #define L (1 << T_PRINTER) #define P (1 << T_PROCESSOR) #define W (1 << T_WORM) #define R (1 << T_CDROM) #define O (1 << T_OPTICAL) #define M (1 << T_CHANGER) #define A (1 << T_STORARRAY) #define E (1 << T_ENCLOSURE) #define B (1 << T_RBC) #define K (1 << T_OCRW) #define V (1 << T_ADC) #define F (1 << T_OSD) #define S (1 << T_SCANNER) #define C (1 << T_COMM) #define ALL (D | T | L | P | W | R | O | M | A | E | B | K | V | F | S | C) static struct op_table_entry plextor_cd_ops[] = { { 0xD8, R, "CD-DA READ" } }; static struct scsi_op_quirk_entry scsi_op_quirk_table[] = { { /* * I believe that 0xD8 is the Plextor proprietary command * to read CD-DA data. I'm not sure which Plextor CDROM * models support the command, though. I know for sure * that the 4X, 8X, and 12X models do, and presumably the * 12-20X does. I don't know about any earlier models, * though. If anyone has any more complete information, * feel free to change this quirk entry. */ {T_CDROM, SIP_MEDIA_REMOVABLE, "PLEXTOR", "CD-ROM PX*", "*"}, nitems(plextor_cd_ops), plextor_cd_ops } }; static struct op_table_entry scsi_op_codes[] = { /* * From: http://www.t10.org/lists/op-num.txt * Modifications by Kenneth Merry (ken@FreeBSD.ORG) * and Jung-uk Kim (jkim@FreeBSD.org) * * Note: order is important in this table, scsi_op_desc() currently * depends on the opcodes in the table being in order to save * search time. * Note: scanner and comm. devices are carried over from the previous * version because they were removed in the latest spec. */ /* File: OP-NUM.TXT * * SCSI Operation Codes * Numeric Sorted Listing * as of 5/26/15 * * D - DIRECT ACCESS DEVICE (SBC-2) device column key * .T - SEQUENTIAL ACCESS DEVICE (SSC-2) ----------------- * . L - PRINTER DEVICE (SSC) M = Mandatory * . P - PROCESSOR DEVICE (SPC) O = Optional * . .W - WRITE ONCE READ MULTIPLE DEVICE (SBC-2) V = Vendor spec. * . . R - CD/DVE DEVICE (MMC-3) Z = Obsolete * . . O - OPTICAL MEMORY DEVICE (SBC-2) * . . .M - MEDIA CHANGER DEVICE (SMC-2) * . . . A - STORAGE ARRAY DEVICE (SCC-2) * . . . .E - ENCLOSURE SERVICES DEVICE (SES) * . . . .B - SIMPLIFIED DIRECT-ACCESS DEVICE (RBC) * . . . . K - OPTICAL CARD READER/WRITER DEVICE (OCRW) * . . . . V - AUTOMATION/DRIVE INTERFACE (ADC) * . . . . .F - OBJECT-BASED STORAGE (OSD) * OP DTLPWROMAEBKVF Description * -- -------------- ---------------------------------------------- */ /* 00 MMMMMMMMMMMMMM TEST UNIT READY */ { 0x00, ALL, "TEST UNIT READY" }, /* 01 M REWIND */ { 0x01, T, "REWIND" }, /* 01 Z V ZZZZ REZERO UNIT */ { 0x01, D | W | R | O | M, "REZERO UNIT" }, /* 02 VVVVVV V */ /* 03 MMMMMMMMMMOMMM REQUEST SENSE */ { 0x03, ALL, "REQUEST SENSE" }, /* 04 M OO FORMAT UNIT */ { 0x04, D | R | O, "FORMAT UNIT" }, /* 04 O FORMAT MEDIUM */ { 0x04, T, "FORMAT MEDIUM" }, /* 04 O FORMAT */ { 0x04, L, "FORMAT" }, /* 05 VMVVVV V READ BLOCK LIMITS */ { 0x05, T, "READ BLOCK LIMITS" }, /* 06 VVVVVV V */ /* 07 OVV O OV REASSIGN BLOCKS */ { 0x07, D | W | O, "REASSIGN BLOCKS" }, /* 07 O INITIALIZE ELEMENT STATUS */ { 0x07, M, "INITIALIZE ELEMENT STATUS" }, /* 08 MOV O OV READ(6) */ { 0x08, D | T | W | O, "READ(6)" }, /* 08 O RECEIVE */ { 0x08, P, "RECEIVE" }, /* 08 GET MESSAGE(6) */ { 0x08, C, "GET MESSAGE(6)" }, /* 09 VVVVVV V */ /* 0A OO O OV WRITE(6) */ { 0x0A, D | T | W | O, "WRITE(6)" }, /* 0A M SEND(6) */ { 0x0A, P, "SEND(6)" }, /* 0A SEND MESSAGE(6) */ { 0x0A, C, "SEND MESSAGE(6)" }, /* 0A M PRINT */ { 0x0A, L, "PRINT" }, /* 0B Z ZOZV SEEK(6) */ { 0x0B, D | W | R | O, "SEEK(6)" }, /* 0B O SET CAPACITY */ { 0x0B, T, "SET CAPACITY" }, /* 0B O SLEW AND PRINT */ { 0x0B, L, "SLEW AND PRINT" }, /* 0C VVVVVV V */ /* 0D VVVVVV V */ /* 0E VVVVVV V */ /* 0F VOVVVV V READ REVERSE(6) */ { 0x0F, T, "READ REVERSE(6)" }, /* 10 VM VVV WRITE FILEMARKS(6) */ { 0x10, T, "WRITE FILEMARKS(6)" }, /* 10 O SYNCHRONIZE BUFFER */ { 0x10, L, "SYNCHRONIZE BUFFER" }, /* 11 VMVVVV SPACE(6) */ { 0x11, T, "SPACE(6)" }, /* 12 MMMMMMMMMMMMMM INQUIRY */ { 0x12, ALL, "INQUIRY" }, /* 13 V VVVV */ /* 13 O VERIFY(6) */ { 0x13, T, "VERIFY(6)" }, /* 14 VOOVVV RECOVER BUFFERED DATA */ { 0x14, T | L, "RECOVER BUFFERED DATA" }, /* 15 OMO O OOOO OO MODE SELECT(6) */ { 0x15, ALL & ~(P | R | B | F), "MODE SELECT(6)" }, /* 16 ZZMZO OOOZ O RESERVE(6) */ { 0x16, ALL & ~(R | B | V | F | C), "RESERVE(6)" }, /* 16 Z RESERVE ELEMENT(6) */ { 0x16, M, "RESERVE ELEMENT(6)" }, /* 17 ZZMZO OOOZ O RELEASE(6) */ { 0x17, ALL & ~(R | B | V | F | C), "RELEASE(6)" }, /* 17 Z RELEASE ELEMENT(6) */ { 0x17, M, "RELEASE ELEMENT(6)" }, /* 18 ZZZZOZO Z COPY */ { 0x18, D | T | L | P | W | R | O | K | S, "COPY" }, /* 19 VMVVVV ERASE(6) */ { 0x19, T, "ERASE(6)" }, /* 1A OMO O OOOO OO MODE SENSE(6) */ { 0x1A, ALL & ~(P | R | B | F), "MODE SENSE(6)" }, /* 1B O OOO O MO O START STOP UNIT */ { 0x1B, D | W | R | O | A | B | K | F, "START STOP UNIT" }, /* 1B O M LOAD UNLOAD */ { 0x1B, T | V, "LOAD UNLOAD" }, /* 1B SCAN */ { 0x1B, S, "SCAN" }, /* 1B O STOP PRINT */ { 0x1B, L, "STOP PRINT" }, /* 1B O OPEN/CLOSE IMPORT/EXPORT ELEMENT */ { 0x1B, M, "OPEN/CLOSE IMPORT/EXPORT ELEMENT" }, /* 1C OOOOO OOOM OOO RECEIVE DIAGNOSTIC RESULTS */ { 0x1C, ALL & ~(R | B), "RECEIVE DIAGNOSTIC RESULTS" }, /* 1D MMMMM MMOM MMM SEND DIAGNOSTIC */ { 0x1D, ALL & ~(R | B), "SEND DIAGNOSTIC" }, /* 1E OO OOOO O O PREVENT ALLOW MEDIUM REMOVAL */ { 0x1E, D | T | W | R | O | M | K | F, "PREVENT ALLOW MEDIUM REMOVAL" }, /* 1F */ /* 20 V VVV V */ /* 21 V VVV V */ /* 22 V VVV V */ /* 23 V V V V */ /* 23 O READ FORMAT CAPACITIES */ { 0x23, R, "READ FORMAT CAPACITIES" }, /* 24 V VV SET WINDOW */ { 0x24, S, "SET WINDOW" }, /* 25 M M M M READ CAPACITY(10) */ { 0x25, D | W | O | B, "READ CAPACITY(10)" }, /* 25 O READ CAPACITY */ { 0x25, R, "READ CAPACITY" }, /* 25 M READ CARD CAPACITY */ { 0x25, K, "READ CARD CAPACITY" }, /* 25 GET WINDOW */ { 0x25, S, "GET WINDOW" }, /* 26 V VV */ /* 27 V VV */ /* 28 M MOM MM READ(10) */ { 0x28, D | W | R | O | B | K | S, "READ(10)" }, /* 28 GET MESSAGE(10) */ { 0x28, C, "GET MESSAGE(10)" }, /* 29 V VVO READ GENERATION */ { 0x29, O, "READ GENERATION" }, /* 2A O MOM MO WRITE(10) */ { 0x2A, D | W | R | O | B | K, "WRITE(10)" }, /* 2A SEND(10) */ { 0x2A, S, "SEND(10)" }, /* 2A SEND MESSAGE(10) */ { 0x2A, C, "SEND MESSAGE(10)" }, /* 2B Z OOO O SEEK(10) */ { 0x2B, D | W | R | O | K, "SEEK(10)" }, /* 2B O LOCATE(10) */ { 0x2B, T, "LOCATE(10)" }, /* 2B O POSITION TO ELEMENT */ { 0x2B, M, "POSITION TO ELEMENT" }, /* 2C V OO ERASE(10) */ { 0x2C, R | O, "ERASE(10)" }, /* 2D O READ UPDATED BLOCK */ { 0x2D, O, "READ UPDATED BLOCK" }, /* 2D V */ /* 2E O OOO MO WRITE AND VERIFY(10) */ { 0x2E, D | W | R | O | B | K, "WRITE AND VERIFY(10)" }, /* 2F O OOO VERIFY(10) */ { 0x2F, D | W | R | O, "VERIFY(10)" }, /* 30 Z ZZZ SEARCH DATA HIGH(10) */ { 0x30, D | W | R | O, "SEARCH DATA HIGH(10)" }, /* 31 Z ZZZ SEARCH DATA EQUAL(10) */ { 0x31, D | W | R | O, "SEARCH DATA EQUAL(10)" }, /* 31 OBJECT POSITION */ { 0x31, S, "OBJECT POSITION" }, /* 32 Z ZZZ SEARCH DATA LOW(10) */ { 0x32, D | W | R | O, "SEARCH DATA LOW(10)" }, /* 33 Z OZO SET LIMITS(10) */ { 0x33, D | W | R | O, "SET LIMITS(10)" }, /* 34 O O O O PRE-FETCH(10) */ { 0x34, D | W | O | K, "PRE-FETCH(10)" }, /* 34 M READ POSITION */ { 0x34, T, "READ POSITION" }, /* 34 GET DATA BUFFER STATUS */ { 0x34, S, "GET DATA BUFFER STATUS" }, /* 35 O OOO MO SYNCHRONIZE CACHE(10) */ { 0x35, D | W | R | O | B | K, "SYNCHRONIZE CACHE(10)" }, /* 36 Z O O O LOCK UNLOCK CACHE(10) */ { 0x36, D | W | O | K, "LOCK UNLOCK CACHE(10)" }, /* 37 O O READ DEFECT DATA(10) */ { 0x37, D | O, "READ DEFECT DATA(10)" }, /* 37 O INITIALIZE ELEMENT STATUS WITH RANGE */ { 0x37, M, "INITIALIZE ELEMENT STATUS WITH RANGE" }, /* 38 O O O MEDIUM SCAN */ { 0x38, W | O | K, "MEDIUM SCAN" }, /* 39 ZZZZOZO Z COMPARE */ { 0x39, D | T | L | P | W | R | O | K | S, "COMPARE" }, /* 3A ZZZZOZO Z COPY AND VERIFY */ { 0x3A, D | T | L | P | W | R | O | K | S, "COPY AND VERIFY" }, /* 3B OOOOOOOOOOMOOO WRITE BUFFER */ { 0x3B, ALL, "WRITE BUFFER" }, /* 3C OOOOOOOOOO OOO READ BUFFER */ { 0x3C, ALL & ~(B), "READ BUFFER" }, /* 3D O UPDATE BLOCK */ { 0x3D, O, "UPDATE BLOCK" }, /* 3E O O O READ LONG(10) */ { 0x3E, D | W | O, "READ LONG(10)" }, /* 3F O O O WRITE LONG(10) */ { 0x3F, D | W | O, "WRITE LONG(10)" }, /* 40 ZZZZOZOZ CHANGE DEFINITION */ { 0x40, D | T | L | P | W | R | O | M | S | C, "CHANGE DEFINITION" }, /* 41 O WRITE SAME(10) */ { 0x41, D, "WRITE SAME(10)" }, - /* 42 O UNMAP */ + /* 42 O UNMAP */ { 0x42, D, "UNMAP" }, /* 42 O READ SUB-CHANNEL */ { 0x42, R, "READ SUB-CHANNEL" }, /* 43 O READ TOC/PMA/ATIP */ { 0x43, R, "READ TOC/PMA/ATIP" }, /* 44 M M REPORT DENSITY SUPPORT */ { 0x44, T | V, "REPORT DENSITY SUPPORT" }, /* 44 READ HEADER */ /* 45 O PLAY AUDIO(10) */ { 0x45, R, "PLAY AUDIO(10)" }, /* 46 M GET CONFIGURATION */ { 0x46, R, "GET CONFIGURATION" }, /* 47 O PLAY AUDIO MSF */ { 0x47, R, "PLAY AUDIO MSF" }, - /* 48 */ + /* 48 O SANITIZE */ + { 0x48, D, "SANITIZE" }, /* 49 */ /* 4A M GET EVENT STATUS NOTIFICATION */ { 0x4A, R, "GET EVENT STATUS NOTIFICATION" }, /* 4B O PAUSE/RESUME */ { 0x4B, R, "PAUSE/RESUME" }, /* 4C OOOOO OOOO OOO LOG SELECT */ { 0x4C, ALL & ~(R | B), "LOG SELECT" }, /* 4D OOOOO OOOO OMO LOG SENSE */ { 0x4D, ALL & ~(R | B), "LOG SENSE" }, /* 4E O STOP PLAY/SCAN */ { 0x4E, R, "STOP PLAY/SCAN" }, /* 4F */ /* 50 O XDWRITE(10) */ { 0x50, D, "XDWRITE(10)" }, /* 51 O XPWRITE(10) */ { 0x51, D, "XPWRITE(10)" }, /* 51 O READ DISC INFORMATION */ { 0x51, R, "READ DISC INFORMATION" }, /* 52 O XDREAD(10) */ { 0x52, D, "XDREAD(10)" }, /* 52 O READ TRACK INFORMATION */ { 0x52, R, "READ TRACK INFORMATION" }, /* 53 O RESERVE TRACK */ { 0x53, R, "RESERVE TRACK" }, /* 54 O SEND OPC INFORMATION */ { 0x54, R, "SEND OPC INFORMATION" }, /* 55 OOO OMOOOOMOMO MODE SELECT(10) */ { 0x55, ALL & ~(P), "MODE SELECT(10)" }, /* 56 ZZMZO OOOZ RESERVE(10) */ { 0x56, ALL & ~(R | B | K | V | F | C), "RESERVE(10)" }, /* 56 Z RESERVE ELEMENT(10) */ { 0x56, M, "RESERVE ELEMENT(10)" }, /* 57 ZZMZO OOOZ RELEASE(10) */ { 0x57, ALL & ~(R | B | K | V | F | C), "RELEASE(10)" }, /* 57 Z RELEASE ELEMENT(10) */ { 0x57, M, "RELEASE ELEMENT(10)" }, /* 58 O REPAIR TRACK */ { 0x58, R, "REPAIR TRACK" }, /* 59 */ /* 5A OOO OMOOOOMOMO MODE SENSE(10) */ { 0x5A, ALL & ~(P), "MODE SENSE(10)" }, /* 5B O CLOSE TRACK/SESSION */ { 0x5B, R, "CLOSE TRACK/SESSION" }, /* 5C O READ BUFFER CAPACITY */ { 0x5C, R, "READ BUFFER CAPACITY" }, /* 5D O SEND CUE SHEET */ { 0x5D, R, "SEND CUE SHEET" }, /* 5E OOOOO OOOO M PERSISTENT RESERVE IN */ { 0x5E, ALL & ~(R | B | K | V | C), "PERSISTENT RESERVE IN" }, /* 5F OOOOO OOOO M PERSISTENT RESERVE OUT */ { 0x5F, ALL & ~(R | B | K | V | C), "PERSISTENT RESERVE OUT" }, /* 7E OO O OOOO O extended CDB */ { 0x7E, D | T | R | M | A | E | B | V, "extended CDB" }, /* 7F O M variable length CDB (more than 16 bytes) */ { 0x7F, D | F, "variable length CDB (more than 16 bytes)" }, /* 80 Z XDWRITE EXTENDED(16) */ { 0x80, D, "XDWRITE EXTENDED(16)" }, /* 80 M WRITE FILEMARKS(16) */ { 0x80, T, "WRITE FILEMARKS(16)" }, /* 81 Z REBUILD(16) */ { 0x81, D, "REBUILD(16)" }, /* 81 O READ REVERSE(16) */ { 0x81, T, "READ REVERSE(16)" }, /* 82 Z REGENERATE(16) */ { 0x82, D, "REGENERATE(16)" }, /* 83 OOOOO O OO EXTENDED COPY */ { 0x83, D | T | L | P | W | O | K | V, "EXTENDED COPY" }, /* 84 OOOOO O OO RECEIVE COPY RESULTS */ { 0x84, D | T | L | P | W | O | K | V, "RECEIVE COPY RESULTS" }, /* 85 O O O ATA COMMAND PASS THROUGH(16) */ { 0x85, D | R | B, "ATA COMMAND PASS THROUGH(16)" }, /* 86 OO OO OOOOOOO ACCESS CONTROL IN */ { 0x86, ALL & ~(L | R | F), "ACCESS CONTROL IN" }, /* 87 OO OO OOOOOOO ACCESS CONTROL OUT */ { 0x87, ALL & ~(L | R | F), "ACCESS CONTROL OUT" }, /* 88 MM O O O READ(16) */ { 0x88, D | T | W | O | B, "READ(16)" }, /* 89 O COMPARE AND WRITE*/ { 0x89, D, "COMPARE AND WRITE" }, /* 8A OM O O O WRITE(16) */ { 0x8A, D | T | W | O | B, "WRITE(16)" }, /* 8B O ORWRITE */ { 0x8B, D, "ORWRITE" }, /* 8C OO O OO O M READ ATTRIBUTE */ { 0x8C, D | T | W | O | M | B | V, "READ ATTRIBUTE" }, /* 8D OO O OO O O WRITE ATTRIBUTE */ { 0x8D, D | T | W | O | M | B | V, "WRITE ATTRIBUTE" }, /* 8E O O O O WRITE AND VERIFY(16) */ { 0x8E, D | W | O | B, "WRITE AND VERIFY(16)" }, /* 8F OO O O O VERIFY(16) */ { 0x8F, D | T | W | O | B, "VERIFY(16)" }, /* 90 O O O O PRE-FETCH(16) */ { 0x90, D | W | O | B, "PRE-FETCH(16)" }, /* 91 O O O O SYNCHRONIZE CACHE(16) */ { 0x91, D | W | O | B, "SYNCHRONIZE CACHE(16)" }, /* 91 O SPACE(16) */ { 0x91, T, "SPACE(16)" }, /* 92 Z O O LOCK UNLOCK CACHE(16) */ { 0x92, D | W | O, "LOCK UNLOCK CACHE(16)" }, /* 92 O LOCATE(16) */ { 0x92, T, "LOCATE(16)" }, /* 93 O WRITE SAME(16) */ { 0x93, D, "WRITE SAME(16)" }, /* 93 M ERASE(16) */ { 0x93, T, "ERASE(16)" }, /* 94 O ZBC OUT */ { 0x94, ALL, "ZBC OUT" }, /* 95 O ZBC IN */ { 0x95, ALL, "ZBC IN" }, /* 96 */ /* 97 */ /* 98 */ /* 99 */ /* 9A O WRITE STREAM(16) */ { 0x9A, D, "WRITE STREAM(16)" }, /* 9B OOOOOOOOOO OOO READ BUFFER(16) */ { 0x9B, ALL & ~(B) , "READ BUFFER(16)" }, /* 9C O WRITE ATOMIC(16) */ { 0x9C, D, "WRITE ATOMIC(16)" }, /* 9D SERVICE ACTION BIDIRECTIONAL */ { 0x9D, ALL, "SERVICE ACTION BIDIRECTIONAL" }, /* XXX KDM ALL for this? op-num.txt defines it for none.. */ /* 9E SERVICE ACTION IN(16) */ { 0x9E, ALL, "SERVICE ACTION IN(16)" }, /* 9F M SERVICE ACTION OUT(16) */ { 0x9F, ALL, "SERVICE ACTION OUT(16)" }, /* A0 MMOOO OMMM OMO REPORT LUNS */ { 0xA0, ALL & ~(R | B), "REPORT LUNS" }, /* A1 O BLANK */ { 0xA1, R, "BLANK" }, /* A1 O O ATA COMMAND PASS THROUGH(12) */ { 0xA1, D | B, "ATA COMMAND PASS THROUGH(12)" }, /* A2 OO O O SECURITY PROTOCOL IN */ { 0xA2, D | T | R | V, "SECURITY PROTOCOL IN" }, /* A3 OOO O OOMOOOM MAINTENANCE (IN) */ { 0xA3, ALL & ~(P | R | F), "MAINTENANCE (IN)" }, /* A3 O SEND KEY */ { 0xA3, R, "SEND KEY" }, /* A4 OOO O OOOOOOO MAINTENANCE (OUT) */ { 0xA4, ALL & ~(P | R | F), "MAINTENANCE (OUT)" }, /* A4 O REPORT KEY */ { 0xA4, R, "REPORT KEY" }, /* A5 O O OM MOVE MEDIUM */ { 0xA5, T | W | O | M, "MOVE MEDIUM" }, /* A5 O PLAY AUDIO(12) */ { 0xA5, R, "PLAY AUDIO(12)" }, /* A6 O EXCHANGE MEDIUM */ { 0xA6, M, "EXCHANGE MEDIUM" }, /* A6 O LOAD/UNLOAD C/DVD */ { 0xA6, R, "LOAD/UNLOAD C/DVD" }, /* A7 ZZ O O MOVE MEDIUM ATTACHED */ { 0xA7, D | T | W | O, "MOVE MEDIUM ATTACHED" }, /* A7 O SET READ AHEAD */ { 0xA7, R, "SET READ AHEAD" }, /* A8 O OOO READ(12) */ { 0xA8, D | W | R | O, "READ(12)" }, /* A8 GET MESSAGE(12) */ { 0xA8, C, "GET MESSAGE(12)" }, /* A9 O SERVICE ACTION OUT(12) */ { 0xA9, V, "SERVICE ACTION OUT(12)" }, /* AA O OOO WRITE(12) */ { 0xAA, D | W | R | O, "WRITE(12)" }, /* AA SEND MESSAGE(12) */ { 0xAA, C, "SEND MESSAGE(12)" }, /* AB O O SERVICE ACTION IN(12) */ { 0xAB, R | V, "SERVICE ACTION IN(12)" }, /* AC O ERASE(12) */ { 0xAC, O, "ERASE(12)" }, /* AC O GET PERFORMANCE */ { 0xAC, R, "GET PERFORMANCE" }, /* AD O READ DVD STRUCTURE */ { 0xAD, R, "READ DVD STRUCTURE" }, /* AE O O O WRITE AND VERIFY(12) */ { 0xAE, D | W | O, "WRITE AND VERIFY(12)" }, /* AF O OZO VERIFY(12) */ { 0xAF, D | W | R | O, "VERIFY(12)" }, /* B0 ZZZ SEARCH DATA HIGH(12) */ { 0xB0, W | R | O, "SEARCH DATA HIGH(12)" }, /* B1 ZZZ SEARCH DATA EQUAL(12) */ { 0xB1, W | R | O, "SEARCH DATA EQUAL(12)" }, /* B2 ZZZ SEARCH DATA LOW(12) */ { 0xB2, W | R | O, "SEARCH DATA LOW(12)" }, /* B3 Z OZO SET LIMITS(12) */ { 0xB3, D | W | R | O, "SET LIMITS(12)" }, /* B4 ZZ OZO READ ELEMENT STATUS ATTACHED */ { 0xB4, D | T | W | R | O, "READ ELEMENT STATUS ATTACHED" }, /* B5 OO O O SECURITY PROTOCOL OUT */ { 0xB5, D | T | R | V, "SECURITY PROTOCOL OUT" }, /* B5 O REQUEST VOLUME ELEMENT ADDRESS */ { 0xB5, M, "REQUEST VOLUME ELEMENT ADDRESS" }, /* B6 O SEND VOLUME TAG */ { 0xB6, M, "SEND VOLUME TAG" }, /* B6 O SET STREAMING */ { 0xB6, R, "SET STREAMING" }, /* B7 O O READ DEFECT DATA(12) */ { 0xB7, D | O, "READ DEFECT DATA(12)" }, /* B8 O OZOM READ ELEMENT STATUS */ { 0xB8, T | W | R | O | M, "READ ELEMENT STATUS" }, /* B9 O READ CD MSF */ { 0xB9, R, "READ CD MSF" }, /* BA O O OOMO REDUNDANCY GROUP (IN) */ { 0xBA, D | W | O | M | A | E, "REDUNDANCY GROUP (IN)" }, /* BA O SCAN */ { 0xBA, R, "SCAN" }, /* BB O O OOOO REDUNDANCY GROUP (OUT) */ { 0xBB, D | W | O | M | A | E, "REDUNDANCY GROUP (OUT)" }, /* BB O SET CD SPEED */ { 0xBB, R, "SET CD SPEED" }, /* BC O O OOMO SPARE (IN) */ { 0xBC, D | W | O | M | A | E, "SPARE (IN)" }, /* BD O O OOOO SPARE (OUT) */ { 0xBD, D | W | O | M | A | E, "SPARE (OUT)" }, /* BD O MECHANISM STATUS */ { 0xBD, R, "MECHANISM STATUS" }, /* BE O O OOMO VOLUME SET (IN) */ { 0xBE, D | W | O | M | A | E, "VOLUME SET (IN)" }, /* BE O READ CD */ { 0xBE, R, "READ CD" }, /* BF O O OOOO VOLUME SET (OUT) */ { 0xBF, D | W | O | M | A | E, "VOLUME SET (OUT)" }, /* BF O SEND DVD STRUCTURE */ { 0xBF, R, "SEND DVD STRUCTURE" } }; const char * scsi_op_desc(u_int16_t opcode, struct scsi_inquiry_data *inq_data) { caddr_t match; int i, j; u_int32_t opmask; u_int16_t pd_type; int num_ops[2]; struct op_table_entry *table[2]; int num_tables; /* * If we've got inquiry data, use it to determine what type of * device we're dealing with here. Otherwise, assume direct * access. */ if (inq_data == NULL) { pd_type = T_DIRECT; match = NULL; } else { pd_type = SID_TYPE(inq_data); match = cam_quirkmatch((caddr_t)inq_data, (caddr_t)scsi_op_quirk_table, nitems(scsi_op_quirk_table), sizeof(*scsi_op_quirk_table), scsi_inquiry_match); } if (match != NULL) { table[0] = ((struct scsi_op_quirk_entry *)match)->op_table; num_ops[0] = ((struct scsi_op_quirk_entry *)match)->num_ops; table[1] = scsi_op_codes; num_ops[1] = nitems(scsi_op_codes); num_tables = 2; } else { /* * If this is true, we have a vendor specific opcode that * wasn't covered in the quirk table. */ if ((opcode > 0xBF) || ((opcode > 0x5F) && (opcode < 0x80))) return("Vendor Specific Command"); table[0] = scsi_op_codes; num_ops[0] = nitems(scsi_op_codes); num_tables = 1; } /* RBC is 'Simplified' Direct Access Device */ if (pd_type == T_RBC) pd_type = T_DIRECT; /* * Host managed drives are direct access for the most part. */ if (pd_type == T_ZBC_HM) pd_type = T_DIRECT; /* Map NODEVICE to Direct Access Device to handle REPORT LUNS, etc. */ if (pd_type == T_NODEVICE) pd_type = T_DIRECT; opmask = 1 << pd_type; for (j = 0; j < num_tables; j++) { for (i = 0;i < num_ops[j] && table[j][i].opcode <= opcode; i++){ if ((table[j][i].opcode == opcode) && ((table[j][i].opmask & opmask) != 0)) return(table[j][i].desc); } } /* * If we can't find a match for the command in the table, we just * assume it's a vendor specifc command. */ return("Vendor Specific Command"); } #else /* SCSI_NO_OP_STRINGS */ const char * scsi_op_desc(u_int16_t opcode, struct scsi_inquiry_data *inq_data) { return(""); } #endif #if !defined(SCSI_NO_SENSE_STRINGS) #define SST(asc, ascq, action, desc) \ asc, ascq, action, desc #else const char empty_string[] = ""; #define SST(asc, ascq, action, desc) \ asc, ascq, action, empty_string #endif const struct sense_key_table_entry sense_key_table[] = { { SSD_KEY_NO_SENSE, SS_NOP, "NO SENSE" }, { SSD_KEY_RECOVERED_ERROR, SS_NOP|SSQ_PRINT_SENSE, "RECOVERED ERROR" }, { SSD_KEY_NOT_READY, SS_RDEF, "NOT READY" }, { SSD_KEY_MEDIUM_ERROR, SS_RDEF, "MEDIUM ERROR" }, { SSD_KEY_HARDWARE_ERROR, SS_RDEF, "HARDWARE FAILURE" }, { SSD_KEY_ILLEGAL_REQUEST, SS_FATAL|EINVAL, "ILLEGAL REQUEST" }, { SSD_KEY_UNIT_ATTENTION, SS_FATAL|ENXIO, "UNIT ATTENTION" }, { SSD_KEY_DATA_PROTECT, SS_FATAL|EACCES, "DATA PROTECT" }, { SSD_KEY_BLANK_CHECK, SS_FATAL|ENOSPC, "BLANK CHECK" }, { SSD_KEY_Vendor_Specific, SS_FATAL|EIO, "Vendor Specific" }, { SSD_KEY_COPY_ABORTED, SS_FATAL|EIO, "COPY ABORTED" }, { SSD_KEY_ABORTED_COMMAND, SS_RDEF, "ABORTED COMMAND" }, { SSD_KEY_EQUAL, SS_NOP, "EQUAL" }, { SSD_KEY_VOLUME_OVERFLOW, SS_FATAL|EIO, "VOLUME OVERFLOW" }, { SSD_KEY_MISCOMPARE, SS_NOP, "MISCOMPARE" }, { SSD_KEY_COMPLETED, SS_NOP, "COMPLETED" } }; static struct asc_table_entry quantum_fireball_entries[] = { { SST(0x04, 0x0b, SS_START | SSQ_DECREMENT_COUNT | ENXIO, "Logical unit not ready, initializing cmd. required") } }; static struct asc_table_entry sony_mo_entries[] = { { SST(0x04, 0x00, SS_START | SSQ_DECREMENT_COUNT | ENXIO, "Logical unit not ready, cause not reportable") } }; static struct asc_table_entry hgst_entries[] = { { SST(0x04, 0xF0, SS_RDEF, "Vendor Unique - Logical Unit Not Ready") }, { SST(0x0A, 0x01, SS_RDEF, "Unrecovered Super Certification Log Write Error") }, { SST(0x0A, 0x02, SS_RDEF, "Unrecovered Super Certification Log Read Error") }, { SST(0x15, 0x03, SS_RDEF, "Unrecovered Sector Error") }, { SST(0x3E, 0x04, SS_RDEF, "Unrecovered Self-Test Hard-Cache Test Fail") }, { SST(0x3E, 0x05, SS_RDEF, "Unrecovered Self-Test OTF-Cache Fail") }, { SST(0x40, 0x00, SS_RDEF, "Unrecovered SAT No Buffer Overflow Error") }, { SST(0x40, 0x01, SS_RDEF, "Unrecovered SAT Buffer Overflow Error") }, { SST(0x40, 0x02, SS_RDEF, "Unrecovered SAT No Buffer Overflow With ECS Fault") }, { SST(0x40, 0x03, SS_RDEF, "Unrecovered SAT Buffer Overflow With ECS Fault") }, { SST(0x40, 0x81, SS_RDEF, "DRAM Failure") }, { SST(0x44, 0x0B, SS_RDEF, "Vendor Unique - Internal Target Failure") }, { SST(0x44, 0xF2, SS_RDEF, "Vendor Unique - Internal Target Failure") }, { SST(0x44, 0xF6, SS_RDEF, "Vendor Unique - Internal Target Failure") }, { SST(0x44, 0xF9, SS_RDEF, "Vendor Unique - Internal Target Failure") }, { SST(0x44, 0xFA, SS_RDEF, "Vendor Unique - Internal Target Failure") }, { SST(0x5D, 0x22, SS_RDEF, "Extreme Over-Temperature Warning") }, { SST(0x5D, 0x50, SS_RDEF, "Load/Unload cycle Count Warning") }, { SST(0x81, 0x00, SS_RDEF, "Vendor Unique - Internal Logic Error") }, { SST(0x85, 0x00, SS_RDEF, "Vendor Unique - Internal Key Seed Error") }, }; static struct asc_table_entry seagate_entries[] = { { SST(0x04, 0xF0, SS_RDEF, "Logical Unit Not Ready, super certify in Progress") }, { SST(0x08, 0x86, SS_RDEF, "Write Fault Data Corruption") }, { SST(0x09, 0x0D, SS_RDEF, "Tracking Failure") }, { SST(0x09, 0x0E, SS_RDEF, "ETF Failure") }, { SST(0x0B, 0x5D, SS_RDEF, "Pre-SMART Warning") }, { SST(0x0B, 0x85, SS_RDEF, "5V Voltage Warning") }, { SST(0x0B, 0x8C, SS_RDEF, "12V Voltage Warning") }, { SST(0x0C, 0xFF, SS_RDEF, "Write Error - Too many error recovery revs") }, { SST(0x11, 0xFF, SS_RDEF, "Unrecovered Read Error - Too many error recovery revs") }, { SST(0x19, 0x0E, SS_RDEF, "Fewer than 1/2 defect list copies") }, { SST(0x20, 0xF3, SS_RDEF, "Illegal CDB linked to skip mask cmd") }, { SST(0x24, 0xF0, SS_RDEF, "Illegal byte in CDB, LBA not matching") }, { SST(0x24, 0xF1, SS_RDEF, "Illegal byte in CDB, LEN not matching") }, { SST(0x24, 0xF2, SS_RDEF, "Mask not matching transfer length") }, { SST(0x24, 0xF3, SS_RDEF, "Drive formatted without plist") }, { SST(0x26, 0x95, SS_RDEF, "Invalid Field Parameter - CAP File") }, { SST(0x26, 0x96, SS_RDEF, "Invalid Field Parameter - RAP File") }, { SST(0x26, 0x97, SS_RDEF, "Invalid Field Parameter - TMS Firmware Tag") }, { SST(0x26, 0x98, SS_RDEF, "Invalid Field Parameter - Check Sum") }, { SST(0x26, 0x99, SS_RDEF, "Invalid Field Parameter - Firmware Tag") }, { SST(0x29, 0x08, SS_RDEF, "Write Log Dump data") }, { SST(0x29, 0x09, SS_RDEF, "Write Log Dump data") }, { SST(0x29, 0x0A, SS_RDEF, "Reserved disk space") }, { SST(0x29, 0x0B, SS_RDEF, "SDBP") }, { SST(0x29, 0x0C, SS_RDEF, "SDBP") }, { SST(0x31, 0x91, SS_RDEF, "Format Corrupted World Wide Name (WWN) is Invalid") }, { SST(0x32, 0x03, SS_RDEF, "Defect List - Length exceeds Command Allocated Length") }, { SST(0x33, 0x00, SS_RDEF, "Flash not ready for access") }, { SST(0x3F, 0x70, SS_RDEF, "Invalid RAP block") }, { SST(0x3F, 0x71, SS_RDEF, "RAP/ETF mismatch") }, { SST(0x3F, 0x90, SS_RDEF, "Invalid CAP block") }, { SST(0x3F, 0x91, SS_RDEF, "World Wide Name (WWN) Mismatch") }, { SST(0x40, 0x01, SS_RDEF, "DRAM Parity Error") }, { SST(0x40, 0x02, SS_RDEF, "DRAM Parity Error") }, { SST(0x42, 0x0A, SS_RDEF, "Loopback Test") }, { SST(0x42, 0x0B, SS_RDEF, "Loopback Test") }, { SST(0x44, 0xF2, SS_RDEF, "Compare error during data integrity check") }, { SST(0x44, 0xF6, SS_RDEF, "Unrecoverable error during data integrity check") }, { SST(0x47, 0x80, SS_RDEF, "Fibre Channel Sequence Error") }, { SST(0x4E, 0x01, SS_RDEF, "Information Unit Too Short") }, { SST(0x80, 0x00, SS_RDEF, "General Firmware Error / Command Timeout") }, { SST(0x80, 0x01, SS_RDEF, "Command Timeout") }, { SST(0x80, 0x02, SS_RDEF, "Command Timeout") }, { SST(0x80, 0x80, SS_RDEF, "FC FIFO Error During Read Transfer") }, { SST(0x80, 0x81, SS_RDEF, "FC FIFO Error During Write Transfer") }, { SST(0x80, 0x82, SS_RDEF, "DISC FIFO Error During Read Transfer") }, { SST(0x80, 0x83, SS_RDEF, "DISC FIFO Error During Write Transfer") }, { SST(0x80, 0x84, SS_RDEF, "LBA Seeded LRC Error on Read") }, { SST(0x80, 0x85, SS_RDEF, "LBA Seeded LRC Error on Write") }, { SST(0x80, 0x86, SS_RDEF, "IOEDC Error on Read") }, { SST(0x80, 0x87, SS_RDEF, "IOEDC Error on Write") }, { SST(0x80, 0x88, SS_RDEF, "Host Parity Check Failed") }, { SST(0x80, 0x89, SS_RDEF, "IOEDC error on read detected by formatter") }, { SST(0x80, 0x8A, SS_RDEF, "Host Parity Errors / Host FIFO Initialization Failed") }, { SST(0x80, 0x8B, SS_RDEF, "Host Parity Errors") }, { SST(0x80, 0x8C, SS_RDEF, "Host Parity Errors") }, { SST(0x80, 0x8D, SS_RDEF, "Host Parity Errors") }, { SST(0x81, 0x00, SS_RDEF, "LA Check Failed") }, { SST(0x82, 0x00, SS_RDEF, "Internal client detected insufficient buffer") }, { SST(0x84, 0x00, SS_RDEF, "Scheduled Diagnostic And Repair") }, }; static struct scsi_sense_quirk_entry sense_quirk_table[] = { { /* * XXX The Quantum Fireball ST and SE like to return 0x04 0x0b * when they really should return 0x04 0x02. */ {T_DIRECT, SIP_MEDIA_FIXED, "QUANTUM", "FIREBALL S*", "*"}, /*num_sense_keys*/0, nitems(quantum_fireball_entries), /*sense key entries*/NULL, quantum_fireball_entries }, { /* * This Sony MO drive likes to return 0x04, 0x00 when it * isn't spun up. */ {T_DIRECT, SIP_MEDIA_REMOVABLE, "SONY", "SMO-*", "*"}, /*num_sense_keys*/0, nitems(sony_mo_entries), /*sense key entries*/NULL, sony_mo_entries }, { /* * HGST vendor-specific error codes */ {T_DIRECT, SIP_MEDIA_FIXED, "HGST", "*", "*"}, /*num_sense_keys*/0, nitems(hgst_entries), /*sense key entries*/NULL, hgst_entries }, { /* * SEAGATE vendor-specific error codes */ {T_DIRECT, SIP_MEDIA_FIXED, "SEAGATE", "*", "*"}, /*num_sense_keys*/0, nitems(seagate_entries), /*sense key entries*/NULL, seagate_entries } }; const u_int sense_quirk_table_size = nitems(sense_quirk_table); static struct asc_table_entry asc_table[] = { /* * From: http://www.t10.org/lists/asc-num.txt * Modifications by Jung-uk Kim (jkim@FreeBSD.org) */ /* * File: ASC-NUM.TXT * * SCSI ASC/ASCQ Assignments * Numeric Sorted Listing * as of 8/12/15 * * D - DIRECT ACCESS DEVICE (SBC-2) device column key * .T - SEQUENTIAL ACCESS DEVICE (SSC) ------------------- * . L - PRINTER DEVICE (SSC) blank = reserved * . P - PROCESSOR DEVICE (SPC) not blank = allowed * . .W - WRITE ONCE READ MULTIPLE DEVICE (SBC-2) * . . R - CD DEVICE (MMC) * . . O - OPTICAL MEMORY DEVICE (SBC-2) * . . .M - MEDIA CHANGER DEVICE (SMC) * . . . A - STORAGE ARRAY DEVICE (SCC) * . . . E - ENCLOSURE SERVICES DEVICE (SES) * . . . .B - SIMPLIFIED DIRECT-ACCESS DEVICE (RBC) * . . . . K - OPTICAL CARD READER/WRITER DEVICE (OCRW) * . . . . V - AUTOMATION/DRIVE INTERFACE (ADC) * . . . . .F - OBJECT-BASED STORAGE (OSD) * DTLPWROMAEBKVF * ASC ASCQ Action * Description */ /* DTLPWROMAEBKVF */ { SST(0x00, 0x00, SS_NOP, "No additional sense information") }, /* T */ { SST(0x00, 0x01, SS_RDEF, "Filemark detected") }, /* T */ { SST(0x00, 0x02, SS_RDEF, "End-of-partition/medium detected") }, /* T */ { SST(0x00, 0x03, SS_RDEF, "Setmark detected") }, /* T */ { SST(0x00, 0x04, SS_RDEF, "Beginning-of-partition/medium detected") }, /* TL */ { SST(0x00, 0x05, SS_RDEF, "End-of-data detected") }, /* DTLPWROMAEBKVF */ { SST(0x00, 0x06, SS_RDEF, "I/O process terminated") }, /* T */ { SST(0x00, 0x07, SS_RDEF, /* XXX TBD */ "Programmable early warning detected") }, /* R */ { SST(0x00, 0x11, SS_FATAL | EBUSY, "Audio play operation in progress") }, /* R */ { SST(0x00, 0x12, SS_NOP, "Audio play operation paused") }, /* R */ { SST(0x00, 0x13, SS_NOP, "Audio play operation successfully completed") }, /* R */ { SST(0x00, 0x14, SS_RDEF, "Audio play operation stopped due to error") }, /* R */ { SST(0x00, 0x15, SS_NOP, "No current audio status to return") }, /* DTLPWROMAEBKVF */ { SST(0x00, 0x16, SS_FATAL | EBUSY, "Operation in progress") }, /* DTL WROMAEBKVF */ { SST(0x00, 0x17, SS_RDEF, "Cleaning requested") }, /* T */ { SST(0x00, 0x18, SS_RDEF, /* XXX TBD */ "Erase operation in progress") }, /* T */ { SST(0x00, 0x19, SS_RDEF, /* XXX TBD */ "Locate operation in progress") }, /* T */ { SST(0x00, 0x1A, SS_RDEF, /* XXX TBD */ "Rewind operation in progress") }, /* T */ { SST(0x00, 0x1B, SS_RDEF, /* XXX TBD */ "Set capacity operation in progress") }, /* T */ { SST(0x00, 0x1C, SS_RDEF, /* XXX TBD */ "Verify operation in progress") }, /* DT B */ { SST(0x00, 0x1D, SS_NOP, "ATA pass through information available") }, /* DT R MAEBKV */ { SST(0x00, 0x1E, SS_RDEF, /* XXX TBD */ "Conflicting SA creation request") }, /* DT B */ { SST(0x00, 0x1F, SS_RDEF, /* XXX TBD */ "Logical unit transitioning to another power condition") }, /* DT P B */ { SST(0x00, 0x20, SS_NOP, "Extended copy information available") }, /* D */ { SST(0x00, 0x21, SS_RDEF, /* XXX TBD */ "Atomic command aborted due to ACA") }, /* D W O BK */ { SST(0x01, 0x00, SS_RDEF, "No index/sector signal") }, /* D WRO BK */ { SST(0x02, 0x00, SS_RDEF, "No seek complete") }, /* DTL W O BK */ { SST(0x03, 0x00, SS_RDEF, "Peripheral device write fault") }, /* T */ { SST(0x03, 0x01, SS_RDEF, "No write current") }, /* T */ { SST(0x03, 0x02, SS_RDEF, "Excessive write errors") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x00, SS_RDEF, "Logical unit not ready, cause not reportable") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x01, SS_WAIT | EBUSY, "Logical unit is in process of becoming ready") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x02, SS_START | SSQ_DECREMENT_COUNT | ENXIO, "Logical unit not ready, initializing command required") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x03, SS_FATAL | ENXIO, "Logical unit not ready, manual intervention required") }, /* DTL RO B */ { SST(0x04, 0x04, SS_FATAL | EBUSY, "Logical unit not ready, format in progress") }, /* DT W O A BK F */ { SST(0x04, 0x05, SS_FATAL | EBUSY, "Logical unit not ready, rebuild in progress") }, /* DT W O A BK */ { SST(0x04, 0x06, SS_FATAL | EBUSY, "Logical unit not ready, recalculation in progress") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x07, SS_FATAL | EBUSY, "Logical unit not ready, operation in progress") }, /* R */ { SST(0x04, 0x08, SS_FATAL | EBUSY, "Logical unit not ready, long write in progress") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x09, SS_RDEF, /* XXX TBD */ "Logical unit not ready, self-test in progress") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x0A, SS_WAIT | ENXIO, "Logical unit not accessible, asymmetric access state transition")}, /* DTLPWROMAEBKVF */ { SST(0x04, 0x0B, SS_FATAL | ENXIO, "Logical unit not accessible, target port in standby state") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x0C, SS_FATAL | ENXIO, "Logical unit not accessible, target port in unavailable state") }, /* F */ { SST(0x04, 0x0D, SS_RDEF, /* XXX TBD */ "Logical unit not ready, structure check required") }, /* DTL WR MAEBKVF */ { SST(0x04, 0x0E, SS_RDEF, /* XXX TBD */ "Logical unit not ready, security session in progress") }, /* DT WROM B */ { SST(0x04, 0x10, SS_RDEF, /* XXX TBD */ "Logical unit not ready, auxiliary memory not accessible") }, /* DT WRO AEB VF */ { SST(0x04, 0x11, SS_WAIT | EBUSY, "Logical unit not ready, notify (enable spinup) required") }, /* M V */ { SST(0x04, 0x12, SS_RDEF, /* XXX TBD */ "Logical unit not ready, offline") }, /* DT R MAEBKV */ { SST(0x04, 0x13, SS_RDEF, /* XXX TBD */ "Logical unit not ready, SA creation in progress") }, /* D B */ { SST(0x04, 0x14, SS_RDEF, /* XXX TBD */ "Logical unit not ready, space allocation in progress") }, /* M */ { SST(0x04, 0x15, SS_RDEF, /* XXX TBD */ "Logical unit not ready, robotics disabled") }, /* M */ { SST(0x04, 0x16, SS_RDEF, /* XXX TBD */ "Logical unit not ready, configuration required") }, /* M */ { SST(0x04, 0x17, SS_RDEF, /* XXX TBD */ "Logical unit not ready, calibration required") }, /* M */ { SST(0x04, 0x18, SS_RDEF, /* XXX TBD */ "Logical unit not ready, a door is open") }, /* M */ { SST(0x04, 0x19, SS_RDEF, /* XXX TBD */ "Logical unit not ready, operating in sequential mode") }, /* DT B */ { SST(0x04, 0x1A, SS_RDEF, /* XXX TBD */ "Logical unit not ready, START/STOP UNIT command in progress") }, /* D B */ - { SST(0x04, 0x1B, SS_RDEF, /* XXX TBD */ + { SST(0x04, 0x1B, SS_WAIT | EBUSY, "Logical unit not ready, sanitize in progress") }, /* DT MAEB */ { SST(0x04, 0x1C, SS_START | SSQ_DECREMENT_COUNT | ENXIO, "Logical unit not ready, additional power use not yet granted") }, /* D */ { SST(0x04, 0x1D, SS_RDEF, /* XXX TBD */ "Logical unit not ready, configuration in progress") }, /* D */ { SST(0x04, 0x1E, SS_FATAL | ENXIO, "Logical unit not ready, microcode activation required") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x1F, SS_FATAL | ENXIO, "Logical unit not ready, microcode download required") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x20, SS_RDEF, /* XXX TBD */ "Logical unit not ready, logical unit reset required") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x21, SS_RDEF, /* XXX TBD */ "Logical unit not ready, hard reset required") }, /* DTLPWROMAEBKVF */ { SST(0x04, 0x22, SS_RDEF, /* XXX TBD */ "Logical unit not ready, power cycle required") }, /* DTL WROMAEBKVF */ { SST(0x05, 0x00, SS_RDEF, "Logical unit does not respond to selection") }, /* D WROM BK */ { SST(0x06, 0x00, SS_RDEF, "No reference position found") }, /* DTL WROM BK */ { SST(0x07, 0x00, SS_RDEF, "Multiple peripheral devices selected") }, /* DTL WROMAEBKVF */ { SST(0x08, 0x00, SS_RDEF, "Logical unit communication failure") }, /* DTL WROMAEBKVF */ { SST(0x08, 0x01, SS_RDEF, "Logical unit communication time-out") }, /* DTL WROMAEBKVF */ { SST(0x08, 0x02, SS_RDEF, "Logical unit communication parity error") }, /* DT ROM BK */ { SST(0x08, 0x03, SS_RDEF, "Logical unit communication CRC error (Ultra-DMA/32)") }, /* DTLPWRO K */ { SST(0x08, 0x04, SS_RDEF, /* XXX TBD */ "Unreachable copy target") }, /* DT WRO B */ { SST(0x09, 0x00, SS_RDEF, "Track following error") }, /* WRO K */ { SST(0x09, 0x01, SS_RDEF, "Tracking servo failure") }, /* WRO K */ { SST(0x09, 0x02, SS_RDEF, "Focus servo failure") }, /* WRO */ { SST(0x09, 0x03, SS_RDEF, "Spindle servo failure") }, /* DT WRO B */ { SST(0x09, 0x04, SS_RDEF, "Head select fault") }, /* DT RO B */ { SST(0x09, 0x05, SS_RDEF, "Vibration induced tracking error") }, /* DTLPWROMAEBKVF */ { SST(0x0A, 0x00, SS_FATAL | ENOSPC, "Error log overflow") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x00, SS_NOP | SSQ_PRINT_SENSE, "Warning") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x01, SS_NOP | SSQ_PRINT_SENSE, "Warning - specified temperature exceeded") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x02, SS_NOP | SSQ_PRINT_SENSE, "Warning - enclosure degraded") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x03, SS_NOP | SSQ_PRINT_SENSE, "Warning - background self-test failed") }, /* DTLPWRO AEBKVF */ { SST(0x0B, 0x04, SS_NOP | SSQ_PRINT_SENSE, "Warning - background pre-scan detected medium error") }, /* DTLPWRO AEBKVF */ { SST(0x0B, 0x05, SS_NOP | SSQ_PRINT_SENSE, "Warning - background medium scan detected medium error") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x06, SS_NOP | SSQ_PRINT_SENSE, "Warning - non-volatile cache now volatile") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x07, SS_NOP | SSQ_PRINT_SENSE, "Warning - degraded power to non-volatile cache") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x08, SS_NOP | SSQ_PRINT_SENSE, "Warning - power loss expected") }, /* D */ { SST(0x0B, 0x09, SS_NOP | SSQ_PRINT_SENSE, "Warning - device statistics notification available") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x0A, SS_NOP | SSQ_PRINT_SENSE, "Warning - High critical temperature limit exceeded") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x0B, SS_NOP | SSQ_PRINT_SENSE, "Warning - Low critical temperature limit exceeded") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x0C, SS_NOP | SSQ_PRINT_SENSE, "Warning - High operating temperature limit exceeded") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x0D, SS_NOP | SSQ_PRINT_SENSE, "Warning - Low operating temperature limit exceeded") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x0E, SS_NOP | SSQ_PRINT_SENSE, "Warning - High citical humidity limit exceeded") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x0F, SS_NOP | SSQ_PRINT_SENSE, "Warning - Low citical humidity limit exceeded") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x10, SS_NOP | SSQ_PRINT_SENSE, "Warning - High operating humidity limit exceeded") }, /* DTLPWROMAEBKVF */ { SST(0x0B, 0x11, SS_NOP | SSQ_PRINT_SENSE, "Warning - Low operating humidity limit exceeded") }, /* T R */ { SST(0x0C, 0x00, SS_RDEF, "Write error") }, /* K */ { SST(0x0C, 0x01, SS_NOP | SSQ_PRINT_SENSE, "Write error - recovered with auto reallocation") }, /* D W O BK */ { SST(0x0C, 0x02, SS_RDEF, "Write error - auto reallocation failed") }, /* D W O BK */ { SST(0x0C, 0x03, SS_RDEF, "Write error - recommend reassignment") }, /* DT W O B */ { SST(0x0C, 0x04, SS_RDEF, "Compression check miscompare error") }, /* DT W O B */ { SST(0x0C, 0x05, SS_RDEF, "Data expansion occurred during compression") }, /* DT W O B */ { SST(0x0C, 0x06, SS_RDEF, "Block not compressible") }, /* R */ { SST(0x0C, 0x07, SS_RDEF, "Write error - recovery needed") }, /* R */ { SST(0x0C, 0x08, SS_RDEF, "Write error - recovery failed") }, /* R */ { SST(0x0C, 0x09, SS_RDEF, "Write error - loss of streaming") }, /* R */ { SST(0x0C, 0x0A, SS_RDEF, "Write error - padding blocks added") }, /* DT WROM B */ { SST(0x0C, 0x0B, SS_RDEF, /* XXX TBD */ "Auxiliary memory write error") }, /* DTLPWRO AEBKVF */ { SST(0x0C, 0x0C, SS_RDEF, /* XXX TBD */ "Write error - unexpected unsolicited data") }, /* DTLPWRO AEBKVF */ { SST(0x0C, 0x0D, SS_RDEF, /* XXX TBD */ "Write error - not enough unsolicited data") }, /* DT W O BK */ { SST(0x0C, 0x0E, SS_RDEF, /* XXX TBD */ "Multiple write errors") }, /* R */ { SST(0x0C, 0x0F, SS_RDEF, /* XXX TBD */ "Defects in error window") }, /* D */ { SST(0x0C, 0x10, SS_RDEF, /* XXX TBD */ "Incomplete multiple atomic write operations") }, /* D */ { SST(0x0C, 0x11, SS_RDEF, /* XXX TBD */ "Write error - recovery scan needed") }, /* D */ { SST(0x0C, 0x12, SS_RDEF, /* XXX TBD */ "Write error - insufficient zone resources") }, /* DTLPWRO A K */ { SST(0x0D, 0x00, SS_RDEF, /* XXX TBD */ "Error detected by third party temporary initiator") }, /* DTLPWRO A K */ { SST(0x0D, 0x01, SS_RDEF, /* XXX TBD */ "Third party device failure") }, /* DTLPWRO A K */ { SST(0x0D, 0x02, SS_RDEF, /* XXX TBD */ "Copy target device not reachable") }, /* DTLPWRO A K */ { SST(0x0D, 0x03, SS_RDEF, /* XXX TBD */ "Incorrect copy target device type") }, /* DTLPWRO A K */ { SST(0x0D, 0x04, SS_RDEF, /* XXX TBD */ "Copy target device data underrun") }, /* DTLPWRO A K */ { SST(0x0D, 0x05, SS_RDEF, /* XXX TBD */ "Copy target device data overrun") }, /* DT PWROMAEBK F */ { SST(0x0E, 0x00, SS_RDEF, /* XXX TBD */ "Invalid information unit") }, /* DT PWROMAEBK F */ { SST(0x0E, 0x01, SS_RDEF, /* XXX TBD */ "Information unit too short") }, /* DT PWROMAEBK F */ { SST(0x0E, 0x02, SS_RDEF, /* XXX TBD */ "Information unit too long") }, /* DT P R MAEBK F */ { SST(0x0E, 0x03, SS_FATAL | EINVAL, "Invalid field in command information unit") }, /* D W O BK */ { SST(0x10, 0x00, SS_RDEF, "ID CRC or ECC error") }, /* DT W O */ { SST(0x10, 0x01, SS_RDEF, /* XXX TBD */ "Logical block guard check failed") }, /* DT W O */ { SST(0x10, 0x02, SS_RDEF, /* XXX TBD */ "Logical block application tag check failed") }, /* DT W O */ { SST(0x10, 0x03, SS_RDEF, /* XXX TBD */ "Logical block reference tag check failed") }, /* T */ { SST(0x10, 0x04, SS_RDEF, /* XXX TBD */ "Logical block protection error on recovered buffer data") }, /* T */ { SST(0x10, 0x05, SS_RDEF, /* XXX TBD */ "Logical block protection method error") }, /* DT WRO BK */ { SST(0x11, 0x00, SS_FATAL|EIO, "Unrecovered read error") }, /* DT WRO BK */ { SST(0x11, 0x01, SS_FATAL|EIO, "Read retries exhausted") }, /* DT WRO BK */ { SST(0x11, 0x02, SS_FATAL|EIO, "Error too long to correct") }, /* DT W O BK */ { SST(0x11, 0x03, SS_FATAL|EIO, "Multiple read errors") }, /* D W O BK */ { SST(0x11, 0x04, SS_FATAL|EIO, "Unrecovered read error - auto reallocate failed") }, /* WRO B */ { SST(0x11, 0x05, SS_FATAL|EIO, "L-EC uncorrectable error") }, /* WRO B */ { SST(0x11, 0x06, SS_FATAL|EIO, "CIRC unrecovered error") }, /* W O B */ { SST(0x11, 0x07, SS_RDEF, "Data re-synchronization error") }, /* T */ { SST(0x11, 0x08, SS_RDEF, "Incomplete block read") }, /* T */ { SST(0x11, 0x09, SS_RDEF, "No gap found") }, /* DT O BK */ { SST(0x11, 0x0A, SS_RDEF, "Miscorrected error") }, /* D W O BK */ { SST(0x11, 0x0B, SS_FATAL|EIO, "Unrecovered read error - recommend reassignment") }, /* D W O BK */ { SST(0x11, 0x0C, SS_FATAL|EIO, "Unrecovered read error - recommend rewrite the data") }, /* DT WRO B */ { SST(0x11, 0x0D, SS_RDEF, "De-compression CRC error") }, /* DT WRO B */ { SST(0x11, 0x0E, SS_RDEF, "Cannot decompress using declared algorithm") }, /* R */ { SST(0x11, 0x0F, SS_RDEF, "Error reading UPC/EAN number") }, /* R */ { SST(0x11, 0x10, SS_RDEF, "Error reading ISRC number") }, /* R */ { SST(0x11, 0x11, SS_RDEF, "Read error - loss of streaming") }, /* DT WROM B */ { SST(0x11, 0x12, SS_RDEF, /* XXX TBD */ "Auxiliary memory read error") }, /* DTLPWRO AEBKVF */ { SST(0x11, 0x13, SS_RDEF, /* XXX TBD */ "Read error - failed retransmission request") }, /* D */ { SST(0x11, 0x14, SS_RDEF, /* XXX TBD */ "Read error - LBA marked bad by application client") }, /* D */ - { SST(0x11, 0x15, SS_RDEF, /* XXX TBD */ + { SST(0x11, 0x15, SS_FATAL | EIO, "Write after sanitize required") }, /* D W O BK */ { SST(0x12, 0x00, SS_RDEF, "Address mark not found for ID field") }, /* D W O BK */ { SST(0x13, 0x00, SS_RDEF, "Address mark not found for data field") }, /* DTL WRO BK */ { SST(0x14, 0x00, SS_RDEF, "Recorded entity not found") }, /* DT WRO BK */ { SST(0x14, 0x01, SS_RDEF, "Record not found") }, /* T */ { SST(0x14, 0x02, SS_RDEF, "Filemark or setmark not found") }, /* T */ { SST(0x14, 0x03, SS_RDEF, "End-of-data not found") }, /* T */ { SST(0x14, 0x04, SS_RDEF, "Block sequence error") }, /* DT W O BK */ { SST(0x14, 0x05, SS_RDEF, "Record not found - recommend reassignment") }, /* DT W O BK */ { SST(0x14, 0x06, SS_RDEF, "Record not found - data auto-reallocated") }, /* T */ { SST(0x14, 0x07, SS_RDEF, /* XXX TBD */ "Locate operation failure") }, /* DTL WROM BK */ { SST(0x15, 0x00, SS_RDEF, "Random positioning error") }, /* DTL WROM BK */ { SST(0x15, 0x01, SS_RDEF, "Mechanical positioning error") }, /* DT WRO BK */ { SST(0x15, 0x02, SS_RDEF, "Positioning error detected by read of medium") }, /* D W O BK */ { SST(0x16, 0x00, SS_RDEF, "Data synchronization mark error") }, /* D W O BK */ { SST(0x16, 0x01, SS_RDEF, "Data sync error - data rewritten") }, /* D W O BK */ { SST(0x16, 0x02, SS_RDEF, "Data sync error - recommend rewrite") }, /* D W O BK */ { SST(0x16, 0x03, SS_NOP | SSQ_PRINT_SENSE, "Data sync error - data auto-reallocated") }, /* D W O BK */ { SST(0x16, 0x04, SS_RDEF, "Data sync error - recommend reassignment") }, /* DT WRO BK */ { SST(0x17, 0x00, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with no error correction applied") }, /* DT WRO BK */ { SST(0x17, 0x01, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with retries") }, /* DT WRO BK */ { SST(0x17, 0x02, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with positive head offset") }, /* DT WRO BK */ { SST(0x17, 0x03, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with negative head offset") }, /* WRO B */ { SST(0x17, 0x04, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with retries and/or CIRC applied") }, /* D WRO BK */ { SST(0x17, 0x05, SS_NOP | SSQ_PRINT_SENSE, "Recovered data using previous sector ID") }, /* D W O BK */ { SST(0x17, 0x06, SS_NOP | SSQ_PRINT_SENSE, "Recovered data without ECC - data auto-reallocated") }, /* D WRO BK */ { SST(0x17, 0x07, SS_NOP | SSQ_PRINT_SENSE, "Recovered data without ECC - recommend reassignment") }, /* D WRO BK */ { SST(0x17, 0x08, SS_NOP | SSQ_PRINT_SENSE, "Recovered data without ECC - recommend rewrite") }, /* D WRO BK */ { SST(0x17, 0x09, SS_NOP | SSQ_PRINT_SENSE, "Recovered data without ECC - data rewritten") }, /* DT WRO BK */ { SST(0x18, 0x00, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with error correction applied") }, /* D WRO BK */ { SST(0x18, 0x01, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with error corr. & retries applied") }, /* D WRO BK */ { SST(0x18, 0x02, SS_NOP | SSQ_PRINT_SENSE, "Recovered data - data auto-reallocated") }, /* R */ { SST(0x18, 0x03, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with CIRC") }, /* R */ { SST(0x18, 0x04, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with L-EC") }, /* D WRO BK */ { SST(0x18, 0x05, SS_NOP | SSQ_PRINT_SENSE, "Recovered data - recommend reassignment") }, /* D WRO BK */ { SST(0x18, 0x06, SS_NOP | SSQ_PRINT_SENSE, "Recovered data - recommend rewrite") }, /* D W O BK */ { SST(0x18, 0x07, SS_NOP | SSQ_PRINT_SENSE, "Recovered data with ECC - data rewritten") }, /* R */ { SST(0x18, 0x08, SS_RDEF, /* XXX TBD */ "Recovered data with linking") }, /* D O K */ { SST(0x19, 0x00, SS_RDEF, "Defect list error") }, /* D O K */ { SST(0x19, 0x01, SS_RDEF, "Defect list not available") }, /* D O K */ { SST(0x19, 0x02, SS_RDEF, "Defect list error in primary list") }, /* D O K */ { SST(0x19, 0x03, SS_RDEF, "Defect list error in grown list") }, /* DTLPWROMAEBKVF */ { SST(0x1A, 0x00, SS_RDEF, "Parameter list length error") }, /* DTLPWROMAEBKVF */ { SST(0x1B, 0x00, SS_RDEF, "Synchronous data transfer error") }, /* D O BK */ { SST(0x1C, 0x00, SS_RDEF, "Defect list not found") }, /* D O BK */ { SST(0x1C, 0x01, SS_RDEF, "Primary defect list not found") }, /* D O BK */ { SST(0x1C, 0x02, SS_RDEF, "Grown defect list not found") }, /* DT WRO BK */ { SST(0x1D, 0x00, SS_FATAL, "Miscompare during verify operation") }, /* D B */ { SST(0x1D, 0x01, SS_RDEF, /* XXX TBD */ "Miscomparable verify of unmapped LBA") }, /* D W O BK */ { SST(0x1E, 0x00, SS_NOP | SSQ_PRINT_SENSE, "Recovered ID with ECC correction") }, /* D O K */ { SST(0x1F, 0x00, SS_RDEF, "Partial defect list transfer") }, /* DTLPWROMAEBKVF */ { SST(0x20, 0x00, SS_FATAL | EINVAL, "Invalid command operation code") }, /* DT PWROMAEBK */ { SST(0x20, 0x01, SS_RDEF, /* XXX TBD */ "Access denied - initiator pending-enrolled") }, /* DT PWROMAEBK */ { SST(0x20, 0x02, SS_FATAL | EPERM, "Access denied - no access rights") }, /* DT PWROMAEBK */ { SST(0x20, 0x03, SS_RDEF, /* XXX TBD */ "Access denied - invalid mgmt ID key") }, /* T */ { SST(0x20, 0x04, SS_RDEF, /* XXX TBD */ "Illegal command while in write capable state") }, /* T */ { SST(0x20, 0x05, SS_RDEF, /* XXX TBD */ "Obsolete") }, /* T */ { SST(0x20, 0x06, SS_RDEF, /* XXX TBD */ "Illegal command while in explicit address mode") }, /* T */ { SST(0x20, 0x07, SS_RDEF, /* XXX TBD */ "Illegal command while in implicit address mode") }, /* DT PWROMAEBK */ { SST(0x20, 0x08, SS_RDEF, /* XXX TBD */ "Access denied - enrollment conflict") }, /* DT PWROMAEBK */ { SST(0x20, 0x09, SS_RDEF, /* XXX TBD */ "Access denied - invalid LU identifier") }, /* DT PWROMAEBK */ { SST(0x20, 0x0A, SS_RDEF, /* XXX TBD */ "Access denied - invalid proxy token") }, /* DT PWROMAEBK */ { SST(0x20, 0x0B, SS_RDEF, /* XXX TBD */ "Access denied - ACL LUN conflict") }, /* T */ { SST(0x20, 0x0C, SS_FATAL | EINVAL, "Illegal command when not in append-only mode") }, /* DT WRO BK */ { SST(0x21, 0x00, SS_FATAL | EINVAL, "Logical block address out of range") }, /* DT WROM BK */ { SST(0x21, 0x01, SS_FATAL | EINVAL, "Invalid element address") }, /* R */ { SST(0x21, 0x02, SS_RDEF, /* XXX TBD */ "Invalid address for write") }, /* R */ { SST(0x21, 0x03, SS_RDEF, /* XXX TBD */ "Invalid write crossing layer jump") }, /* D */ { SST(0x21, 0x04, SS_RDEF, /* XXX TBD */ "Unaligned write command") }, /* D */ { SST(0x21, 0x05, SS_RDEF, /* XXX TBD */ "Write boundary violation") }, /* D */ { SST(0x21, 0x06, SS_RDEF, /* XXX TBD */ "Attempt to read invalid data") }, /* D */ { SST(0x21, 0x07, SS_RDEF, /* XXX TBD */ "Read boundary violation") }, /* D */ { SST(0x22, 0x00, SS_FATAL | EINVAL, "Illegal function (use 20 00, 24 00, or 26 00)") }, /* DT P B */ { SST(0x23, 0x00, SS_FATAL | EINVAL, "Invalid token operation, cause not reportable") }, /* DT P B */ { SST(0x23, 0x01, SS_FATAL | EINVAL, "Invalid token operation, unsupported token type") }, /* DT P B */ { SST(0x23, 0x02, SS_FATAL | EINVAL, "Invalid token operation, remote token usage not supported") }, /* DT P B */ { SST(0x23, 0x03, SS_FATAL | EINVAL, "Invalid token operation, remote ROD token creation not supported") }, /* DT P B */ { SST(0x23, 0x04, SS_FATAL | EINVAL, "Invalid token operation, token unknown") }, /* DT P B */ { SST(0x23, 0x05, SS_FATAL | EINVAL, "Invalid token operation, token corrupt") }, /* DT P B */ { SST(0x23, 0x06, SS_FATAL | EINVAL, "Invalid token operation, token revoked") }, /* DT P B */ { SST(0x23, 0x07, SS_FATAL | EINVAL, "Invalid token operation, token expired") }, /* DT P B */ { SST(0x23, 0x08, SS_FATAL | EINVAL, "Invalid token operation, token cancelled") }, /* DT P B */ { SST(0x23, 0x09, SS_FATAL | EINVAL, "Invalid token operation, token deleted") }, /* DT P B */ { SST(0x23, 0x0A, SS_FATAL | EINVAL, "Invalid token operation, invalid token length") }, /* DTLPWROMAEBKVF */ { SST(0x24, 0x00, SS_FATAL | EINVAL, "Invalid field in CDB") }, /* DTLPWRO AEBKVF */ { SST(0x24, 0x01, SS_RDEF, /* XXX TBD */ "CDB decryption error") }, /* T */ { SST(0x24, 0x02, SS_RDEF, /* XXX TBD */ "Obsolete") }, /* T */ { SST(0x24, 0x03, SS_RDEF, /* XXX TBD */ "Obsolete") }, /* F */ { SST(0x24, 0x04, SS_RDEF, /* XXX TBD */ "Security audit value frozen") }, /* F */ { SST(0x24, 0x05, SS_RDEF, /* XXX TBD */ "Security working key frozen") }, /* F */ { SST(0x24, 0x06, SS_RDEF, /* XXX TBD */ "NONCE not unique") }, /* F */ { SST(0x24, 0x07, SS_RDEF, /* XXX TBD */ "NONCE timestamp out of range") }, /* DT R MAEBKV */ { SST(0x24, 0x08, SS_RDEF, /* XXX TBD */ "Invalid XCDB") }, /* DTLPWROMAEBKVF */ { SST(0x25, 0x00, SS_FATAL | ENXIO | SSQ_LOST, "Logical unit not supported") }, /* DTLPWROMAEBKVF */ { SST(0x26, 0x00, SS_FATAL | EINVAL, "Invalid field in parameter list") }, /* DTLPWROMAEBKVF */ { SST(0x26, 0x01, SS_FATAL | EINVAL, "Parameter not supported") }, /* DTLPWROMAEBKVF */ { SST(0x26, 0x02, SS_FATAL | EINVAL, "Parameter value invalid") }, /* DTLPWROMAE K */ { SST(0x26, 0x03, SS_FATAL | EINVAL, "Threshold parameters not supported") }, /* DTLPWROMAEBKVF */ { SST(0x26, 0x04, SS_FATAL | EINVAL, "Invalid release of persistent reservation") }, /* DTLPWRO A BK */ { SST(0x26, 0x05, SS_RDEF, /* XXX TBD */ "Data decryption error") }, /* DTLPWRO K */ { SST(0x26, 0x06, SS_FATAL | EINVAL, "Too many target descriptors") }, /* DTLPWRO K */ { SST(0x26, 0x07, SS_FATAL | EINVAL, "Unsupported target descriptor type code") }, /* DTLPWRO K */ { SST(0x26, 0x08, SS_FATAL | EINVAL, "Too many segment descriptors") }, /* DTLPWRO K */ { SST(0x26, 0x09, SS_FATAL | EINVAL, "Unsupported segment descriptor type code") }, /* DTLPWRO K */ { SST(0x26, 0x0A, SS_FATAL | EINVAL, "Unexpected inexact segment") }, /* DTLPWRO K */ { SST(0x26, 0x0B, SS_FATAL | EINVAL, "Inline data length exceeded") }, /* DTLPWRO K */ { SST(0x26, 0x0C, SS_FATAL | EINVAL, "Invalid operation for copy source or destination") }, /* DTLPWRO K */ { SST(0x26, 0x0D, SS_FATAL | EINVAL, "Copy segment granularity violation") }, /* DT PWROMAEBK */ { SST(0x26, 0x0E, SS_RDEF, /* XXX TBD */ "Invalid parameter while port is enabled") }, /* F */ { SST(0x26, 0x0F, SS_RDEF, /* XXX TBD */ "Invalid data-out buffer integrity check value") }, /* T */ { SST(0x26, 0x10, SS_RDEF, /* XXX TBD */ "Data decryption key fail limit reached") }, /* T */ { SST(0x26, 0x11, SS_RDEF, /* XXX TBD */ "Incomplete key-associated data set") }, /* T */ { SST(0x26, 0x12, SS_RDEF, /* XXX TBD */ "Vendor specific key reference not found") }, /* D */ { SST(0x26, 0x13, SS_RDEF, /* XXX TBD */ "Application tag mode page is invalid") }, /* DT WRO BK */ { SST(0x27, 0x00, SS_FATAL | EACCES, "Write protected") }, /* DT WRO BK */ { SST(0x27, 0x01, SS_FATAL | EACCES, "Hardware write protected") }, /* DT WRO BK */ { SST(0x27, 0x02, SS_FATAL | EACCES, "Logical unit software write protected") }, /* T R */ { SST(0x27, 0x03, SS_FATAL | EACCES, "Associated write protect") }, /* T R */ { SST(0x27, 0x04, SS_FATAL | EACCES, "Persistent write protect") }, /* T R */ { SST(0x27, 0x05, SS_FATAL | EACCES, "Permanent write protect") }, /* R F */ { SST(0x27, 0x06, SS_RDEF, /* XXX TBD */ "Conditional write protect") }, /* D B */ { SST(0x27, 0x07, SS_FATAL | ENOSPC, "Space allocation failed write protect") }, /* D */ { SST(0x27, 0x08, SS_FATAL | EACCES, "Zone is read only") }, /* DTLPWROMAEBKVF */ { SST(0x28, 0x00, SS_FATAL | ENXIO, "Not ready to ready change, medium may have changed") }, /* DT WROM B */ { SST(0x28, 0x01, SS_FATAL | ENXIO, "Import or export element accessed") }, /* R */ { SST(0x28, 0x02, SS_RDEF, /* XXX TBD */ "Format-layer may have changed") }, /* M */ { SST(0x28, 0x03, SS_RDEF, /* XXX TBD */ "Import/export element accessed, medium changed") }, /* * XXX JGibbs - All of these should use the same errno, but I don't * think ENXIO is the correct choice. Should we borrow from * the networking errnos? ECONNRESET anyone? */ /* DTLPWROMAEBKVF */ { SST(0x29, 0x00, SS_FATAL | ENXIO, "Power on, reset, or bus device reset occurred") }, /* DTLPWROMAEBKVF */ { SST(0x29, 0x01, SS_RDEF, "Power on occurred") }, /* DTLPWROMAEBKVF */ { SST(0x29, 0x02, SS_RDEF, "SCSI bus reset occurred") }, /* DTLPWROMAEBKVF */ { SST(0x29, 0x03, SS_RDEF, "Bus device reset function occurred") }, /* DTLPWROMAEBKVF */ { SST(0x29, 0x04, SS_RDEF, "Device internal reset") }, /* DTLPWROMAEBKVF */ { SST(0x29, 0x05, SS_RDEF, "Transceiver mode changed to single-ended") }, /* DTLPWROMAEBKVF */ { SST(0x29, 0x06, SS_RDEF, "Transceiver mode changed to LVD") }, /* DTLPWROMAEBKVF */ { SST(0x29, 0x07, SS_RDEF, /* XXX TBD */ "I_T nexus loss occurred") }, /* DTL WROMAEBKVF */ { SST(0x2A, 0x00, SS_RDEF, "Parameters changed") }, /* DTL WROMAEBKVF */ { SST(0x2A, 0x01, SS_RDEF, "Mode parameters changed") }, /* DTL WROMAE K */ { SST(0x2A, 0x02, SS_RDEF, "Log parameters changed") }, /* DTLPWROMAE K */ { SST(0x2A, 0x03, SS_RDEF, "Reservations preempted") }, /* DTLPWROMAE */ { SST(0x2A, 0x04, SS_RDEF, /* XXX TBD */ "Reservations released") }, /* DTLPWROMAE */ { SST(0x2A, 0x05, SS_RDEF, /* XXX TBD */ "Registrations preempted") }, /* DTLPWROMAEBKVF */ { SST(0x2A, 0x06, SS_RDEF, /* XXX TBD */ "Asymmetric access state changed") }, /* DTLPWROMAEBKVF */ { SST(0x2A, 0x07, SS_RDEF, /* XXX TBD */ "Implicit asymmetric access state transition failed") }, /* DT WROMAEBKVF */ { SST(0x2A, 0x08, SS_RDEF, /* XXX TBD */ "Priority changed") }, /* D */ { SST(0x2A, 0x09, SS_RDEF, /* XXX TBD */ "Capacity data has changed") }, /* DT */ { SST(0x2A, 0x0A, SS_RDEF, /* XXX TBD */ "Error history I_T nexus cleared") }, /* DT */ { SST(0x2A, 0x0B, SS_RDEF, /* XXX TBD */ "Error history snapshot released") }, /* F */ { SST(0x2A, 0x0C, SS_RDEF, /* XXX TBD */ "Error recovery attributes have changed") }, /* T */ { SST(0x2A, 0x0D, SS_RDEF, /* XXX TBD */ "Data encryption capabilities changed") }, /* DT M E V */ { SST(0x2A, 0x10, SS_RDEF, /* XXX TBD */ "Timestamp changed") }, /* T */ { SST(0x2A, 0x11, SS_RDEF, /* XXX TBD */ "Data encryption parameters changed by another I_T nexus") }, /* T */ { SST(0x2A, 0x12, SS_RDEF, /* XXX TBD */ "Data encryption parameters changed by vendor specific event") }, /* T */ { SST(0x2A, 0x13, SS_RDEF, /* XXX TBD */ "Data encryption key instance counter has changed") }, /* DT R MAEBKV */ { SST(0x2A, 0x14, SS_RDEF, /* XXX TBD */ "SA creation capabilities data has changed") }, /* T M V */ { SST(0x2A, 0x15, SS_RDEF, /* XXX TBD */ "Medium removal prevention preempted") }, /* DTLPWRO K */ { SST(0x2B, 0x00, SS_RDEF, "Copy cannot execute since host cannot disconnect") }, /* DTLPWROMAEBKVF */ { SST(0x2C, 0x00, SS_RDEF, "Command sequence error") }, /* */ { SST(0x2C, 0x01, SS_RDEF, "Too many windows specified") }, /* */ { SST(0x2C, 0x02, SS_RDEF, "Invalid combination of windows specified") }, /* R */ { SST(0x2C, 0x03, SS_RDEF, "Current program area is not empty") }, /* R */ { SST(0x2C, 0x04, SS_RDEF, "Current program area is empty") }, /* B */ { SST(0x2C, 0x05, SS_RDEF, /* XXX TBD */ "Illegal power condition request") }, /* R */ { SST(0x2C, 0x06, SS_RDEF, /* XXX TBD */ "Persistent prevent conflict") }, /* DTLPWROMAEBKVF */ { SST(0x2C, 0x07, SS_RDEF, /* XXX TBD */ "Previous busy status") }, /* DTLPWROMAEBKVF */ { SST(0x2C, 0x08, SS_RDEF, /* XXX TBD */ "Previous task set full status") }, /* DTLPWROM EBKVF */ { SST(0x2C, 0x09, SS_RDEF, /* XXX TBD */ "Previous reservation conflict status") }, /* F */ { SST(0x2C, 0x0A, SS_RDEF, /* XXX TBD */ "Partition or collection contains user objects") }, /* T */ { SST(0x2C, 0x0B, SS_RDEF, /* XXX TBD */ "Not reserved") }, /* D */ { SST(0x2C, 0x0C, SS_RDEF, /* XXX TBD */ "ORWRITE generation does not match") }, /* D */ { SST(0x2C, 0x0D, SS_RDEF, /* XXX TBD */ "Reset write pointer not allowed") }, /* D */ { SST(0x2C, 0x0E, SS_RDEF, /* XXX TBD */ "Zone is offline") }, /* D */ { SST(0x2C, 0x0F, SS_RDEF, /* XXX TBD */ "Stream not open") }, /* D */ { SST(0x2C, 0x10, SS_RDEF, /* XXX TBD */ "Unwritten data in zone") }, /* T */ { SST(0x2D, 0x00, SS_RDEF, "Overwrite error on update in place") }, /* R */ { SST(0x2E, 0x00, SS_RDEF, /* XXX TBD */ "Insufficient time for operation") }, /* D */ { SST(0x2E, 0x01, SS_RDEF, /* XXX TBD */ "Command timeout before processing") }, /* D */ { SST(0x2E, 0x02, SS_RDEF, /* XXX TBD */ "Command timeout during processing") }, /* D */ { SST(0x2E, 0x03, SS_RDEF, /* XXX TBD */ "Command timeout during processing due to error recovery") }, /* DTLPWROMAEBKVF */ { SST(0x2F, 0x00, SS_RDEF, "Commands cleared by another initiator") }, /* D */ { SST(0x2F, 0x01, SS_RDEF, /* XXX TBD */ "Commands cleared by power loss notification") }, /* DTLPWROMAEBKVF */ { SST(0x2F, 0x02, SS_RDEF, /* XXX TBD */ "Commands cleared by device server") }, /* DTLPWROMAEBKVF */ { SST(0x2F, 0x03, SS_RDEF, /* XXX TBD */ "Some commands cleared by queuing layer event") }, /* DT WROM BK */ { SST(0x30, 0x00, SS_RDEF, "Incompatible medium installed") }, /* DT WRO BK */ { SST(0x30, 0x01, SS_RDEF, "Cannot read medium - unknown format") }, /* DT WRO BK */ { SST(0x30, 0x02, SS_RDEF, "Cannot read medium - incompatible format") }, /* DT R K */ { SST(0x30, 0x03, SS_RDEF, "Cleaning cartridge installed") }, /* DT WRO BK */ { SST(0x30, 0x04, SS_RDEF, "Cannot write medium - unknown format") }, /* DT WRO BK */ { SST(0x30, 0x05, SS_RDEF, "Cannot write medium - incompatible format") }, /* DT WRO B */ { SST(0x30, 0x06, SS_RDEF, "Cannot format medium - incompatible medium") }, /* DTL WROMAEBKVF */ { SST(0x30, 0x07, SS_RDEF, "Cleaning failure") }, /* R */ { SST(0x30, 0x08, SS_RDEF, "Cannot write - application code mismatch") }, /* R */ { SST(0x30, 0x09, SS_RDEF, "Current session not fixated for append") }, /* DT WRO AEBK */ { SST(0x30, 0x0A, SS_RDEF, /* XXX TBD */ "Cleaning request rejected") }, /* T */ { SST(0x30, 0x0C, SS_RDEF, /* XXX TBD */ "WORM medium - overwrite attempted") }, /* T */ { SST(0x30, 0x0D, SS_RDEF, /* XXX TBD */ "WORM medium - integrity check") }, /* R */ { SST(0x30, 0x10, SS_RDEF, /* XXX TBD */ "Medium not formatted") }, /* M */ { SST(0x30, 0x11, SS_RDEF, /* XXX TBD */ "Incompatible volume type") }, /* M */ { SST(0x30, 0x12, SS_RDEF, /* XXX TBD */ "Incompatible volume qualifier") }, /* M */ { SST(0x30, 0x13, SS_RDEF, /* XXX TBD */ "Cleaning volume expired") }, /* DT WRO BK */ { SST(0x31, 0x00, SS_RDEF, "Medium format corrupted") }, /* D L RO B */ { SST(0x31, 0x01, SS_RDEF, "Format command failed") }, /* R */ { SST(0x31, 0x02, SS_RDEF, /* XXX TBD */ "Zoned formatting failed due to spare linking") }, /* D B */ - { SST(0x31, 0x03, SS_RDEF, /* XXX TBD */ + { SST(0x31, 0x03, SS_FATAL | EIO, "SANITIZE command failed") }, /* D W O BK */ { SST(0x32, 0x00, SS_RDEF, "No defect spare location available") }, /* D W O BK */ { SST(0x32, 0x01, SS_RDEF, "Defect list update failure") }, /* T */ { SST(0x33, 0x00, SS_RDEF, "Tape length error") }, /* DTLPWROMAEBKVF */ { SST(0x34, 0x00, SS_RDEF, "Enclosure failure") }, /* DTLPWROMAEBKVF */ { SST(0x35, 0x00, SS_RDEF, "Enclosure services failure") }, /* DTLPWROMAEBKVF */ { SST(0x35, 0x01, SS_RDEF, "Unsupported enclosure function") }, /* DTLPWROMAEBKVF */ { SST(0x35, 0x02, SS_RDEF, "Enclosure services unavailable") }, /* DTLPWROMAEBKVF */ { SST(0x35, 0x03, SS_RDEF, "Enclosure services transfer failure") }, /* DTLPWROMAEBKVF */ { SST(0x35, 0x04, SS_RDEF, "Enclosure services transfer refused") }, /* DTL WROMAEBKVF */ { SST(0x35, 0x05, SS_RDEF, /* XXX TBD */ "Enclosure services checksum error") }, /* L */ { SST(0x36, 0x00, SS_RDEF, "Ribbon, ink, or toner failure") }, /* DTL WROMAEBKVF */ { SST(0x37, 0x00, SS_RDEF, "Rounded parameter") }, /* B */ { SST(0x38, 0x00, SS_RDEF, /* XXX TBD */ "Event status notification") }, /* B */ { SST(0x38, 0x02, SS_RDEF, /* XXX TBD */ "ESN - power management class event") }, /* B */ { SST(0x38, 0x04, SS_RDEF, /* XXX TBD */ "ESN - media class event") }, /* B */ { SST(0x38, 0x06, SS_RDEF, /* XXX TBD */ "ESN - device busy class event") }, /* D */ { SST(0x38, 0x07, SS_RDEF, /* XXX TBD */ "Thin provisioning soft threshold reached") }, /* DTL WROMAE K */ { SST(0x39, 0x00, SS_RDEF, "Saving parameters not supported") }, /* DTL WROM BK */ { SST(0x3A, 0x00, SS_FATAL | ENXIO, "Medium not present") }, /* DT WROM BK */ { SST(0x3A, 0x01, SS_FATAL | ENXIO, "Medium not present - tray closed") }, /* DT WROM BK */ { SST(0x3A, 0x02, SS_FATAL | ENXIO, "Medium not present - tray open") }, /* DT WROM B */ { SST(0x3A, 0x03, SS_RDEF, /* XXX TBD */ "Medium not present - loadable") }, /* DT WRO B */ { SST(0x3A, 0x04, SS_RDEF, /* XXX TBD */ "Medium not present - medium auxiliary memory accessible") }, /* TL */ { SST(0x3B, 0x00, SS_RDEF, "Sequential positioning error") }, /* T */ { SST(0x3B, 0x01, SS_RDEF, "Tape position error at beginning-of-medium") }, /* T */ { SST(0x3B, 0x02, SS_RDEF, "Tape position error at end-of-medium") }, /* L */ { SST(0x3B, 0x03, SS_RDEF, "Tape or electronic vertical forms unit not ready") }, /* L */ { SST(0x3B, 0x04, SS_RDEF, "Slew failure") }, /* L */ { SST(0x3B, 0x05, SS_RDEF, "Paper jam") }, /* L */ { SST(0x3B, 0x06, SS_RDEF, "Failed to sense top-of-form") }, /* L */ { SST(0x3B, 0x07, SS_RDEF, "Failed to sense bottom-of-form") }, /* T */ { SST(0x3B, 0x08, SS_RDEF, "Reposition error") }, /* */ { SST(0x3B, 0x09, SS_RDEF, "Read past end of medium") }, /* */ { SST(0x3B, 0x0A, SS_RDEF, "Read past beginning of medium") }, /* */ { SST(0x3B, 0x0B, SS_RDEF, "Position past end of medium") }, /* T */ { SST(0x3B, 0x0C, SS_RDEF, "Position past beginning of medium") }, /* DT WROM BK */ { SST(0x3B, 0x0D, SS_FATAL | ENOSPC, "Medium destination element full") }, /* DT WROM BK */ { SST(0x3B, 0x0E, SS_RDEF, "Medium source element empty") }, /* R */ { SST(0x3B, 0x0F, SS_RDEF, "End of medium reached") }, /* DT WROM BK */ { SST(0x3B, 0x11, SS_RDEF, "Medium magazine not accessible") }, /* DT WROM BK */ { SST(0x3B, 0x12, SS_RDEF, "Medium magazine removed") }, /* DT WROM BK */ { SST(0x3B, 0x13, SS_RDEF, "Medium magazine inserted") }, /* DT WROM BK */ { SST(0x3B, 0x14, SS_RDEF, "Medium magazine locked") }, /* DT WROM BK */ { SST(0x3B, 0x15, SS_RDEF, "Medium magazine unlocked") }, /* R */ { SST(0x3B, 0x16, SS_RDEF, /* XXX TBD */ "Mechanical positioning or changer error") }, /* F */ { SST(0x3B, 0x17, SS_RDEF, /* XXX TBD */ "Read past end of user object") }, /* M */ { SST(0x3B, 0x18, SS_RDEF, /* XXX TBD */ "Element disabled") }, /* M */ { SST(0x3B, 0x19, SS_RDEF, /* XXX TBD */ "Element enabled") }, /* M */ { SST(0x3B, 0x1A, SS_RDEF, /* XXX TBD */ "Data transfer device removed") }, /* M */ { SST(0x3B, 0x1B, SS_RDEF, /* XXX TBD */ "Data transfer device inserted") }, /* T */ { SST(0x3B, 0x1C, SS_RDEF, /* XXX TBD */ "Too many logical objects on partition to support operation") }, /* DTLPWROMAE K */ { SST(0x3D, 0x00, SS_RDEF, "Invalid bits in IDENTIFY message") }, /* DTLPWROMAEBKVF */ { SST(0x3E, 0x00, SS_RDEF, "Logical unit has not self-configured yet") }, /* DTLPWROMAEBKVF */ { SST(0x3E, 0x01, SS_RDEF, "Logical unit failure") }, /* DTLPWROMAEBKVF */ { SST(0x3E, 0x02, SS_RDEF, "Timeout on logical unit") }, /* DTLPWROMAEBKVF */ { SST(0x3E, 0x03, SS_RDEF, /* XXX TBD */ "Logical unit failed self-test") }, /* DTLPWROMAEBKVF */ { SST(0x3E, 0x04, SS_RDEF, /* XXX TBD */ "Logical unit unable to update self-test log") }, /* DTLPWROMAEBKVF */ { SST(0x3F, 0x00, SS_RDEF, "Target operating conditions have changed") }, /* DTLPWROMAEBKVF */ { SST(0x3F, 0x01, SS_RDEF, "Microcode has been changed") }, /* DTLPWROM BK */ { SST(0x3F, 0x02, SS_RDEF, "Changed operating definition") }, /* DTLPWROMAEBKVF */ { SST(0x3F, 0x03, SS_RDEF, "INQUIRY data has changed") }, /* DT WROMAEBK */ { SST(0x3F, 0x04, SS_RDEF, "Component device attached") }, /* DT WROMAEBK */ { SST(0x3F, 0x05, SS_RDEF, "Device identifier changed") }, /* DT WROMAEB */ { SST(0x3F, 0x06, SS_RDEF, "Redundancy group created or modified") }, /* DT WROMAEB */ { SST(0x3F, 0x07, SS_RDEF, "Redundancy group deleted") }, /* DT WROMAEB */ { SST(0x3F, 0x08, SS_RDEF, "Spare created or modified") }, /* DT WROMAEB */ { SST(0x3F, 0x09, SS_RDEF, "Spare deleted") }, /* DT WROMAEBK */ { SST(0x3F, 0x0A, SS_RDEF, "Volume set created or modified") }, /* DT WROMAEBK */ { SST(0x3F, 0x0B, SS_RDEF, "Volume set deleted") }, /* DT WROMAEBK */ { SST(0x3F, 0x0C, SS_RDEF, "Volume set deassigned") }, /* DT WROMAEBK */ { SST(0x3F, 0x0D, SS_RDEF, "Volume set reassigned") }, /* DTLPWROMAE */ { SST(0x3F, 0x0E, SS_RDEF | SSQ_RESCAN , "Reported LUNs data has changed") }, /* DTLPWROMAEBKVF */ { SST(0x3F, 0x0F, SS_RDEF, /* XXX TBD */ "Echo buffer overwritten") }, /* DT WROM B */ { SST(0x3F, 0x10, SS_RDEF, /* XXX TBD */ "Medium loadable") }, /* DT WROM B */ { SST(0x3F, 0x11, SS_RDEF, /* XXX TBD */ "Medium auxiliary memory accessible") }, /* DTLPWR MAEBK F */ { SST(0x3F, 0x12, SS_RDEF, /* XXX TBD */ "iSCSI IP address added") }, /* DTLPWR MAEBK F */ { SST(0x3F, 0x13, SS_RDEF, /* XXX TBD */ "iSCSI IP address removed") }, /* DTLPWR MAEBK F */ { SST(0x3F, 0x14, SS_RDEF, /* XXX TBD */ "iSCSI IP address changed") }, /* DTLPWR MAEBK */ { SST(0x3F, 0x15, SS_RDEF, /* XXX TBD */ "Inspect referrals sense descriptors") }, /* DTLPWROMAEBKVF */ { SST(0x3F, 0x16, SS_RDEF, /* XXX TBD */ "Microcode has been changed without reset") }, /* D */ { SST(0x3F, 0x17, SS_RDEF, /* XXX TBD */ "Zone transition to full") }, /* D */ { SST(0x40, 0x00, SS_RDEF, "RAM failure") }, /* deprecated - use 40 NN instead */ /* DTLPWROMAEBKVF */ { SST(0x40, 0x80, SS_RDEF, "Diagnostic failure: ASCQ = Component ID") }, /* DTLPWROMAEBKVF */ { SST(0x40, 0xFF, SS_RDEF | SSQ_RANGE, NULL) }, /* Range 0x80->0xFF */ /* D */ { SST(0x41, 0x00, SS_RDEF, "Data path failure") }, /* deprecated - use 40 NN instead */ /* D */ { SST(0x42, 0x00, SS_RDEF, "Power-on or self-test failure") }, /* deprecated - use 40 NN instead */ /* DTLPWROMAEBKVF */ { SST(0x43, 0x00, SS_RDEF, "Message error") }, /* DTLPWROMAEBKVF */ { SST(0x44, 0x00, SS_FATAL | EIO, "Internal target failure") }, /* DT P MAEBKVF */ { SST(0x44, 0x01, SS_RDEF, /* XXX TBD */ "Persistent reservation information lost") }, /* DT B */ { SST(0x44, 0x71, SS_RDEF, /* XXX TBD */ "ATA device failed set features") }, /* DTLPWROMAEBKVF */ { SST(0x45, 0x00, SS_RDEF, "Select or reselect failure") }, /* DTLPWROM BK */ { SST(0x46, 0x00, SS_RDEF, "Unsuccessful soft reset") }, /* DTLPWROMAEBKVF */ { SST(0x47, 0x00, SS_RDEF, "SCSI parity error") }, /* DTLPWROMAEBKVF */ { SST(0x47, 0x01, SS_RDEF, /* XXX TBD */ "Data phase CRC error detected") }, /* DTLPWROMAEBKVF */ { SST(0x47, 0x02, SS_RDEF, /* XXX TBD */ "SCSI parity error detected during ST data phase") }, /* DTLPWROMAEBKVF */ { SST(0x47, 0x03, SS_RDEF, /* XXX TBD */ "Information unit iuCRC error detected") }, /* DTLPWROMAEBKVF */ { SST(0x47, 0x04, SS_RDEF, /* XXX TBD */ "Asynchronous information protection error detected") }, /* DTLPWROMAEBKVF */ { SST(0x47, 0x05, SS_RDEF, /* XXX TBD */ "Protocol service CRC error") }, /* DT MAEBKVF */ { SST(0x47, 0x06, SS_RDEF, /* XXX TBD */ "PHY test function in progress") }, /* DT PWROMAEBK */ { SST(0x47, 0x7F, SS_RDEF, /* XXX TBD */ "Some commands cleared by iSCSI protocol event") }, /* DTLPWROMAEBKVF */ { SST(0x48, 0x00, SS_RDEF, "Initiator detected error message received") }, /* DTLPWROMAEBKVF */ { SST(0x49, 0x00, SS_RDEF, "Invalid message error") }, /* DTLPWROMAEBKVF */ { SST(0x4A, 0x00, SS_RDEF, "Command phase error") }, /* DTLPWROMAEBKVF */ { SST(0x4B, 0x00, SS_RDEF, "Data phase error") }, /* DT PWROMAEBK */ { SST(0x4B, 0x01, SS_RDEF, /* XXX TBD */ "Invalid target port transfer tag received") }, /* DT PWROMAEBK */ { SST(0x4B, 0x02, SS_RDEF, /* XXX TBD */ "Too much write data") }, /* DT PWROMAEBK */ { SST(0x4B, 0x03, SS_RDEF, /* XXX TBD */ "ACK/NAK timeout") }, /* DT PWROMAEBK */ { SST(0x4B, 0x04, SS_RDEF, /* XXX TBD */ "NAK received") }, /* DT PWROMAEBK */ { SST(0x4B, 0x05, SS_RDEF, /* XXX TBD */ "Data offset error") }, /* DT PWROMAEBK */ { SST(0x4B, 0x06, SS_RDEF, /* XXX TBD */ "Initiator response timeout") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x07, SS_RDEF, /* XXX TBD */ "Connection lost") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x08, SS_RDEF, /* XXX TBD */ "Data-in buffer overflow - data buffer size") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x09, SS_RDEF, /* XXX TBD */ "Data-in buffer overflow - data buffer descriptor area") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x0A, SS_RDEF, /* XXX TBD */ "Data-in buffer error") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x0B, SS_RDEF, /* XXX TBD */ "Data-out buffer overflow - data buffer size") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x0C, SS_RDEF, /* XXX TBD */ "Data-out buffer overflow - data buffer descriptor area") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x0D, SS_RDEF, /* XXX TBD */ "Data-out buffer error") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x0E, SS_RDEF, /* XXX TBD */ "PCIe fabric error") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x0F, SS_RDEF, /* XXX TBD */ "PCIe completion timeout") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x10, SS_RDEF, /* XXX TBD */ "PCIe completer abort") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x11, SS_RDEF, /* XXX TBD */ "PCIe poisoned TLP received") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x12, SS_RDEF, /* XXX TBD */ "PCIe ECRC check failed") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x13, SS_RDEF, /* XXX TBD */ "PCIe unsupported request") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x14, SS_RDEF, /* XXX TBD */ "PCIe ACS violation") }, /* DT PWROMAEBK F */ { SST(0x4B, 0x15, SS_RDEF, /* XXX TBD */ "PCIe TLP prefix blocket") }, /* DTLPWROMAEBKVF */ { SST(0x4C, 0x00, SS_RDEF, "Logical unit failed self-configuration") }, /* DTLPWROMAEBKVF */ { SST(0x4D, 0x00, SS_RDEF, "Tagged overlapped commands: ASCQ = Queue tag ID") }, /* DTLPWROMAEBKVF */ { SST(0x4D, 0xFF, SS_RDEF | SSQ_RANGE, NULL) }, /* Range 0x00->0xFF */ /* DTLPWROMAEBKVF */ { SST(0x4E, 0x00, SS_RDEF, "Overlapped commands attempted") }, /* T */ { SST(0x50, 0x00, SS_RDEF, "Write append error") }, /* T */ { SST(0x50, 0x01, SS_RDEF, "Write append position error") }, /* T */ { SST(0x50, 0x02, SS_RDEF, "Position error related to timing") }, /* T RO */ { SST(0x51, 0x00, SS_RDEF, "Erase failure") }, /* R */ { SST(0x51, 0x01, SS_RDEF, /* XXX TBD */ "Erase failure - incomplete erase operation detected") }, /* T */ { SST(0x52, 0x00, SS_RDEF, "Cartridge fault") }, /* DTL WROM BK */ { SST(0x53, 0x00, SS_RDEF, "Media load or eject failed") }, /* T */ { SST(0x53, 0x01, SS_RDEF, "Unload tape failure") }, /* DT WROM BK */ { SST(0x53, 0x02, SS_RDEF, "Medium removal prevented") }, /* M */ { SST(0x53, 0x03, SS_RDEF, /* XXX TBD */ "Medium removal prevented by data transfer element") }, /* T */ { SST(0x53, 0x04, SS_RDEF, /* XXX TBD */ "Medium thread or unthread failure") }, /* M */ { SST(0x53, 0x05, SS_RDEF, /* XXX TBD */ "Volume identifier invalid") }, /* T */ { SST(0x53, 0x06, SS_RDEF, /* XXX TBD */ "Volume identifier missing") }, /* M */ { SST(0x53, 0x07, SS_RDEF, /* XXX TBD */ "Duplicate volume identifier") }, /* M */ { SST(0x53, 0x08, SS_RDEF, /* XXX TBD */ "Element status unknown") }, /* M */ { SST(0x53, 0x09, SS_RDEF, /* XXX TBD */ "Data transfer device error - load failed") }, /* M */ { SST(0x53, 0x0A, SS_RDEF, /* XXX TBD */ "Data transfer device error - unload failed") }, /* M */ { SST(0x53, 0x0B, SS_RDEF, /* XXX TBD */ "Data transfer device error - unload missing") }, /* M */ { SST(0x53, 0x0C, SS_RDEF, /* XXX TBD */ "Data transfer device error - eject failed") }, /* M */ { SST(0x53, 0x0D, SS_RDEF, /* XXX TBD */ "Data transfer device error - library communication failed") }, /* P */ { SST(0x54, 0x00, SS_RDEF, "SCSI to host system interface failure") }, /* P */ { SST(0x55, 0x00, SS_RDEF, "System resource failure") }, /* D O BK */ { SST(0x55, 0x01, SS_FATAL | ENOSPC, "System buffer full") }, /* DTLPWROMAE K */ { SST(0x55, 0x02, SS_RDEF, /* XXX TBD */ "Insufficient reservation resources") }, /* DTLPWROMAE K */ { SST(0x55, 0x03, SS_RDEF, /* XXX TBD */ "Insufficient resources") }, /* DTLPWROMAE K */ { SST(0x55, 0x04, SS_RDEF, /* XXX TBD */ "Insufficient registration resources") }, /* DT PWROMAEBK */ { SST(0x55, 0x05, SS_RDEF, /* XXX TBD */ "Insufficient access control resources") }, /* DT WROM B */ { SST(0x55, 0x06, SS_RDEF, /* XXX TBD */ "Auxiliary memory out of space") }, /* F */ { SST(0x55, 0x07, SS_RDEF, /* XXX TBD */ "Quota error") }, /* T */ { SST(0x55, 0x08, SS_RDEF, /* XXX TBD */ "Maximum number of supplemental decryption keys exceeded") }, /* M */ { SST(0x55, 0x09, SS_RDEF, /* XXX TBD */ "Medium auxiliary memory not accessible") }, /* M */ { SST(0x55, 0x0A, SS_RDEF, /* XXX TBD */ "Data currently unavailable") }, /* DTLPWROMAEBKVF */ { SST(0x55, 0x0B, SS_RDEF, /* XXX TBD */ "Insufficient power for operation") }, /* DT P B */ { SST(0x55, 0x0C, SS_RDEF, /* XXX TBD */ "Insufficient resources to create ROD") }, /* DT P B */ { SST(0x55, 0x0D, SS_RDEF, /* XXX TBD */ "Insufficient resources to create ROD token") }, /* D */ { SST(0x55, 0x0E, SS_RDEF, /* XXX TBD */ "Insufficient zone resources") }, /* D */ { SST(0x55, 0x0F, SS_RDEF, /* XXX TBD */ "Insufficient zone resources to complete write") }, /* D */ { SST(0x55, 0x10, SS_RDEF, /* XXX TBD */ "Maximum number of streams open") }, /* R */ { SST(0x57, 0x00, SS_RDEF, "Unable to recover table-of-contents") }, /* O */ { SST(0x58, 0x00, SS_RDEF, "Generation does not exist") }, /* O */ { SST(0x59, 0x00, SS_RDEF, "Updated block read") }, /* DTLPWRO BK */ { SST(0x5A, 0x00, SS_RDEF, "Operator request or state change input") }, /* DT WROM BK */ { SST(0x5A, 0x01, SS_RDEF, "Operator medium removal request") }, /* DT WRO A BK */ { SST(0x5A, 0x02, SS_RDEF, "Operator selected write protect") }, /* DT WRO A BK */ { SST(0x5A, 0x03, SS_RDEF, "Operator selected write permit") }, /* DTLPWROM K */ { SST(0x5B, 0x00, SS_RDEF, "Log exception") }, /* DTLPWROM K */ { SST(0x5B, 0x01, SS_RDEF, "Threshold condition met") }, /* DTLPWROM K */ { SST(0x5B, 0x02, SS_RDEF, "Log counter at maximum") }, /* DTLPWROM K */ { SST(0x5B, 0x03, SS_RDEF, "Log list codes exhausted") }, /* D O */ { SST(0x5C, 0x00, SS_RDEF, "RPL status change") }, /* D O */ { SST(0x5C, 0x01, SS_NOP | SSQ_PRINT_SENSE, "Spindles synchronized") }, /* D O */ { SST(0x5C, 0x02, SS_RDEF, "Spindles not synchronized") }, /* DTLPWROMAEBKVF */ { SST(0x5D, 0x00, SS_NOP | SSQ_PRINT_SENSE, "Failure prediction threshold exceeded") }, /* R B */ { SST(0x5D, 0x01, SS_NOP | SSQ_PRINT_SENSE, "Media failure prediction threshold exceeded") }, /* R */ { SST(0x5D, 0x02, SS_NOP | SSQ_PRINT_SENSE, "Logical unit failure prediction threshold exceeded") }, /* R */ { SST(0x5D, 0x03, SS_NOP | SSQ_PRINT_SENSE, "Spare area exhaustion prediction threshold exceeded") }, /* D B */ { SST(0x5D, 0x10, SS_NOP | SSQ_PRINT_SENSE, "Hardware impending failure general hard drive failure") }, /* D B */ { SST(0x5D, 0x11, SS_NOP | SSQ_PRINT_SENSE, "Hardware impending failure drive error rate too high") }, /* D B */ { SST(0x5D, 0x12, SS_NOP | SSQ_PRINT_SENSE, "Hardware impending failure data error rate too high") }, /* D B */ { SST(0x5D, 0x13, SS_NOP | SSQ_PRINT_SENSE, "Hardware impending failure seek error rate too high") }, /* D B */ { SST(0x5D, 0x14, SS_NOP | SSQ_PRINT_SENSE, "Hardware impending failure too many block reassigns") }, /* D B */ { SST(0x5D, 0x15, SS_NOP | SSQ_PRINT_SENSE, "Hardware impending failure access times too high") }, /* D B */ { SST(0x5D, 0x16, SS_NOP | SSQ_PRINT_SENSE, "Hardware impending failure start unit times too high") }, /* D B */ { SST(0x5D, 0x17, SS_NOP | SSQ_PRINT_SENSE, "Hardware impending failure channel parametrics") }, /* D B */ { SST(0x5D, 0x18, SS_NOP | SSQ_PRINT_SENSE, "Hardware impending failure controller detected") }, /* D B */ { SST(0x5D, 0x19, SS_NOP | SSQ_PRINT_SENSE, "Hardware impending failure throughput performance") }, /* D B */ { SST(0x5D, 0x1A, SS_NOP | SSQ_PRINT_SENSE, "Hardware impending failure seek time performance") }, /* D B */ { SST(0x5D, 0x1B, SS_NOP | SSQ_PRINT_SENSE, "Hardware impending failure spin-up retry count") }, /* D B */ { SST(0x5D, 0x1C, SS_NOP | SSQ_PRINT_SENSE, "Hardware impending failure drive calibration retry count") }, /* D B */ { SST(0x5D, 0x1D, SS_NOP | SSQ_PRINT_SENSE, "Hardware impending failure power loss protection circuit") }, /* D B */ { SST(0x5D, 0x20, SS_NOP | SSQ_PRINT_SENSE, "Controller impending failure general hard drive failure") }, /* D B */ { SST(0x5D, 0x21, SS_NOP | SSQ_PRINT_SENSE, "Controller impending failure drive error rate too high") }, /* D B */ { SST(0x5D, 0x22, SS_NOP | SSQ_PRINT_SENSE, "Controller impending failure data error rate too high") }, /* D B */ { SST(0x5D, 0x23, SS_NOP | SSQ_PRINT_SENSE, "Controller impending failure seek error rate too high") }, /* D B */ { SST(0x5D, 0x24, SS_NOP | SSQ_PRINT_SENSE, "Controller impending failure too many block reassigns") }, /* D B */ { SST(0x5D, 0x25, SS_NOP | SSQ_PRINT_SENSE, "Controller impending failure access times too high") }, /* D B */ { SST(0x5D, 0x26, SS_NOP | SSQ_PRINT_SENSE, "Controller impending failure start unit times too high") }, /* D B */ { SST(0x5D, 0x27, SS_NOP | SSQ_PRINT_SENSE, "Controller impending failure channel parametrics") }, /* D B */ { SST(0x5D, 0x28, SS_NOP | SSQ_PRINT_SENSE, "Controller impending failure controller detected") }, /* D B */ { SST(0x5D, 0x29, SS_NOP | SSQ_PRINT_SENSE, "Controller impending failure throughput performance") }, /* D B */ { SST(0x5D, 0x2A, SS_NOP | SSQ_PRINT_SENSE, "Controller impending failure seek time performance") }, /* D B */ { SST(0x5D, 0x2B, SS_NOP | SSQ_PRINT_SENSE, "Controller impending failure spin-up retry count") }, /* D B */ { SST(0x5D, 0x2C, SS_NOP | SSQ_PRINT_SENSE, "Controller impending failure drive calibration retry count") }, /* D B */ { SST(0x5D, 0x30, SS_NOP | SSQ_PRINT_SENSE, "Data channel impending failure general hard drive failure") }, /* D B */ { SST(0x5D, 0x31, SS_NOP | SSQ_PRINT_SENSE, "Data channel impending failure drive error rate too high") }, /* D B */ { SST(0x5D, 0x32, SS_NOP | SSQ_PRINT_SENSE, "Data channel impending failure data error rate too high") }, /* D B */ { SST(0x5D, 0x33, SS_NOP | SSQ_PRINT_SENSE, "Data channel impending failure seek error rate too high") }, /* D B */ { SST(0x5D, 0x34, SS_NOP | SSQ_PRINT_SENSE, "Data channel impending failure too many block reassigns") }, /* D B */ { SST(0x5D, 0x35, SS_NOP | SSQ_PRINT_SENSE, "Data channel impending failure access times too high") }, /* D B */ { SST(0x5D, 0x36, SS_NOP | SSQ_PRINT_SENSE, "Data channel impending failure start unit times too high") }, /* D B */ { SST(0x5D, 0x37, SS_NOP | SSQ_PRINT_SENSE, "Data channel impending failure channel parametrics") }, /* D B */ { SST(0x5D, 0x38, SS_NOP | SSQ_PRINT_SENSE, "Data channel impending failure controller detected") }, /* D B */ { SST(0x5D, 0x39, SS_NOP | SSQ_PRINT_SENSE, "Data channel impending failure throughput performance") }, /* D B */ { SST(0x5D, 0x3A, SS_NOP | SSQ_PRINT_SENSE, "Data channel impending failure seek time performance") }, /* D B */ { SST(0x5D, 0x3B, SS_NOP | SSQ_PRINT_SENSE, "Data channel impending failure spin-up retry count") }, /* D B */ { SST(0x5D, 0x3C, SS_NOP | SSQ_PRINT_SENSE, "Data channel impending failure drive calibration retry count") }, /* D B */ { SST(0x5D, 0x40, SS_NOP | SSQ_PRINT_SENSE, "Servo impending failure general hard drive failure") }, /* D B */ { SST(0x5D, 0x41, SS_NOP | SSQ_PRINT_SENSE, "Servo impending failure drive error rate too high") }, /* D B */ { SST(0x5D, 0x42, SS_NOP | SSQ_PRINT_SENSE, "Servo impending failure data error rate too high") }, /* D B */ { SST(0x5D, 0x43, SS_NOP | SSQ_PRINT_SENSE, "Servo impending failure seek error rate too high") }, /* D B */ { SST(0x5D, 0x44, SS_NOP | SSQ_PRINT_SENSE, "Servo impending failure too many block reassigns") }, /* D B */ { SST(0x5D, 0x45, SS_NOP | SSQ_PRINT_SENSE, "Servo impending failure access times too high") }, /* D B */ { SST(0x5D, 0x46, SS_NOP | SSQ_PRINT_SENSE, "Servo impending failure start unit times too high") }, /* D B */ { SST(0x5D, 0x47, SS_NOP | SSQ_PRINT_SENSE, "Servo impending failure channel parametrics") }, /* D B */ { SST(0x5D, 0x48, SS_NOP | SSQ_PRINT_SENSE, "Servo impending failure controller detected") }, /* D B */ { SST(0x5D, 0x49, SS_NOP | SSQ_PRINT_SENSE, "Servo impending failure throughput performance") }, /* D B */ { SST(0x5D, 0x4A, SS_NOP | SSQ_PRINT_SENSE, "Servo impending failure seek time performance") }, /* D B */ { SST(0x5D, 0x4B, SS_NOP | SSQ_PRINT_SENSE, "Servo impending failure spin-up retry count") }, /* D B */ { SST(0x5D, 0x4C, SS_NOP | SSQ_PRINT_SENSE, "Servo impending failure drive calibration retry count") }, /* D B */ { SST(0x5D, 0x50, SS_NOP | SSQ_PRINT_SENSE, "Spindle impending failure general hard drive failure") }, /* D B */ { SST(0x5D, 0x51, SS_NOP | SSQ_PRINT_SENSE, "Spindle impending failure drive error rate too high") }, /* D B */ { SST(0x5D, 0x52, SS_NOP | SSQ_PRINT_SENSE, "Spindle impending failure data error rate too high") }, /* D B */ { SST(0x5D, 0x53, SS_NOP | SSQ_PRINT_SENSE, "Spindle impending failure seek error rate too high") }, /* D B */ { SST(0x5D, 0x54, SS_NOP | SSQ_PRINT_SENSE, "Spindle impending failure too many block reassigns") }, /* D B */ { SST(0x5D, 0x55, SS_NOP | SSQ_PRINT_SENSE, "Spindle impending failure access times too high") }, /* D B */ { SST(0x5D, 0x56, SS_NOP | SSQ_PRINT_SENSE, "Spindle impending failure start unit times too high") }, /* D B */ { SST(0x5D, 0x57, SS_NOP | SSQ_PRINT_SENSE, "Spindle impending failure channel parametrics") }, /* D B */ { SST(0x5D, 0x58, SS_NOP | SSQ_PRINT_SENSE, "Spindle impending failure controller detected") }, /* D B */ { SST(0x5D, 0x59, SS_NOP | SSQ_PRINT_SENSE, "Spindle impending failure throughput performance") }, /* D B */ { SST(0x5D, 0x5A, SS_NOP | SSQ_PRINT_SENSE, "Spindle impending failure seek time performance") }, /* D B */ { SST(0x5D, 0x5B, SS_NOP | SSQ_PRINT_SENSE, "Spindle impending failure spin-up retry count") }, /* D B */ { SST(0x5D, 0x5C, SS_NOP | SSQ_PRINT_SENSE, "Spindle impending failure drive calibration retry count") }, /* D B */ { SST(0x5D, 0x60, SS_NOP | SSQ_PRINT_SENSE, "Firmware impending failure general hard drive failure") }, /* D B */ { SST(0x5D, 0x61, SS_NOP | SSQ_PRINT_SENSE, "Firmware impending failure drive error rate too high") }, /* D B */ { SST(0x5D, 0x62, SS_NOP | SSQ_PRINT_SENSE, "Firmware impending failure data error rate too high") }, /* D B */ { SST(0x5D, 0x63, SS_NOP | SSQ_PRINT_SENSE, "Firmware impending failure seek error rate too high") }, /* D B */ { SST(0x5D, 0x64, SS_NOP | SSQ_PRINT_SENSE, "Firmware impending failure too many block reassigns") }, /* D B */ { SST(0x5D, 0x65, SS_NOP | SSQ_PRINT_SENSE, "Firmware impending failure access times too high") }, /* D B */ { SST(0x5D, 0x66, SS_NOP | SSQ_PRINT_SENSE, "Firmware impending failure start unit times too high") }, /* D B */ { SST(0x5D, 0x67, SS_NOP | SSQ_PRINT_SENSE, "Firmware impending failure channel parametrics") }, /* D B */ { SST(0x5D, 0x68, SS_NOP | SSQ_PRINT_SENSE, "Firmware impending failure controller detected") }, /* D B */ { SST(0x5D, 0x69, SS_NOP | SSQ_PRINT_SENSE, "Firmware impending failure throughput performance") }, /* D B */ { SST(0x5D, 0x6A, SS_NOP | SSQ_PRINT_SENSE, "Firmware impending failure seek time performance") }, /* D B */ { SST(0x5D, 0x6B, SS_NOP | SSQ_PRINT_SENSE, "Firmware impending failure spin-up retry count") }, /* D B */ { SST(0x5D, 0x6C, SS_NOP | SSQ_PRINT_SENSE, "Firmware impending failure drive calibration retry count") }, /* D B */ { SST(0x5D, 0x73, SS_NOP | SSQ_PRINT_SENSE, "Media impending failure endurance limit met") }, /* DTLPWROMAEBKVF */ { SST(0x5D, 0xFF, SS_NOP | SSQ_PRINT_SENSE, "Failure prediction threshold exceeded (false)") }, /* DTLPWRO A K */ { SST(0x5E, 0x00, SS_RDEF, "Low power condition on") }, /* DTLPWRO A K */ { SST(0x5E, 0x01, SS_RDEF, "Idle condition activated by timer") }, /* DTLPWRO A K */ { SST(0x5E, 0x02, SS_RDEF, "Standby condition activated by timer") }, /* DTLPWRO A K */ { SST(0x5E, 0x03, SS_RDEF, "Idle condition activated by command") }, /* DTLPWRO A K */ { SST(0x5E, 0x04, SS_RDEF, "Standby condition activated by command") }, /* DTLPWRO A K */ { SST(0x5E, 0x05, SS_RDEF, "Idle-B condition activated by timer") }, /* DTLPWRO A K */ { SST(0x5E, 0x06, SS_RDEF, "Idle-B condition activated by command") }, /* DTLPWRO A K */ { SST(0x5E, 0x07, SS_RDEF, "Idle-C condition activated by timer") }, /* DTLPWRO A K */ { SST(0x5E, 0x08, SS_RDEF, "Idle-C condition activated by command") }, /* DTLPWRO A K */ { SST(0x5E, 0x09, SS_RDEF, "Standby-Y condition activated by timer") }, /* DTLPWRO A K */ { SST(0x5E, 0x0A, SS_RDEF, "Standby-Y condition activated by command") }, /* B */ { SST(0x5E, 0x41, SS_RDEF, /* XXX TBD */ "Power state change to active") }, /* B */ { SST(0x5E, 0x42, SS_RDEF, /* XXX TBD */ "Power state change to idle") }, /* B */ { SST(0x5E, 0x43, SS_RDEF, /* XXX TBD */ "Power state change to standby") }, /* B */ { SST(0x5E, 0x45, SS_RDEF, /* XXX TBD */ "Power state change to sleep") }, /* BK */ { SST(0x5E, 0x47, SS_RDEF, /* XXX TBD */ "Power state change to device control") }, /* */ { SST(0x60, 0x00, SS_RDEF, "Lamp failure") }, /* */ { SST(0x61, 0x00, SS_RDEF, "Video acquisition error") }, /* */ { SST(0x61, 0x01, SS_RDEF, "Unable to acquire video") }, /* */ { SST(0x61, 0x02, SS_RDEF, "Out of focus") }, /* */ { SST(0x62, 0x00, SS_RDEF, "Scan head positioning error") }, /* R */ { SST(0x63, 0x00, SS_RDEF, "End of user area encountered on this track") }, /* R */ { SST(0x63, 0x01, SS_FATAL | ENOSPC, "Packet does not fit in available space") }, /* R */ { SST(0x64, 0x00, SS_FATAL | ENXIO, "Illegal mode for this track") }, /* R */ { SST(0x64, 0x01, SS_RDEF, "Invalid packet size") }, /* DTLPWROMAEBKVF */ { SST(0x65, 0x00, SS_RDEF, "Voltage fault") }, /* */ { SST(0x66, 0x00, SS_RDEF, "Automatic document feeder cover up") }, /* */ { SST(0x66, 0x01, SS_RDEF, "Automatic document feeder lift up") }, /* */ { SST(0x66, 0x02, SS_RDEF, "Document jam in automatic document feeder") }, /* */ { SST(0x66, 0x03, SS_RDEF, "Document miss feed automatic in document feeder") }, /* A */ { SST(0x67, 0x00, SS_RDEF, "Configuration failure") }, /* A */ { SST(0x67, 0x01, SS_RDEF, "Configuration of incapable logical units failed") }, /* A */ { SST(0x67, 0x02, SS_RDEF, "Add logical unit failed") }, /* A */ { SST(0x67, 0x03, SS_RDEF, "Modification of logical unit failed") }, /* A */ { SST(0x67, 0x04, SS_RDEF, "Exchange of logical unit failed") }, /* A */ { SST(0x67, 0x05, SS_RDEF, "Remove of logical unit failed") }, /* A */ { SST(0x67, 0x06, SS_RDEF, "Attachment of logical unit failed") }, /* A */ { SST(0x67, 0x07, SS_RDEF, "Creation of logical unit failed") }, /* A */ { SST(0x67, 0x08, SS_RDEF, /* XXX TBD */ "Assign failure occurred") }, /* A */ { SST(0x67, 0x09, SS_RDEF, /* XXX TBD */ "Multiply assigned logical unit") }, /* DTLPWROMAEBKVF */ { SST(0x67, 0x0A, SS_RDEF, /* XXX TBD */ "Set target port groups command failed") }, /* DT B */ { SST(0x67, 0x0B, SS_RDEF, /* XXX TBD */ "ATA device feature not enabled") }, /* A */ { SST(0x68, 0x00, SS_RDEF, "Logical unit not configured") }, /* D */ { SST(0x68, 0x01, SS_RDEF, "Subsidiary logical unit not configured") }, /* A */ { SST(0x69, 0x00, SS_RDEF, "Data loss on logical unit") }, /* A */ { SST(0x69, 0x01, SS_RDEF, "Multiple logical unit failures") }, /* A */ { SST(0x69, 0x02, SS_RDEF, "Parity/data mismatch") }, /* A */ { SST(0x6A, 0x00, SS_RDEF, "Informational, refer to log") }, /* A */ { SST(0x6B, 0x00, SS_RDEF, "State change has occurred") }, /* A */ { SST(0x6B, 0x01, SS_RDEF, "Redundancy level got better") }, /* A */ { SST(0x6B, 0x02, SS_RDEF, "Redundancy level got worse") }, /* A */ { SST(0x6C, 0x00, SS_RDEF, "Rebuild failure occurred") }, /* A */ { SST(0x6D, 0x00, SS_RDEF, "Recalculate failure occurred") }, /* A */ { SST(0x6E, 0x00, SS_RDEF, "Command to logical unit failed") }, /* R */ { SST(0x6F, 0x00, SS_RDEF, /* XXX TBD */ "Copy protection key exchange failure - authentication failure") }, /* R */ { SST(0x6F, 0x01, SS_RDEF, /* XXX TBD */ "Copy protection key exchange failure - key not present") }, /* R */ { SST(0x6F, 0x02, SS_RDEF, /* XXX TBD */ "Copy protection key exchange failure - key not established") }, /* R */ { SST(0x6F, 0x03, SS_RDEF, /* XXX TBD */ "Read of scrambled sector without authentication") }, /* R */ { SST(0x6F, 0x04, SS_RDEF, /* XXX TBD */ "Media region code is mismatched to logical unit region") }, /* R */ { SST(0x6F, 0x05, SS_RDEF, /* XXX TBD */ "Drive region must be permanent/region reset count error") }, /* R */ { SST(0x6F, 0x06, SS_RDEF, /* XXX TBD */ "Insufficient block count for binding NONCE recording") }, /* R */ { SST(0x6F, 0x07, SS_RDEF, /* XXX TBD */ "Conflict in binding NONCE recording") }, /* T */ { SST(0x70, 0x00, SS_RDEF, "Decompression exception short: ASCQ = Algorithm ID") }, /* T */ { SST(0x70, 0xFF, SS_RDEF | SSQ_RANGE, NULL) }, /* Range 0x00 -> 0xFF */ /* T */ { SST(0x71, 0x00, SS_RDEF, "Decompression exception long: ASCQ = Algorithm ID") }, /* T */ { SST(0x71, 0xFF, SS_RDEF | SSQ_RANGE, NULL) }, /* Range 0x00 -> 0xFF */ /* R */ { SST(0x72, 0x00, SS_RDEF, "Session fixation error") }, /* R */ { SST(0x72, 0x01, SS_RDEF, "Session fixation error writing lead-in") }, /* R */ { SST(0x72, 0x02, SS_RDEF, "Session fixation error writing lead-out") }, /* R */ { SST(0x72, 0x03, SS_RDEF, "Session fixation error - incomplete track in session") }, /* R */ { SST(0x72, 0x04, SS_RDEF, "Empty or partially written reserved track") }, /* R */ { SST(0x72, 0x05, SS_RDEF, /* XXX TBD */ "No more track reservations allowed") }, /* R */ { SST(0x72, 0x06, SS_RDEF, /* XXX TBD */ "RMZ extension is not allowed") }, /* R */ { SST(0x72, 0x07, SS_RDEF, /* XXX TBD */ "No more test zone extensions are allowed") }, /* R */ { SST(0x73, 0x00, SS_RDEF, "CD control error") }, /* R */ { SST(0x73, 0x01, SS_RDEF, "Power calibration area almost full") }, /* R */ { SST(0x73, 0x02, SS_FATAL | ENOSPC, "Power calibration area is full") }, /* R */ { SST(0x73, 0x03, SS_RDEF, "Power calibration area error") }, /* R */ { SST(0x73, 0x04, SS_RDEF, "Program memory area update failure") }, /* R */ { SST(0x73, 0x05, SS_RDEF, "Program memory area is full") }, /* R */ { SST(0x73, 0x06, SS_RDEF, /* XXX TBD */ "RMA/PMA is almost full") }, /* R */ { SST(0x73, 0x10, SS_RDEF, /* XXX TBD */ "Current power calibration area almost full") }, /* R */ { SST(0x73, 0x11, SS_RDEF, /* XXX TBD */ "Current power calibration area is full") }, /* R */ { SST(0x73, 0x17, SS_RDEF, /* XXX TBD */ "RDZ is full") }, /* T */ { SST(0x74, 0x00, SS_RDEF, /* XXX TBD */ "Security error") }, /* T */ { SST(0x74, 0x01, SS_RDEF, /* XXX TBD */ "Unable to decrypt data") }, /* T */ { SST(0x74, 0x02, SS_RDEF, /* XXX TBD */ "Unencrypted data encountered while decrypting") }, /* T */ { SST(0x74, 0x03, SS_RDEF, /* XXX TBD */ "Incorrect data encryption key") }, /* T */ { SST(0x74, 0x04, SS_RDEF, /* XXX TBD */ "Cryptographic integrity validation failed") }, /* T */ { SST(0x74, 0x05, SS_RDEF, /* XXX TBD */ "Error decrypting data") }, /* T */ { SST(0x74, 0x06, SS_RDEF, /* XXX TBD */ "Unknown signature verification key") }, /* T */ { SST(0x74, 0x07, SS_RDEF, /* XXX TBD */ "Encryption parameters not useable") }, /* DT R M E VF */ { SST(0x74, 0x08, SS_RDEF, /* XXX TBD */ "Digital signature validation failure") }, /* T */ { SST(0x74, 0x09, SS_RDEF, /* XXX TBD */ "Encryption mode mismatch on read") }, /* T */ { SST(0x74, 0x0A, SS_RDEF, /* XXX TBD */ "Encrypted block not raw read enabled") }, /* T */ { SST(0x74, 0x0B, SS_RDEF, /* XXX TBD */ "Incorrect encryption parameters") }, /* DT R MAEBKV */ { SST(0x74, 0x0C, SS_RDEF, /* XXX TBD */ "Unable to decrypt parameter list") }, /* T */ { SST(0x74, 0x0D, SS_RDEF, /* XXX TBD */ "Encryption algorithm disabled") }, /* DT R MAEBKV */ { SST(0x74, 0x10, SS_RDEF, /* XXX TBD */ "SA creation parameter value invalid") }, /* DT R MAEBKV */ { SST(0x74, 0x11, SS_RDEF, /* XXX TBD */ "SA creation parameter value rejected") }, /* DT R MAEBKV */ { SST(0x74, 0x12, SS_RDEF, /* XXX TBD */ "Invalid SA usage") }, /* T */ { SST(0x74, 0x21, SS_RDEF, /* XXX TBD */ "Data encryption configuration prevented") }, /* DT R MAEBKV */ { SST(0x74, 0x30, SS_RDEF, /* XXX TBD */ "SA creation parameter not supported") }, /* DT R MAEBKV */ { SST(0x74, 0x40, SS_RDEF, /* XXX TBD */ "Authentication failed") }, /* V */ { SST(0x74, 0x61, SS_RDEF, /* XXX TBD */ "External data encryption key manager access error") }, /* V */ { SST(0x74, 0x62, SS_RDEF, /* XXX TBD */ "External data encryption key manager error") }, /* V */ { SST(0x74, 0x63, SS_RDEF, /* XXX TBD */ "External data encryption key not found") }, /* V */ { SST(0x74, 0x64, SS_RDEF, /* XXX TBD */ "External data encryption request not authorized") }, /* T */ { SST(0x74, 0x6E, SS_RDEF, /* XXX TBD */ "External data encryption control timeout") }, /* T */ { SST(0x74, 0x6F, SS_RDEF, /* XXX TBD */ "External data encryption control error") }, /* DT R M E V */ { SST(0x74, 0x71, SS_FATAL | EACCES, "Logical unit access not authorized") }, /* D */ { SST(0x74, 0x79, SS_FATAL | EACCES, "Security conflict in translated device") } }; const u_int asc_table_size = nitems(asc_table); struct asc_key { int asc; int ascq; }; static int ascentrycomp(const void *key, const void *member) { int asc; int ascq; const struct asc_table_entry *table_entry; asc = ((const struct asc_key *)key)->asc; ascq = ((const struct asc_key *)key)->ascq; table_entry = (const struct asc_table_entry *)member; if (asc >= table_entry->asc) { if (asc > table_entry->asc) return (1); if (ascq <= table_entry->ascq) { /* Check for ranges */ if (ascq == table_entry->ascq || ((table_entry->action & SSQ_RANGE) != 0 && ascq >= (table_entry - 1)->ascq)) return (0); return (-1); } return (1); } return (-1); } static int senseentrycomp(const void *key, const void *member) { int sense_key; const struct sense_key_table_entry *table_entry; sense_key = *((const int *)key); table_entry = (const struct sense_key_table_entry *)member; if (sense_key >= table_entry->sense_key) { if (sense_key == table_entry->sense_key) return (0); return (1); } return (-1); } static void fetchtableentries(int sense_key, int asc, int ascq, struct scsi_inquiry_data *inq_data, const struct sense_key_table_entry **sense_entry, const struct asc_table_entry **asc_entry) { caddr_t match; const struct asc_table_entry *asc_tables[2]; const struct sense_key_table_entry *sense_tables[2]; struct asc_key asc_ascq; size_t asc_tables_size[2]; size_t sense_tables_size[2]; int num_asc_tables; int num_sense_tables; int i; /* Default to failure */ *sense_entry = NULL; *asc_entry = NULL; match = NULL; if (inq_data != NULL) match = cam_quirkmatch((caddr_t)inq_data, (caddr_t)sense_quirk_table, sense_quirk_table_size, sizeof(*sense_quirk_table), scsi_inquiry_match); if (match != NULL) { struct scsi_sense_quirk_entry *quirk; quirk = (struct scsi_sense_quirk_entry *)match; asc_tables[0] = quirk->asc_info; asc_tables_size[0] = quirk->num_ascs; asc_tables[1] = asc_table; asc_tables_size[1] = asc_table_size; num_asc_tables = 2; sense_tables[0] = quirk->sense_key_info; sense_tables_size[0] = quirk->num_sense_keys; sense_tables[1] = sense_key_table; sense_tables_size[1] = nitems(sense_key_table); num_sense_tables = 2; } else { asc_tables[0] = asc_table; asc_tables_size[0] = asc_table_size; num_asc_tables = 1; sense_tables[0] = sense_key_table; sense_tables_size[0] = nitems(sense_key_table); num_sense_tables = 1; } asc_ascq.asc = asc; asc_ascq.ascq = ascq; for (i = 0; i < num_asc_tables; i++) { void *found_entry; found_entry = bsearch(&asc_ascq, asc_tables[i], asc_tables_size[i], sizeof(**asc_tables), ascentrycomp); if (found_entry) { *asc_entry = (struct asc_table_entry *)found_entry; break; } } for (i = 0; i < num_sense_tables; i++) { void *found_entry; found_entry = bsearch(&sense_key, sense_tables[i], sense_tables_size[i], sizeof(**sense_tables), senseentrycomp); if (found_entry) { *sense_entry = (struct sense_key_table_entry *)found_entry; break; } } } void scsi_sense_desc(int sense_key, int asc, int ascq, struct scsi_inquiry_data *inq_data, const char **sense_key_desc, const char **asc_desc) { const struct asc_table_entry *asc_entry; const struct sense_key_table_entry *sense_entry; fetchtableentries(sense_key, asc, ascq, inq_data, &sense_entry, &asc_entry); if (sense_entry != NULL) *sense_key_desc = sense_entry->desc; else *sense_key_desc = "Invalid Sense Key"; if (asc_entry != NULL) *asc_desc = asc_entry->desc; else if (asc >= 0x80 && asc <= 0xff) *asc_desc = "Vendor Specific ASC"; else if (ascq >= 0x80 && ascq <= 0xff) *asc_desc = "Vendor Specific ASCQ"; else *asc_desc = "Reserved ASC/ASCQ pair"; } /* * Given sense and device type information, return the appropriate action. * If we do not understand the specific error as identified by the ASC/ASCQ * pair, fall back on the more generic actions derived from the sense key. */ scsi_sense_action scsi_error_action(struct ccb_scsiio *csio, struct scsi_inquiry_data *inq_data, u_int32_t sense_flags) { const struct asc_table_entry *asc_entry; const struct sense_key_table_entry *sense_entry; int error_code, sense_key, asc, ascq; scsi_sense_action action; if (!scsi_extract_sense_ccb((union ccb *)csio, &error_code, &sense_key, &asc, &ascq)) { action = SS_RETRY | SSQ_DECREMENT_COUNT | SSQ_PRINT_SENSE | EIO; } else if ((error_code == SSD_DEFERRED_ERROR) || (error_code == SSD_DESC_DEFERRED_ERROR)) { /* * XXX dufault@FreeBSD.org * This error doesn't relate to the command associated * with this request sense. A deferred error is an error * for a command that has already returned GOOD status * (see SCSI2 8.2.14.2). * * By my reading of that section, it looks like the current * command has been cancelled, we should now clean things up * (hopefully recovering any lost data) and then retry the * current command. There are two easy choices, both wrong: * * 1. Drop through (like we had been doing), thus treating * this as if the error were for the current command and * return and stop the current command. * * 2. Issue a retry (like I made it do) thus hopefully * recovering the current transfer, and ignoring the * fact that we've dropped a command. * * These should probably be handled in a device specific * sense handler or punted back up to a user mode daemon */ action = SS_RETRY|SSQ_DECREMENT_COUNT|SSQ_PRINT_SENSE; } else { fetchtableentries(sense_key, asc, ascq, inq_data, &sense_entry, &asc_entry); /* * Override the 'No additional Sense' entry (0,0) * with the error action of the sense key. */ if (asc_entry != NULL && (asc != 0 || ascq != 0)) action = asc_entry->action; else if (sense_entry != NULL) action = sense_entry->action; else action = SS_RETRY|SSQ_DECREMENT_COUNT|SSQ_PRINT_SENSE; if (sense_key == SSD_KEY_RECOVERED_ERROR) { /* * The action succeeded but the device wants * the user to know that some recovery action * was required. */ action &= ~(SS_MASK|SSQ_MASK|SS_ERRMASK); action |= SS_NOP|SSQ_PRINT_SENSE; } else if (sense_key == SSD_KEY_ILLEGAL_REQUEST) { if ((sense_flags & SF_QUIET_IR) != 0) action &= ~SSQ_PRINT_SENSE; } else if (sense_key == SSD_KEY_UNIT_ATTENTION) { if ((sense_flags & SF_RETRY_UA) != 0 && (action & SS_MASK) == SS_FAIL) { action &= ~(SS_MASK|SSQ_MASK); action |= SS_RETRY|SSQ_DECREMENT_COUNT| SSQ_PRINT_SENSE; } action |= SSQ_UA; } } if ((action & SS_MASK) >= SS_START && (sense_flags & SF_NO_RECOVERY)) { action &= ~SS_MASK; action |= SS_FAIL; } else if ((action & SS_MASK) == SS_RETRY && (sense_flags & SF_NO_RETRY)) { action &= ~SS_MASK; action |= SS_FAIL; } if ((sense_flags & SF_PRINT_ALWAYS) != 0) action |= SSQ_PRINT_SENSE; else if ((sense_flags & SF_NO_PRINT) != 0) action &= ~SSQ_PRINT_SENSE; return (action); } char * scsi_cdb_string(u_int8_t *cdb_ptr, char *cdb_string, size_t len) { struct sbuf sb; int error; if (len == 0) return (""); sbuf_new(&sb, cdb_string, len, SBUF_FIXEDLEN); scsi_cdb_sbuf(cdb_ptr, &sb); /* ENOMEM just means that the fixed buffer is full, OK to ignore */ error = sbuf_finish(&sb); if (error != 0 && error != ENOMEM) return (""); return(sbuf_data(&sb)); } void scsi_cdb_sbuf(u_int8_t *cdb_ptr, struct sbuf *sb) { u_int8_t cdb_len; int i; if (cdb_ptr == NULL) return; /* * This is taken from the SCSI-3 draft spec. * (T10/1157D revision 0.3) * The top 3 bits of an opcode are the group code. The next 5 bits * are the command code. * Group 0: six byte commands * Group 1: ten byte commands * Group 2: ten byte commands * Group 3: reserved * Group 4: sixteen byte commands * Group 5: twelve byte commands * Group 6: vendor specific * Group 7: vendor specific */ switch((*cdb_ptr >> 5) & 0x7) { case 0: cdb_len = 6; break; case 1: case 2: cdb_len = 10; break; case 3: case 6: case 7: /* in this case, just print out the opcode */ cdb_len = 1; break; case 4: cdb_len = 16; break; case 5: cdb_len = 12; break; } for (i = 0; i < cdb_len; i++) sbuf_printf(sb, "%02hhx ", cdb_ptr[i]); return; } const char * scsi_status_string(struct ccb_scsiio *csio) { switch(csio->scsi_status) { case SCSI_STATUS_OK: return("OK"); case SCSI_STATUS_CHECK_COND: return("Check Condition"); case SCSI_STATUS_BUSY: return("Busy"); case SCSI_STATUS_INTERMED: return("Intermediate"); case SCSI_STATUS_INTERMED_COND_MET: return("Intermediate-Condition Met"); case SCSI_STATUS_RESERV_CONFLICT: return("Reservation Conflict"); case SCSI_STATUS_CMD_TERMINATED: return("Command Terminated"); case SCSI_STATUS_QUEUE_FULL: return("Queue Full"); case SCSI_STATUS_ACA_ACTIVE: return("ACA Active"); case SCSI_STATUS_TASK_ABORTED: return("Task Aborted"); default: { static char unkstr[64]; snprintf(unkstr, sizeof(unkstr), "Unknown %#x", csio->scsi_status); return(unkstr); } } } /* * scsi_command_string() returns 0 for success and -1 for failure. */ #ifdef _KERNEL int scsi_command_string(struct ccb_scsiio *csio, struct sbuf *sb) #else /* !_KERNEL */ int scsi_command_string(struct cam_device *device, struct ccb_scsiio *csio, struct sbuf *sb) #endif /* _KERNEL/!_KERNEL */ { struct scsi_inquiry_data *inq_data; #ifdef _KERNEL struct ccb_getdev *cgd; #endif /* _KERNEL */ #ifdef _KERNEL if ((cgd = (struct ccb_getdev*)xpt_alloc_ccb_nowait()) == NULL) return(-1); /* * Get the device information. */ xpt_setup_ccb(&cgd->ccb_h, csio->ccb_h.path, CAM_PRIORITY_NORMAL); cgd->ccb_h.func_code = XPT_GDEV_TYPE; xpt_action((union ccb *)cgd); /* * If the device is unconfigured, just pretend that it is a hard * drive. scsi_op_desc() needs this. */ if (cgd->ccb_h.status == CAM_DEV_NOT_THERE) cgd->inq_data.device = T_DIRECT; inq_data = &cgd->inq_data; #else /* !_KERNEL */ inq_data = &device->inq_data; #endif /* _KERNEL/!_KERNEL */ sbuf_printf(sb, "%s. CDB: ", scsi_op_desc(scsiio_cdb_ptr(csio)[0], inq_data)); scsi_cdb_sbuf(scsiio_cdb_ptr(csio), sb); #ifdef _KERNEL xpt_free_ccb((union ccb *)cgd); #endif return(0); } /* * Iterate over sense descriptors. Each descriptor is passed into iter_func(). * If iter_func() returns 0, list traversal continues. If iter_func() * returns non-zero, list traversal is stopped. */ void scsi_desc_iterate(struct scsi_sense_data_desc *sense, u_int sense_len, int (*iter_func)(struct scsi_sense_data_desc *sense, u_int, struct scsi_sense_desc_header *, void *), void *arg) { int cur_pos; int desc_len; /* * First make sure the extra length field is present. */ if (SSD_DESC_IS_PRESENT(sense, sense_len, extra_len) == 0) return; /* * The length of data actually returned may be different than the * extra_len recorded in the structure. */ desc_len = sense_len -offsetof(struct scsi_sense_data_desc, sense_desc); /* * Limit this further by the extra length reported, and the maximum * allowed extra length. */ desc_len = MIN(desc_len, MIN(sense->extra_len, SSD_EXTRA_MAX)); /* * Subtract the size of the header from the descriptor length. * This is to ensure that we have at least the header left, so we * don't have to check that inside the loop. This can wind up * being a negative value. */ desc_len -= sizeof(struct scsi_sense_desc_header); for (cur_pos = 0; cur_pos < desc_len;) { struct scsi_sense_desc_header *header; header = (struct scsi_sense_desc_header *) &sense->sense_desc[cur_pos]; /* * Check to make sure we have the entire descriptor. We * don't call iter_func() unless we do. * * Note that although cur_pos is at the beginning of the * descriptor, desc_len already has the header length * subtracted. So the comparison of the length in the * header (which does not include the header itself) to * desc_len - cur_pos is correct. */ if (header->length > (desc_len - cur_pos)) break; if (iter_func(sense, sense_len, header, arg) != 0) break; cur_pos += sizeof(*header) + header->length; } } struct scsi_find_desc_info { uint8_t desc_type; struct scsi_sense_desc_header *header; }; static int scsi_find_desc_func(struct scsi_sense_data_desc *sense, u_int sense_len, struct scsi_sense_desc_header *header, void *arg) { struct scsi_find_desc_info *desc_info; desc_info = (struct scsi_find_desc_info *)arg; if (header->desc_type == desc_info->desc_type) { desc_info->header = header; /* We found the descriptor, tell the iterator to stop. */ return (1); } else return (0); } /* * Given a descriptor type, return a pointer to it if it is in the sense * data and not truncated. Avoiding truncating sense data will simplify * things significantly for the caller. */ uint8_t * scsi_find_desc(struct scsi_sense_data_desc *sense, u_int sense_len, uint8_t desc_type) { struct scsi_find_desc_info desc_info; desc_info.desc_type = desc_type; desc_info.header = NULL; scsi_desc_iterate(sense, sense_len, scsi_find_desc_func, &desc_info); return ((uint8_t *)desc_info.header); } /* * Fill in SCSI descriptor sense data with the specified parameters. */ static void scsi_set_sense_data_desc_va(struct scsi_sense_data *sense_data, u_int *sense_len, scsi_sense_data_type sense_format, int current_error, int sense_key, int asc, int ascq, va_list ap) { struct scsi_sense_data_desc *sense; scsi_sense_elem_type elem_type; int space, len; uint8_t *desc, *data; memset(sense_data, 0, sizeof(*sense_data)); sense = (struct scsi_sense_data_desc *)sense_data; if (current_error != 0) sense->error_code = SSD_DESC_CURRENT_ERROR; else sense->error_code = SSD_DESC_DEFERRED_ERROR; sense->sense_key = sense_key; sense->add_sense_code = asc; sense->add_sense_code_qual = ascq; sense->flags = 0; desc = &sense->sense_desc[0]; space = *sense_len - offsetof(struct scsi_sense_data_desc, sense_desc); while ((elem_type = va_arg(ap, scsi_sense_elem_type)) != SSD_ELEM_NONE) { if (elem_type >= SSD_ELEM_MAX) { printf("%s: invalid sense type %d\n", __func__, elem_type); break; } len = va_arg(ap, int); data = va_arg(ap, uint8_t *); switch (elem_type) { case SSD_ELEM_SKIP: break; case SSD_ELEM_DESC: if (space < len) { sense->flags |= SSDD_SDAT_OVFL; break; } bcopy(data, desc, len); desc += len; space -= len; break; case SSD_ELEM_SKS: { struct scsi_sense_sks *sks = (void *)desc; if (len > sizeof(sks->sense_key_spec)) break; if (space < sizeof(*sks)) { sense->flags |= SSDD_SDAT_OVFL; break; } sks->desc_type = SSD_DESC_SKS; sks->length = sizeof(*sks) - (offsetof(struct scsi_sense_sks, length) + 1); bcopy(data, &sks->sense_key_spec, len); desc += sizeof(*sks); space -= sizeof(*sks); break; } case SSD_ELEM_COMMAND: { struct scsi_sense_command *cmd = (void *)desc; if (len > sizeof(cmd->command_info)) break; if (space < sizeof(*cmd)) { sense->flags |= SSDD_SDAT_OVFL; break; } cmd->desc_type = SSD_DESC_COMMAND; cmd->length = sizeof(*cmd) - (offsetof(struct scsi_sense_command, length) + 1); bcopy(data, &cmd->command_info[ sizeof(cmd->command_info) - len], len); desc += sizeof(*cmd); space -= sizeof(*cmd); break; } case SSD_ELEM_INFO: { struct scsi_sense_info *info = (void *)desc; if (len > sizeof(info->info)) break; if (space < sizeof(*info)) { sense->flags |= SSDD_SDAT_OVFL; break; } info->desc_type = SSD_DESC_INFO; info->length = sizeof(*info) - (offsetof(struct scsi_sense_info, length) + 1); info->byte2 = SSD_INFO_VALID; bcopy(data, &info->info[sizeof(info->info) - len], len); desc += sizeof(*info); space -= sizeof(*info); break; } case SSD_ELEM_FRU: { struct scsi_sense_fru *fru = (void *)desc; if (len > sizeof(fru->fru)) break; if (space < sizeof(*fru)) { sense->flags |= SSDD_SDAT_OVFL; break; } fru->desc_type = SSD_DESC_FRU; fru->length = sizeof(*fru) - (offsetof(struct scsi_sense_fru, length) + 1); fru->fru = *data; desc += sizeof(*fru); space -= sizeof(*fru); break; } case SSD_ELEM_STREAM: { struct scsi_sense_stream *stream = (void *)desc; if (len > sizeof(stream->byte3)) break; if (space < sizeof(*stream)) { sense->flags |= SSDD_SDAT_OVFL; break; } stream->desc_type = SSD_DESC_STREAM; stream->length = sizeof(*stream) - (offsetof(struct scsi_sense_stream, length) + 1); stream->byte3 = *data; desc += sizeof(*stream); space -= sizeof(*stream); break; } default: /* * We shouldn't get here, but if we do, do nothing. * We've already consumed the arguments above. */ break; } } sense->extra_len = desc - &sense->sense_desc[0]; *sense_len = offsetof(struct scsi_sense_data_desc, extra_len) + 1 + sense->extra_len; } /* * Fill in SCSI fixed sense data with the specified parameters. */ static void scsi_set_sense_data_fixed_va(struct scsi_sense_data *sense_data, u_int *sense_len, scsi_sense_data_type sense_format, int current_error, int sense_key, int asc, int ascq, va_list ap) { struct scsi_sense_data_fixed *sense; scsi_sense_elem_type elem_type; uint8_t *data; int len; memset(sense_data, 0, sizeof(*sense_data)); sense = (struct scsi_sense_data_fixed *)sense_data; if (current_error != 0) sense->error_code = SSD_CURRENT_ERROR; else sense->error_code = SSD_DEFERRED_ERROR; sense->flags = sense_key & SSD_KEY; sense->extra_len = 0; if (*sense_len >= 13) { sense->add_sense_code = asc; sense->extra_len = MAX(sense->extra_len, 5); } else sense->flags |= SSD_SDAT_OVFL; if (*sense_len >= 14) { sense->add_sense_code_qual = ascq; sense->extra_len = MAX(sense->extra_len, 6); } else sense->flags |= SSD_SDAT_OVFL; while ((elem_type = va_arg(ap, scsi_sense_elem_type)) != SSD_ELEM_NONE) { if (elem_type >= SSD_ELEM_MAX) { printf("%s: invalid sense type %d\n", __func__, elem_type); break; } len = va_arg(ap, int); data = va_arg(ap, uint8_t *); switch (elem_type) { case SSD_ELEM_SKIP: break; case SSD_ELEM_SKS: if (len > sizeof(sense->sense_key_spec)) break; if (*sense_len < 18) { sense->flags |= SSD_SDAT_OVFL; break; } bcopy(data, &sense->sense_key_spec[0], len); sense->extra_len = MAX(sense->extra_len, 10); break; case SSD_ELEM_COMMAND: if (*sense_len < 12) { sense->flags |= SSD_SDAT_OVFL; break; } if (len > sizeof(sense->cmd_spec_info)) { data += len - sizeof(sense->cmd_spec_info); len = sizeof(sense->cmd_spec_info); } bcopy(data, &sense->cmd_spec_info[ sizeof(sense->cmd_spec_info) - len], len); sense->extra_len = MAX(sense->extra_len, 4); break; case SSD_ELEM_INFO: /* Set VALID bit only if no overflow. */ sense->error_code |= SSD_ERRCODE_VALID; while (len > sizeof(sense->info)) { if (data[0] != 0) sense->error_code &= ~SSD_ERRCODE_VALID; data ++; len --; } bcopy(data, &sense->info[sizeof(sense->info) - len], len); break; case SSD_ELEM_FRU: if (*sense_len < 15) { sense->flags |= SSD_SDAT_OVFL; break; } sense->fru = *data; sense->extra_len = MAX(sense->extra_len, 7); break; case SSD_ELEM_STREAM: sense->flags |= *data & (SSD_ILI | SSD_EOM | SSD_FILEMARK); break; default: /* * We can't handle that in fixed format. Skip it. */ break; } } *sense_len = offsetof(struct scsi_sense_data_fixed, extra_len) + 1 + sense->extra_len; } /* * Fill in SCSI sense data with the specified parameters. This routine can * fill in either fixed or descriptor type sense data. */ void scsi_set_sense_data_va(struct scsi_sense_data *sense_data, u_int *sense_len, scsi_sense_data_type sense_format, int current_error, int sense_key, int asc, int ascq, va_list ap) { if (*sense_len > SSD_FULL_SIZE) *sense_len = SSD_FULL_SIZE; if (sense_format == SSD_TYPE_DESC) scsi_set_sense_data_desc_va(sense_data, sense_len, sense_format, current_error, sense_key, asc, ascq, ap); else scsi_set_sense_data_fixed_va(sense_data, sense_len, sense_format, current_error, sense_key, asc, ascq, ap); } void scsi_set_sense_data(struct scsi_sense_data *sense_data, scsi_sense_data_type sense_format, int current_error, int sense_key, int asc, int ascq, ...) { va_list ap; u_int sense_len = SSD_FULL_SIZE; va_start(ap, ascq); scsi_set_sense_data_va(sense_data, &sense_len, sense_format, current_error, sense_key, asc, ascq, ap); va_end(ap); } void scsi_set_sense_data_len(struct scsi_sense_data *sense_data, u_int *sense_len, scsi_sense_data_type sense_format, int current_error, int sense_key, int asc, int ascq, ...) { va_list ap; va_start(ap, ascq); scsi_set_sense_data_va(sense_data, sense_len, sense_format, current_error, sense_key, asc, ascq, ap); va_end(ap); } /* * Get sense information for three similar sense data types. */ int scsi_get_sense_info(struct scsi_sense_data *sense_data, u_int sense_len, uint8_t info_type, uint64_t *info, int64_t *signed_info) { scsi_sense_data_type sense_type; if (sense_len == 0) goto bailout; sense_type = scsi_sense_type(sense_data); switch (sense_type) { case SSD_TYPE_DESC: { struct scsi_sense_data_desc *sense; uint8_t *desc; sense = (struct scsi_sense_data_desc *)sense_data; desc = scsi_find_desc(sense, sense_len, info_type); if (desc == NULL) goto bailout; switch (info_type) { case SSD_DESC_INFO: { struct scsi_sense_info *info_desc; info_desc = (struct scsi_sense_info *)desc; if ((info_desc->byte2 & SSD_INFO_VALID) == 0) goto bailout; *info = scsi_8btou64(info_desc->info); if (signed_info != NULL) *signed_info = *info; break; } case SSD_DESC_COMMAND: { struct scsi_sense_command *cmd_desc; cmd_desc = (struct scsi_sense_command *)desc; *info = scsi_8btou64(cmd_desc->command_info); if (signed_info != NULL) *signed_info = *info; break; } case SSD_DESC_FRU: { struct scsi_sense_fru *fru_desc; fru_desc = (struct scsi_sense_fru *)desc; if (fru_desc->fru == 0) goto bailout; *info = fru_desc->fru; if (signed_info != NULL) *signed_info = (int8_t)fru_desc->fru; break; } default: goto bailout; break; } break; } case SSD_TYPE_FIXED: { struct scsi_sense_data_fixed *sense; sense = (struct scsi_sense_data_fixed *)sense_data; switch (info_type) { case SSD_DESC_INFO: { uint32_t info_val; if ((sense->error_code & SSD_ERRCODE_VALID) == 0) goto bailout; if (SSD_FIXED_IS_PRESENT(sense, sense_len, info) == 0) goto bailout; info_val = scsi_4btoul(sense->info); *info = info_val; if (signed_info != NULL) *signed_info = (int32_t)info_val; break; } case SSD_DESC_COMMAND: { uint32_t cmd_val; if ((SSD_FIXED_IS_PRESENT(sense, sense_len, cmd_spec_info) == 0) || (SSD_FIXED_IS_FILLED(sense, cmd_spec_info) == 0)) goto bailout; cmd_val = scsi_4btoul(sense->cmd_spec_info); if (cmd_val == 0) goto bailout; *info = cmd_val; if (signed_info != NULL) *signed_info = (int32_t)cmd_val; break; } case SSD_DESC_FRU: if ((SSD_FIXED_IS_PRESENT(sense, sense_len, fru) == 0) || (SSD_FIXED_IS_FILLED(sense, fru) == 0)) goto bailout; if (sense->fru == 0) goto bailout; *info = sense->fru; if (signed_info != NULL) *signed_info = (int8_t)sense->fru; break; default: goto bailout; break; } break; } default: goto bailout; break; } return (0); bailout: return (1); } int scsi_get_sks(struct scsi_sense_data *sense_data, u_int sense_len, uint8_t *sks) { scsi_sense_data_type sense_type; if (sense_len == 0) goto bailout; sense_type = scsi_sense_type(sense_data); switch (sense_type) { case SSD_TYPE_DESC: { struct scsi_sense_data_desc *sense; struct scsi_sense_sks *desc; sense = (struct scsi_sense_data_desc *)sense_data; desc = (struct scsi_sense_sks *)scsi_find_desc(sense, sense_len, SSD_DESC_SKS); if (desc == NULL) goto bailout; if ((desc->sense_key_spec[0] & SSD_SKS_VALID) == 0) goto bailout; bcopy(desc->sense_key_spec, sks, sizeof(desc->sense_key_spec)); break; } case SSD_TYPE_FIXED: { struct scsi_sense_data_fixed *sense; sense = (struct scsi_sense_data_fixed *)sense_data; if ((SSD_FIXED_IS_PRESENT(sense, sense_len, sense_key_spec)== 0) || (SSD_FIXED_IS_FILLED(sense, sense_key_spec) == 0)) goto bailout; if ((sense->sense_key_spec[0] & SSD_SCS_VALID) == 0) goto bailout; bcopy(sense->sense_key_spec, sks,sizeof(sense->sense_key_spec)); break; } default: goto bailout; break; } return (0); bailout: return (1); } /* * Provide a common interface for fixed and descriptor sense to detect * whether we have block-specific sense information. It is clear by the * presence of the block descriptor in descriptor mode, but we have to * infer from the inquiry data and ILI bit in fixed mode. */ int scsi_get_block_info(struct scsi_sense_data *sense_data, u_int sense_len, struct scsi_inquiry_data *inq_data, uint8_t *block_bits) { scsi_sense_data_type sense_type; if (inq_data != NULL) { switch (SID_TYPE(inq_data)) { case T_DIRECT: case T_RBC: case T_ZBC_HM: break; default: goto bailout; break; } } sense_type = scsi_sense_type(sense_data); switch (sense_type) { case SSD_TYPE_DESC: { struct scsi_sense_data_desc *sense; struct scsi_sense_block *block; sense = (struct scsi_sense_data_desc *)sense_data; block = (struct scsi_sense_block *)scsi_find_desc(sense, sense_len, SSD_DESC_BLOCK); if (block == NULL) goto bailout; *block_bits = block->byte3; break; } case SSD_TYPE_FIXED: { struct scsi_sense_data_fixed *sense; sense = (struct scsi_sense_data_fixed *)sense_data; if (SSD_FIXED_IS_PRESENT(sense, sense_len, flags) == 0) goto bailout; *block_bits = sense->flags & SSD_ILI; break; } default: goto bailout; break; } return (0); bailout: return (1); } int scsi_get_stream_info(struct scsi_sense_data *sense_data, u_int sense_len, struct scsi_inquiry_data *inq_data, uint8_t *stream_bits) { scsi_sense_data_type sense_type; if (inq_data != NULL) { switch (SID_TYPE(inq_data)) { case T_SEQUENTIAL: break; default: goto bailout; break; } } sense_type = scsi_sense_type(sense_data); switch (sense_type) { case SSD_TYPE_DESC: { struct scsi_sense_data_desc *sense; struct scsi_sense_stream *stream; sense = (struct scsi_sense_data_desc *)sense_data; stream = (struct scsi_sense_stream *)scsi_find_desc(sense, sense_len, SSD_DESC_STREAM); if (stream == NULL) goto bailout; *stream_bits = stream->byte3; break; } case SSD_TYPE_FIXED: { struct scsi_sense_data_fixed *sense; sense = (struct scsi_sense_data_fixed *)sense_data; if (SSD_FIXED_IS_PRESENT(sense, sense_len, flags) == 0) goto bailout; *stream_bits = sense->flags & (SSD_ILI|SSD_EOM|SSD_FILEMARK); break; } default: goto bailout; break; } return (0); bailout: return (1); } void scsi_info_sbuf(struct sbuf *sb, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, uint64_t info) { sbuf_printf(sb, "Info: %#jx", info); } void scsi_command_sbuf(struct sbuf *sb, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, uint64_t csi) { sbuf_printf(sb, "Command Specific Info: %#jx", csi); } void scsi_progress_sbuf(struct sbuf *sb, uint16_t progress) { sbuf_printf(sb, "Progress: %d%% (%d/%d) complete", (progress * 100) / SSD_SKS_PROGRESS_DENOM, progress, SSD_SKS_PROGRESS_DENOM); } /* * Returns 1 for failure (i.e. SKS isn't valid) and 0 for success. */ int scsi_sks_sbuf(struct sbuf *sb, int sense_key, uint8_t *sks) { switch (sense_key) { case SSD_KEY_ILLEGAL_REQUEST: { struct scsi_sense_sks_field *field; int bad_command; char tmpstr[40]; /*Field Pointer*/ field = (struct scsi_sense_sks_field *)sks; if (field->byte0 & SSD_SKS_FIELD_CMD) bad_command = 1; else bad_command = 0; tmpstr[0] = '\0'; /* Bit pointer is valid */ if (field->byte0 & SSD_SKS_BPV) snprintf(tmpstr, sizeof(tmpstr), "bit %d ", field->byte0 & SSD_SKS_BIT_VALUE); sbuf_printf(sb, "%s byte %d %sis invalid", bad_command ? "Command" : "Data", scsi_2btoul(field->field), tmpstr); break; } case SSD_KEY_UNIT_ATTENTION: { struct scsi_sense_sks_overflow *overflow; overflow = (struct scsi_sense_sks_overflow *)sks; /*UA Condition Queue Overflow*/ sbuf_printf(sb, "Unit Attention Condition Queue %s", (overflow->byte0 & SSD_SKS_OVERFLOW_SET) ? "Overflowed" : "Did Not Overflow??"); break; } case SSD_KEY_RECOVERED_ERROR: case SSD_KEY_HARDWARE_ERROR: case SSD_KEY_MEDIUM_ERROR: { struct scsi_sense_sks_retry *retry; /*Actual Retry Count*/ retry = (struct scsi_sense_sks_retry *)sks; sbuf_printf(sb, "Actual Retry Count: %d", scsi_2btoul(retry->actual_retry_count)); break; } case SSD_KEY_NO_SENSE: case SSD_KEY_NOT_READY: { struct scsi_sense_sks_progress *progress; int progress_val; /*Progress Indication*/ progress = (struct scsi_sense_sks_progress *)sks; progress_val = scsi_2btoul(progress->progress); scsi_progress_sbuf(sb, progress_val); break; } case SSD_KEY_COPY_ABORTED: { struct scsi_sense_sks_segment *segment; char tmpstr[40]; /*Segment Pointer*/ segment = (struct scsi_sense_sks_segment *)sks; tmpstr[0] = '\0'; if (segment->byte0 & SSD_SKS_SEGMENT_BPV) snprintf(tmpstr, sizeof(tmpstr), "bit %d ", segment->byte0 & SSD_SKS_SEGMENT_BITPTR); sbuf_printf(sb, "%s byte %d %sis invalid", (segment->byte0 & SSD_SKS_SEGMENT_SD) ? "Segment" : "Data", scsi_2btoul(segment->field), tmpstr); break; } default: sbuf_printf(sb, "Sense Key Specific: %#x,%#x", sks[0], scsi_2btoul(&sks[1])); break; } return (0); } void scsi_fru_sbuf(struct sbuf *sb, uint64_t fru) { sbuf_printf(sb, "Field Replaceable Unit: %d", (int)fru); } void scsi_stream_sbuf(struct sbuf *sb, uint8_t stream_bits) { int need_comma; need_comma = 0; /* * XXX KDM this needs more descriptive decoding. */ sbuf_printf(sb, "Stream Command Sense Data: "); if (stream_bits & SSD_DESC_STREAM_FM) { sbuf_printf(sb, "Filemark"); need_comma = 1; } if (stream_bits & SSD_DESC_STREAM_EOM) { sbuf_printf(sb, "%sEOM", (need_comma) ? "," : ""); need_comma = 1; } if (stream_bits & SSD_DESC_STREAM_ILI) sbuf_printf(sb, "%sILI", (need_comma) ? "," : ""); } void scsi_block_sbuf(struct sbuf *sb, uint8_t block_bits) { sbuf_printf(sb, "Block Command Sense Data: "); if (block_bits & SSD_DESC_BLOCK_ILI) sbuf_printf(sb, "ILI"); } void scsi_sense_info_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { struct scsi_sense_info *info; info = (struct scsi_sense_info *)header; if ((info->byte2 & SSD_INFO_VALID) == 0) return; scsi_info_sbuf(sb, cdb, cdb_len, inq_data, scsi_8btou64(info->info)); } void scsi_sense_command_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { struct scsi_sense_command *command; command = (struct scsi_sense_command *)header; scsi_command_sbuf(sb, cdb, cdb_len, inq_data, scsi_8btou64(command->command_info)); } void scsi_sense_sks_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { struct scsi_sense_sks *sks; int error_code, sense_key, asc, ascq; sks = (struct scsi_sense_sks *)header; if ((sks->sense_key_spec[0] & SSD_SKS_VALID) == 0) return; scsi_extract_sense_len(sense, sense_len, &error_code, &sense_key, &asc, &ascq, /*show_errors*/ 1); scsi_sks_sbuf(sb, sense_key, sks->sense_key_spec); } void scsi_sense_fru_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { struct scsi_sense_fru *fru; fru = (struct scsi_sense_fru *)header; if (fru->fru == 0) return; scsi_fru_sbuf(sb, (uint64_t)fru->fru); } void scsi_sense_stream_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { struct scsi_sense_stream *stream; stream = (struct scsi_sense_stream *)header; scsi_stream_sbuf(sb, stream->byte3); } void scsi_sense_block_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { struct scsi_sense_block *block; block = (struct scsi_sense_block *)header; scsi_block_sbuf(sb, block->byte3); } void scsi_sense_progress_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { struct scsi_sense_progress *progress; const char *sense_key_desc; const char *asc_desc; int progress_val; progress = (struct scsi_sense_progress *)header; /* * Get descriptions for the sense key, ASC, and ASCQ in the * progress descriptor. These could be different than the values * in the overall sense data. */ scsi_sense_desc(progress->sense_key, progress->add_sense_code, progress->add_sense_code_qual, inq_data, &sense_key_desc, &asc_desc); progress_val = scsi_2btoul(progress->progress); /* * The progress indicator is for the operation described by the * sense key, ASC, and ASCQ in the descriptor. */ sbuf_cat(sb, sense_key_desc); sbuf_printf(sb, " asc:%x,%x (%s): ", progress->add_sense_code, progress->add_sense_code_qual, asc_desc); scsi_progress_sbuf(sb, progress_val); } void scsi_sense_ata_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { struct scsi_sense_ata_ret_desc *res; res = (struct scsi_sense_ata_ret_desc *)header; sbuf_printf(sb, "ATA status: %02x (%s%s%s%s%s%s%s%s), ", res->status, (res->status & 0x80) ? "BSY " : "", (res->status & 0x40) ? "DRDY " : "", (res->status & 0x20) ? "DF " : "", (res->status & 0x10) ? "SERV " : "", (res->status & 0x08) ? "DRQ " : "", (res->status & 0x04) ? "CORR " : "", (res->status & 0x02) ? "IDX " : "", (res->status & 0x01) ? "ERR" : ""); if (res->status & 1) { sbuf_printf(sb, "error: %02x (%s%s%s%s%s%s%s%s), ", res->error, (res->error & 0x80) ? "ICRC " : "", (res->error & 0x40) ? "UNC " : "", (res->error & 0x20) ? "MC " : "", (res->error & 0x10) ? "IDNF " : "", (res->error & 0x08) ? "MCR " : "", (res->error & 0x04) ? "ABRT " : "", (res->error & 0x02) ? "NM " : "", (res->error & 0x01) ? "ILI" : ""); } if (res->flags & SSD_DESC_ATA_FLAG_EXTEND) { sbuf_printf(sb, "count: %02x%02x, ", res->count_15_8, res->count_7_0); sbuf_printf(sb, "LBA: %02x%02x%02x%02x%02x%02x, ", res->lba_47_40, res->lba_39_32, res->lba_31_24, res->lba_23_16, res->lba_15_8, res->lba_7_0); } else { sbuf_printf(sb, "count: %02x, ", res->count_7_0); sbuf_printf(sb, "LBA: %02x%02x%02x, ", res->lba_23_16, res->lba_15_8, res->lba_7_0); } sbuf_printf(sb, "device: %02x, ", res->device); } void scsi_sense_forwarded_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { struct scsi_sense_forwarded *forwarded; const char *sense_key_desc; const char *asc_desc; int error_code, sense_key, asc, ascq; forwarded = (struct scsi_sense_forwarded *)header; scsi_extract_sense_len((struct scsi_sense_data *)forwarded->sense_data, forwarded->length - 2, &error_code, &sense_key, &asc, &ascq, 1); scsi_sense_desc(sense_key, asc, ascq, NULL, &sense_key_desc, &asc_desc); sbuf_printf(sb, "Forwarded sense: %s asc:%x,%x (%s): ", sense_key_desc, asc, ascq, asc_desc); } /* * Generic sense descriptor printing routine. This is used when we have * not yet implemented a specific printing routine for this descriptor. */ void scsi_sense_generic_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { int i; uint8_t *buf_ptr; sbuf_printf(sb, "Descriptor %#x:", header->desc_type); buf_ptr = (uint8_t *)&header[1]; for (i = 0; i < header->length; i++, buf_ptr++) sbuf_printf(sb, " %02x", *buf_ptr); } /* * Keep this list in numeric order. This speeds the array traversal. */ struct scsi_sense_desc_printer { uint8_t desc_type; /* * The function arguments here are the superset of what is needed * to print out various different descriptors. Command and * information descriptors need inquiry data and command type. * Sense key specific descriptors need the sense key. * * The sense, cdb, and inquiry data arguments may be NULL, but the * information printed may not be fully decoded as a result. */ void (*print_func)(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); } scsi_sense_printers[] = { {SSD_DESC_INFO, scsi_sense_info_sbuf}, {SSD_DESC_COMMAND, scsi_sense_command_sbuf}, {SSD_DESC_SKS, scsi_sense_sks_sbuf}, {SSD_DESC_FRU, scsi_sense_fru_sbuf}, {SSD_DESC_STREAM, scsi_sense_stream_sbuf}, {SSD_DESC_BLOCK, scsi_sense_block_sbuf}, {SSD_DESC_ATA, scsi_sense_ata_sbuf}, {SSD_DESC_PROGRESS, scsi_sense_progress_sbuf}, {SSD_DESC_FORWARDED, scsi_sense_forwarded_sbuf} }; void scsi_sense_desc_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header) { u_int i; for (i = 0; i < nitems(scsi_sense_printers); i++) { struct scsi_sense_desc_printer *printer; printer = &scsi_sense_printers[i]; /* * The list is sorted, so quit if we've passed our * descriptor number. */ if (printer->desc_type > header->desc_type) break; if (printer->desc_type != header->desc_type) continue; printer->print_func(sb, sense, sense_len, cdb, cdb_len, inq_data, header); return; } /* * No specific printing routine, so use the generic routine. */ scsi_sense_generic_sbuf(sb, sense, sense_len, cdb, cdb_len, inq_data, header); } scsi_sense_data_type scsi_sense_type(struct scsi_sense_data *sense_data) { switch (sense_data->error_code & SSD_ERRCODE) { case SSD_DESC_CURRENT_ERROR: case SSD_DESC_DEFERRED_ERROR: return (SSD_TYPE_DESC); break; case SSD_CURRENT_ERROR: case SSD_DEFERRED_ERROR: return (SSD_TYPE_FIXED); break; default: break; } return (SSD_TYPE_NONE); } struct scsi_print_sense_info { struct sbuf *sb; char *path_str; uint8_t *cdb; int cdb_len; struct scsi_inquiry_data *inq_data; }; static int scsi_print_desc_func(struct scsi_sense_data_desc *sense, u_int sense_len, struct scsi_sense_desc_header *header, void *arg) { struct scsi_print_sense_info *print_info; print_info = (struct scsi_print_sense_info *)arg; switch (header->desc_type) { case SSD_DESC_INFO: case SSD_DESC_FRU: case SSD_DESC_COMMAND: case SSD_DESC_SKS: case SSD_DESC_BLOCK: case SSD_DESC_STREAM: /* * We have already printed these descriptors, if they are * present. */ break; default: { sbuf_printf(print_info->sb, "%s", print_info->path_str); scsi_sense_desc_sbuf(print_info->sb, (struct scsi_sense_data *)sense, sense_len, print_info->cdb, print_info->cdb_len, print_info->inq_data, header); sbuf_printf(print_info->sb, "\n"); break; } } /* * Tell the iterator that we want to see more descriptors if they * are present. */ return (0); } void scsi_sense_only_sbuf(struct scsi_sense_data *sense, u_int sense_len, struct sbuf *sb, char *path_str, struct scsi_inquiry_data *inq_data, uint8_t *cdb, int cdb_len) { int error_code, sense_key, asc, ascq; sbuf_cat(sb, path_str); scsi_extract_sense_len(sense, sense_len, &error_code, &sense_key, &asc, &ascq, /*show_errors*/ 1); sbuf_printf(sb, "SCSI sense: "); switch (error_code) { case SSD_DEFERRED_ERROR: case SSD_DESC_DEFERRED_ERROR: sbuf_printf(sb, "Deferred error: "); /* FALLTHROUGH */ case SSD_CURRENT_ERROR: case SSD_DESC_CURRENT_ERROR: { struct scsi_sense_data_desc *desc_sense; struct scsi_print_sense_info print_info; const char *sense_key_desc; const char *asc_desc; uint8_t sks[3]; uint64_t val; uint8_t bits; /* * Get descriptions for the sense key, ASC, and ASCQ. If * these aren't present in the sense data (i.e. the sense * data isn't long enough), the -1 values that * scsi_extract_sense_len() returns will yield default * or error descriptions. */ scsi_sense_desc(sense_key, asc, ascq, inq_data, &sense_key_desc, &asc_desc); /* * We first print the sense key and ASC/ASCQ. */ sbuf_cat(sb, sense_key_desc); sbuf_printf(sb, " asc:%x,%x (%s)\n", asc, ascq, asc_desc); /* * Print any block or stream device-specific information. */ if (scsi_get_block_info(sense, sense_len, inq_data, &bits) == 0 && bits != 0) { sbuf_cat(sb, path_str); scsi_block_sbuf(sb, bits); sbuf_printf(sb, "\n"); } else if (scsi_get_stream_info(sense, sense_len, inq_data, &bits) == 0 && bits != 0) { sbuf_cat(sb, path_str); scsi_stream_sbuf(sb, bits); sbuf_printf(sb, "\n"); } /* * Print the info field. */ if (scsi_get_sense_info(sense, sense_len, SSD_DESC_INFO, &val, NULL) == 0) { sbuf_cat(sb, path_str); scsi_info_sbuf(sb, cdb, cdb_len, inq_data, val); sbuf_printf(sb, "\n"); } /* * Print the FRU. */ if (scsi_get_sense_info(sense, sense_len, SSD_DESC_FRU, &val, NULL) == 0) { sbuf_cat(sb, path_str); scsi_fru_sbuf(sb, val); sbuf_printf(sb, "\n"); } /* * Print any command-specific information. */ if (scsi_get_sense_info(sense, sense_len, SSD_DESC_COMMAND, &val, NULL) == 0) { sbuf_cat(sb, path_str); scsi_command_sbuf(sb, cdb, cdb_len, inq_data, val); sbuf_printf(sb, "\n"); } /* * Print out any sense-key-specific information. */ if (scsi_get_sks(sense, sense_len, sks) == 0) { sbuf_cat(sb, path_str); scsi_sks_sbuf(sb, sense_key, sks); sbuf_printf(sb, "\n"); } /* * If this is fixed sense, we're done. If we have * descriptor sense, we might have more information * available. */ if (scsi_sense_type(sense) != SSD_TYPE_DESC) break; desc_sense = (struct scsi_sense_data_desc *)sense; print_info.sb = sb; print_info.path_str = path_str; print_info.cdb = cdb; print_info.cdb_len = cdb_len; print_info.inq_data = inq_data; /* * Print any sense descriptors that we have not already printed. */ scsi_desc_iterate(desc_sense, sense_len, scsi_print_desc_func, &print_info); break; } case -1: /* * scsi_extract_sense_len() sets values to -1 if the * show_errors flag is set and they aren't present in the * sense data. This means that sense_len is 0. */ sbuf_printf(sb, "No sense data present\n"); break; default: { sbuf_printf(sb, "Error code 0x%x", error_code); if (sense->error_code & SSD_ERRCODE_VALID) { struct scsi_sense_data_fixed *fixed_sense; fixed_sense = (struct scsi_sense_data_fixed *)sense; if (SSD_FIXED_IS_PRESENT(fixed_sense, sense_len, info)){ uint32_t info; info = scsi_4btoul(fixed_sense->info); sbuf_printf(sb, " at block no. %d (decimal)", info); } } sbuf_printf(sb, "\n"); break; } } } /* * scsi_sense_sbuf() returns 0 for success and -1 for failure. */ #ifdef _KERNEL int scsi_sense_sbuf(struct ccb_scsiio *csio, struct sbuf *sb, scsi_sense_string_flags flags) #else /* !_KERNEL */ int scsi_sense_sbuf(struct cam_device *device, struct ccb_scsiio *csio, struct sbuf *sb, scsi_sense_string_flags flags) #endif /* _KERNEL/!_KERNEL */ { struct scsi_sense_data *sense; struct scsi_inquiry_data *inq_data; #ifdef _KERNEL struct ccb_getdev *cgd; #endif /* _KERNEL */ char path_str[64]; #ifndef _KERNEL if (device == NULL) return(-1); #endif /* !_KERNEL */ if ((csio == NULL) || (sb == NULL)) return(-1); /* * If the CDB is a physical address, we can't deal with it.. */ if ((csio->ccb_h.flags & CAM_CDB_PHYS) != 0) flags &= ~SSS_FLAG_PRINT_COMMAND; #ifdef _KERNEL xpt_path_string(csio->ccb_h.path, path_str, sizeof(path_str)); #else /* !_KERNEL */ cam_path_string(device, path_str, sizeof(path_str)); #endif /* _KERNEL/!_KERNEL */ #ifdef _KERNEL if ((cgd = (struct ccb_getdev*)xpt_alloc_ccb_nowait()) == NULL) return(-1); /* * Get the device information. */ xpt_setup_ccb(&cgd->ccb_h, csio->ccb_h.path, CAM_PRIORITY_NORMAL); cgd->ccb_h.func_code = XPT_GDEV_TYPE; xpt_action((union ccb *)cgd); /* * If the device is unconfigured, just pretend that it is a hard * drive. scsi_op_desc() needs this. */ if (cgd->ccb_h.status == CAM_DEV_NOT_THERE) cgd->inq_data.device = T_DIRECT; inq_data = &cgd->inq_data; #else /* !_KERNEL */ inq_data = &device->inq_data; #endif /* _KERNEL/!_KERNEL */ sense = NULL; if (flags & SSS_FLAG_PRINT_COMMAND) { sbuf_cat(sb, path_str); #ifdef _KERNEL scsi_command_string(csio, sb); #else /* !_KERNEL */ scsi_command_string(device, csio, sb); #endif /* _KERNEL/!_KERNEL */ sbuf_printf(sb, "\n"); } /* * If the sense data is a physical pointer, forget it. */ if (csio->ccb_h.flags & CAM_SENSE_PTR) { if (csio->ccb_h.flags & CAM_SENSE_PHYS) { #ifdef _KERNEL xpt_free_ccb((union ccb*)cgd); #endif /* _KERNEL/!_KERNEL */ return(-1); } else { /* * bcopy the pointer to avoid unaligned access * errors on finicky architectures. We don't * ensure that the sense data is pointer aligned. */ bcopy((struct scsi_sense_data **)&csio->sense_data, &sense, sizeof(struct scsi_sense_data *)); } } else { /* * If the physical sense flag is set, but the sense pointer * is not also set, we assume that the user is an idiot and * return. (Well, okay, it could be that somehow, the * entire csio is physical, but we would have probably core * dumped on one of the bogus pointer deferences above * already.) */ if (csio->ccb_h.flags & CAM_SENSE_PHYS) { #ifdef _KERNEL xpt_free_ccb((union ccb*)cgd); #endif /* _KERNEL/!_KERNEL */ return(-1); } else sense = &csio->sense_data; } scsi_sense_only_sbuf(sense, csio->sense_len - csio->sense_resid, sb, path_str, inq_data, scsiio_cdb_ptr(csio), csio->cdb_len); #ifdef _KERNEL xpt_free_ccb((union ccb*)cgd); #endif /* _KERNEL/!_KERNEL */ return(0); } #ifdef _KERNEL char * scsi_sense_string(struct ccb_scsiio *csio, char *str, int str_len) #else /* !_KERNEL */ char * scsi_sense_string(struct cam_device *device, struct ccb_scsiio *csio, char *str, int str_len) #endif /* _KERNEL/!_KERNEL */ { struct sbuf sb; sbuf_new(&sb, str, str_len, 0); #ifdef _KERNEL scsi_sense_sbuf(csio, &sb, SSS_FLAG_PRINT_COMMAND); #else /* !_KERNEL */ scsi_sense_sbuf(device, csio, &sb, SSS_FLAG_PRINT_COMMAND); #endif /* _KERNEL/!_KERNEL */ sbuf_finish(&sb); return(sbuf_data(&sb)); } #ifdef _KERNEL void scsi_sense_print(struct ccb_scsiio *csio) { struct sbuf sb; char str[512]; sbuf_new(&sb, str, sizeof(str), 0); scsi_sense_sbuf(csio, &sb, SSS_FLAG_PRINT_COMMAND); sbuf_finish(&sb); sbuf_putbuf(&sb); } #else /* !_KERNEL */ void scsi_sense_print(struct cam_device *device, struct ccb_scsiio *csio, FILE *ofile) { struct sbuf sb; char str[512]; if ((device == NULL) || (csio == NULL) || (ofile == NULL)) return; sbuf_new(&sb, str, sizeof(str), 0); scsi_sense_sbuf(device, csio, &sb, SSS_FLAG_PRINT_COMMAND); sbuf_finish(&sb); fprintf(ofile, "%s", sbuf_data(&sb)); } #endif /* _KERNEL/!_KERNEL */ /* * Extract basic sense information. This is backward-compatible with the * previous implementation. For new implementations, * scsi_extract_sense_len() is recommended. */ void scsi_extract_sense(struct scsi_sense_data *sense_data, int *error_code, int *sense_key, int *asc, int *ascq) { scsi_extract_sense_len(sense_data, sizeof(*sense_data), error_code, sense_key, asc, ascq, /*show_errors*/ 0); } /* * Extract basic sense information from SCSI I/O CCB structure. */ int scsi_extract_sense_ccb(union ccb *ccb, int *error_code, int *sense_key, int *asc, int *ascq) { struct scsi_sense_data *sense_data; /* Make sure there are some sense data we can access. */ if (ccb->ccb_h.func_code != XPT_SCSI_IO || (ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_SCSI_STATUS_ERROR || (ccb->csio.scsi_status != SCSI_STATUS_CHECK_COND) || (ccb->ccb_h.status & CAM_AUTOSNS_VALID) == 0 || (ccb->ccb_h.flags & CAM_SENSE_PHYS)) return (0); if (ccb->ccb_h.flags & CAM_SENSE_PTR) bcopy((struct scsi_sense_data **)&ccb->csio.sense_data, &sense_data, sizeof(struct scsi_sense_data *)); else sense_data = &ccb->csio.sense_data; scsi_extract_sense_len(sense_data, ccb->csio.sense_len - ccb->csio.sense_resid, error_code, sense_key, asc, ascq, 1); if (*error_code == -1) return (0); return (1); } /* * Extract basic sense information. If show_errors is set, sense values * will be set to -1 if they are not present. */ void scsi_extract_sense_len(struct scsi_sense_data *sense_data, u_int sense_len, int *error_code, int *sense_key, int *asc, int *ascq, int show_errors) { /* * If we have no length, we have no sense. */ if (sense_len == 0) { if (show_errors == 0) { *error_code = 0; *sense_key = 0; *asc = 0; *ascq = 0; } else { *error_code = -1; *sense_key = -1; *asc = -1; *ascq = -1; } return; } *error_code = sense_data->error_code & SSD_ERRCODE; switch (*error_code) { case SSD_DESC_CURRENT_ERROR: case SSD_DESC_DEFERRED_ERROR: { struct scsi_sense_data_desc *sense; sense = (struct scsi_sense_data_desc *)sense_data; if (SSD_DESC_IS_PRESENT(sense, sense_len, sense_key)) *sense_key = sense->sense_key & SSD_KEY; else *sense_key = (show_errors) ? -1 : 0; if (SSD_DESC_IS_PRESENT(sense, sense_len, add_sense_code)) *asc = sense->add_sense_code; else *asc = (show_errors) ? -1 : 0; if (SSD_DESC_IS_PRESENT(sense, sense_len, add_sense_code_qual)) *ascq = sense->add_sense_code_qual; else *ascq = (show_errors) ? -1 : 0; break; } case SSD_CURRENT_ERROR: case SSD_DEFERRED_ERROR: default: { struct scsi_sense_data_fixed *sense; sense = (struct scsi_sense_data_fixed *)sense_data; if (SSD_FIXED_IS_PRESENT(sense, sense_len, flags)) *sense_key = sense->flags & SSD_KEY; else *sense_key = (show_errors) ? -1 : 0; if ((SSD_FIXED_IS_PRESENT(sense, sense_len, add_sense_code)) && (SSD_FIXED_IS_FILLED(sense, add_sense_code))) *asc = sense->add_sense_code; else *asc = (show_errors) ? -1 : 0; if ((SSD_FIXED_IS_PRESENT(sense, sense_len,add_sense_code_qual)) && (SSD_FIXED_IS_FILLED(sense, add_sense_code_qual))) *ascq = sense->add_sense_code_qual; else *ascq = (show_errors) ? -1 : 0; break; } } } int scsi_get_sense_key(struct scsi_sense_data *sense_data, u_int sense_len, int show_errors) { int error_code, sense_key, asc, ascq; scsi_extract_sense_len(sense_data, sense_len, &error_code, &sense_key, &asc, &ascq, show_errors); return (sense_key); } int scsi_get_asc(struct scsi_sense_data *sense_data, u_int sense_len, int show_errors) { int error_code, sense_key, asc, ascq; scsi_extract_sense_len(sense_data, sense_len, &error_code, &sense_key, &asc, &ascq, show_errors); return (asc); } int scsi_get_ascq(struct scsi_sense_data *sense_data, u_int sense_len, int show_errors) { int error_code, sense_key, asc, ascq; scsi_extract_sense_len(sense_data, sense_len, &error_code, &sense_key, &asc, &ascq, show_errors); return (ascq); } /* * This function currently requires at least 36 bytes, or * SHORT_INQUIRY_LENGTH, worth of data to function properly. If this * function needs more or less data in the future, another length should be * defined in scsi_all.h to indicate the minimum amount of data necessary * for this routine to function properly. */ void scsi_print_inquiry_sbuf(struct sbuf *sb, struct scsi_inquiry_data *inq_data) { u_int8_t type; char *dtype, *qtype; type = SID_TYPE(inq_data); /* * Figure out basic device type and qualifier. */ if (SID_QUAL_IS_VENDOR_UNIQUE(inq_data)) { qtype = " (vendor-unique qualifier)"; } else { switch (SID_QUAL(inq_data)) { case SID_QUAL_LU_CONNECTED: qtype = ""; break; case SID_QUAL_LU_OFFLINE: qtype = " (offline)"; break; case SID_QUAL_RSVD: qtype = " (reserved qualifier)"; break; default: case SID_QUAL_BAD_LU: qtype = " (LUN not supported)"; break; } } switch (type) { case T_DIRECT: dtype = "Direct Access"; break; case T_SEQUENTIAL: dtype = "Sequential Access"; break; case T_PRINTER: dtype = "Printer"; break; case T_PROCESSOR: dtype = "Processor"; break; case T_WORM: dtype = "WORM"; break; case T_CDROM: dtype = "CD-ROM"; break; case T_SCANNER: dtype = "Scanner"; break; case T_OPTICAL: dtype = "Optical"; break; case T_CHANGER: dtype = "Changer"; break; case T_COMM: dtype = "Communication"; break; case T_STORARRAY: dtype = "Storage Array"; break; case T_ENCLOSURE: dtype = "Enclosure Services"; break; case T_RBC: dtype = "Simplified Direct Access"; break; case T_OCRW: dtype = "Optical Card Read/Write"; break; case T_OSD: dtype = "Object-Based Storage"; break; case T_ADC: dtype = "Automation/Drive Interface"; break; case T_ZBC_HM: dtype = "Host Managed Zoned Block"; break; case T_NODEVICE: dtype = "Uninstalled"; break; default: dtype = "unknown"; break; } scsi_print_inquiry_short_sbuf(sb, inq_data); sbuf_printf(sb, "%s %s ", SID_IS_REMOVABLE(inq_data) ? "Removable" : "Fixed", dtype); if (SID_ANSI_REV(inq_data) == SCSI_REV_0) sbuf_printf(sb, "SCSI "); else if (SID_ANSI_REV(inq_data) <= SCSI_REV_SPC) { sbuf_printf(sb, "SCSI-%d ", SID_ANSI_REV(inq_data)); } else { sbuf_printf(sb, "SPC-%d SCSI ", SID_ANSI_REV(inq_data) - 2); } sbuf_printf(sb, "device%s\n", qtype); } void scsi_print_inquiry(struct scsi_inquiry_data *inq_data) { struct sbuf sb; char buffer[120]; sbuf_new(&sb, buffer, 120, SBUF_FIXEDLEN); scsi_print_inquiry_sbuf(&sb, inq_data); sbuf_finish(&sb); sbuf_putbuf(&sb); } void scsi_print_inquiry_short_sbuf(struct sbuf *sb, struct scsi_inquiry_data *inq_data) { sbuf_printf(sb, "<"); cam_strvis_sbuf(sb, inq_data->vendor, sizeof(inq_data->vendor), 0); sbuf_printf(sb, " "); cam_strvis_sbuf(sb, inq_data->product, sizeof(inq_data->product), 0); sbuf_printf(sb, " "); cam_strvis_sbuf(sb, inq_data->revision, sizeof(inq_data->revision), 0); sbuf_printf(sb, "> "); } void scsi_print_inquiry_short(struct scsi_inquiry_data *inq_data) { struct sbuf sb; char buffer[84]; sbuf_new(&sb, buffer, 84, SBUF_FIXEDLEN); scsi_print_inquiry_short_sbuf(&sb, inq_data); sbuf_finish(&sb); sbuf_putbuf(&sb); } /* * Table of syncrates that don't follow the "divisible by 4" * rule. This table will be expanded in future SCSI specs. */ static struct { u_int period_factor; u_int period; /* in 100ths of ns */ } scsi_syncrates[] = { { 0x08, 625 }, /* FAST-160 */ { 0x09, 1250 }, /* FAST-80 */ { 0x0a, 2500 }, /* FAST-40 40MHz */ { 0x0b, 3030 }, /* FAST-40 33MHz */ { 0x0c, 5000 } /* FAST-20 */ }; /* * Return the frequency in kHz corresponding to the given * sync period factor. */ u_int scsi_calc_syncsrate(u_int period_factor) { u_int i; u_int num_syncrates; /* * It's a bug if period is zero, but if it is anyway, don't * die with a divide fault- instead return something which * 'approximates' async */ if (period_factor == 0) { return (3300); } num_syncrates = nitems(scsi_syncrates); /* See if the period is in the "exception" table */ for (i = 0; i < num_syncrates; i++) { if (period_factor == scsi_syncrates[i].period_factor) { /* Period in kHz */ return (100000000 / scsi_syncrates[i].period); } } /* * Wasn't in the table, so use the standard * 4 times conversion. */ return (10000000 / (period_factor * 4 * 10)); } /* * Return the SCSI sync parameter that corresponds to * the passed in period in 10ths of ns. */ u_int scsi_calc_syncparam(u_int period) { u_int i; u_int num_syncrates; if (period == 0) return (~0); /* Async */ /* Adjust for exception table being in 100ths. */ period *= 10; num_syncrates = nitems(scsi_syncrates); /* See if the period is in the "exception" table */ for (i = 0; i < num_syncrates; i++) { if (period <= scsi_syncrates[i].period) { /* Period in 100ths of ns */ return (scsi_syncrates[i].period_factor); } } /* * Wasn't in the table, so use the standard * 1/4 period in ns conversion. */ return (period/400); } int scsi_devid_is_naa_ieee_reg(uint8_t *bufp) { struct scsi_vpd_id_descriptor *descr; struct scsi_vpd_id_naa_basic *naa; int n; descr = (struct scsi_vpd_id_descriptor *)bufp; naa = (struct scsi_vpd_id_naa_basic *)descr->identifier; if ((descr->id_type & SVPD_ID_TYPE_MASK) != SVPD_ID_TYPE_NAA) return 0; if (descr->length < sizeof(struct scsi_vpd_id_naa_ieee_reg)) return 0; n = naa->naa >> SVPD_ID_NAA_NAA_SHIFT; if (n != SVPD_ID_NAA_LOCAL_REG && n != SVPD_ID_NAA_IEEE_REG) return 0; return 1; } int scsi_devid_is_sas_target(uint8_t *bufp) { struct scsi_vpd_id_descriptor *descr; descr = (struct scsi_vpd_id_descriptor *)bufp; if (!scsi_devid_is_naa_ieee_reg(bufp)) return 0; if ((descr->id_type & SVPD_ID_PIV) == 0) /* proto field reserved */ return 0; if ((descr->proto_codeset >> SVPD_ID_PROTO_SHIFT) != SCSI_PROTO_SAS) return 0; return 1; } int scsi_devid_is_lun_eui64(uint8_t *bufp) { struct scsi_vpd_id_descriptor *descr; descr = (struct scsi_vpd_id_descriptor *)bufp; if ((descr->id_type & SVPD_ID_ASSOC_MASK) != SVPD_ID_ASSOC_LUN) return 0; if ((descr->id_type & SVPD_ID_TYPE_MASK) != SVPD_ID_TYPE_EUI64) return 0; return 1; } int scsi_devid_is_lun_naa(uint8_t *bufp) { struct scsi_vpd_id_descriptor *descr; descr = (struct scsi_vpd_id_descriptor *)bufp; if ((descr->id_type & SVPD_ID_ASSOC_MASK) != SVPD_ID_ASSOC_LUN) return 0; if ((descr->id_type & SVPD_ID_TYPE_MASK) != SVPD_ID_TYPE_NAA) return 0; return 1; } int scsi_devid_is_lun_t10(uint8_t *bufp) { struct scsi_vpd_id_descriptor *descr; descr = (struct scsi_vpd_id_descriptor *)bufp; if ((descr->id_type & SVPD_ID_ASSOC_MASK) != SVPD_ID_ASSOC_LUN) return 0; if ((descr->id_type & SVPD_ID_TYPE_MASK) != SVPD_ID_TYPE_T10) return 0; return 1; } int scsi_devid_is_lun_name(uint8_t *bufp) { struct scsi_vpd_id_descriptor *descr; descr = (struct scsi_vpd_id_descriptor *)bufp; if ((descr->id_type & SVPD_ID_ASSOC_MASK) != SVPD_ID_ASSOC_LUN) return 0; if ((descr->id_type & SVPD_ID_TYPE_MASK) != SVPD_ID_TYPE_SCSI_NAME) return 0; return 1; } int scsi_devid_is_lun_md5(uint8_t *bufp) { struct scsi_vpd_id_descriptor *descr; descr = (struct scsi_vpd_id_descriptor *)bufp; if ((descr->id_type & SVPD_ID_ASSOC_MASK) != SVPD_ID_ASSOC_LUN) return 0; if ((descr->id_type & SVPD_ID_TYPE_MASK) != SVPD_ID_TYPE_MD5_LUN_ID) return 0; return 1; } int scsi_devid_is_lun_uuid(uint8_t *bufp) { struct scsi_vpd_id_descriptor *descr; descr = (struct scsi_vpd_id_descriptor *)bufp; if ((descr->id_type & SVPD_ID_ASSOC_MASK) != SVPD_ID_ASSOC_LUN) return 0; if ((descr->id_type & SVPD_ID_TYPE_MASK) != SVPD_ID_TYPE_UUID) return 0; return 1; } int scsi_devid_is_port_naa(uint8_t *bufp) { struct scsi_vpd_id_descriptor *descr; descr = (struct scsi_vpd_id_descriptor *)bufp; if ((descr->id_type & SVPD_ID_ASSOC_MASK) != SVPD_ID_ASSOC_PORT) return 0; if ((descr->id_type & SVPD_ID_TYPE_MASK) != SVPD_ID_TYPE_NAA) return 0; return 1; } struct scsi_vpd_id_descriptor * scsi_get_devid_desc(struct scsi_vpd_id_descriptor *desc, uint32_t len, scsi_devid_checkfn_t ck_fn) { uint8_t *desc_buf_end; desc_buf_end = (uint8_t *)desc + len; for (; desc->identifier <= desc_buf_end && desc->identifier + desc->length <= desc_buf_end; desc = (struct scsi_vpd_id_descriptor *)(desc->identifier + desc->length)) { if (ck_fn == NULL || ck_fn((uint8_t *)desc) != 0) return (desc); } return (NULL); } struct scsi_vpd_id_descriptor * scsi_get_devid(struct scsi_vpd_device_id *id, uint32_t page_len, scsi_devid_checkfn_t ck_fn) { uint32_t len; if (page_len < sizeof(*id)) return (NULL); len = MIN(scsi_2btoul(id->length), page_len - sizeof(*id)); return (scsi_get_devid_desc((struct scsi_vpd_id_descriptor *) id->desc_list, len, ck_fn)); } int scsi_transportid_sbuf(struct sbuf *sb, struct scsi_transportid_header *hdr, uint32_t valid_len) { switch (hdr->format_protocol & SCSI_TRN_PROTO_MASK) { case SCSI_PROTO_FC: { struct scsi_transportid_fcp *fcp; uint64_t n_port_name; fcp = (struct scsi_transportid_fcp *)hdr; n_port_name = scsi_8btou64(fcp->n_port_name); sbuf_printf(sb, "FCP address: 0x%.16jx",(uintmax_t)n_port_name); break; } case SCSI_PROTO_SPI: { struct scsi_transportid_spi *spi; spi = (struct scsi_transportid_spi *)hdr; sbuf_printf(sb, "SPI address: %u,%u", scsi_2btoul(spi->scsi_addr), scsi_2btoul(spi->rel_trgt_port_id)); break; } case SCSI_PROTO_SSA: /* * XXX KDM there is no transport ID defined in SPC-4 for * SSA. */ break; case SCSI_PROTO_1394: { struct scsi_transportid_1394 *sbp; uint64_t eui64; sbp = (struct scsi_transportid_1394 *)hdr; eui64 = scsi_8btou64(sbp->eui64); sbuf_printf(sb, "SBP address: 0x%.16jx", (uintmax_t)eui64); break; } case SCSI_PROTO_RDMA: { struct scsi_transportid_rdma *rdma; unsigned int i; rdma = (struct scsi_transportid_rdma *)hdr; sbuf_printf(sb, "RDMA address: 0x"); for (i = 0; i < sizeof(rdma->initiator_port_id); i++) sbuf_printf(sb, "%02x", rdma->initiator_port_id[i]); break; } case SCSI_PROTO_ISCSI: { uint32_t add_len, i; uint8_t *iscsi_name = NULL; int nul_found = 0; sbuf_printf(sb, "iSCSI address: "); if ((hdr->format_protocol & SCSI_TRN_FORMAT_MASK) == SCSI_TRN_ISCSI_FORMAT_DEVICE) { struct scsi_transportid_iscsi_device *dev; dev = (struct scsi_transportid_iscsi_device *)hdr; /* * Verify how much additional data we really have. */ add_len = scsi_2btoul(dev->additional_length); add_len = MIN(add_len, valid_len - __offsetof(struct scsi_transportid_iscsi_device, iscsi_name)); iscsi_name = &dev->iscsi_name[0]; } else if ((hdr->format_protocol & SCSI_TRN_FORMAT_MASK) == SCSI_TRN_ISCSI_FORMAT_PORT) { struct scsi_transportid_iscsi_port *port; port = (struct scsi_transportid_iscsi_port *)hdr; add_len = scsi_2btoul(port->additional_length); add_len = MIN(add_len, valid_len - __offsetof(struct scsi_transportid_iscsi_port, iscsi_name)); iscsi_name = &port->iscsi_name[0]; } else { sbuf_printf(sb, "unknown format %x", (hdr->format_protocol & SCSI_TRN_FORMAT_MASK) >> SCSI_TRN_FORMAT_SHIFT); break; } if (add_len == 0) { sbuf_printf(sb, "not enough data"); break; } /* * This is supposed to be a NUL-terminated ASCII * string, but you never know. So we're going to * check. We need to do this because there is no * sbuf equivalent of strncat(). */ for (i = 0; i < add_len; i++) { if (iscsi_name[i] == '\0') { nul_found = 1; break; } } /* * If there is a NUL in the name, we can just use * sbuf_cat(). Otherwise we need to use sbuf_bcat(). */ if (nul_found != 0) sbuf_cat(sb, iscsi_name); else sbuf_bcat(sb, iscsi_name, add_len); break; } case SCSI_PROTO_SAS: { struct scsi_transportid_sas *sas; uint64_t sas_addr; sas = (struct scsi_transportid_sas *)hdr; sas_addr = scsi_8btou64(sas->sas_address); sbuf_printf(sb, "SAS address: 0x%.16jx", (uintmax_t)sas_addr); break; } case SCSI_PROTO_ADITP: case SCSI_PROTO_ATA: case SCSI_PROTO_UAS: /* * No Transport ID format for ADI, ATA or USB is defined in * SPC-4. */ sbuf_printf(sb, "No known Transport ID format for protocol " "%#x", hdr->format_protocol & SCSI_TRN_PROTO_MASK); break; case SCSI_PROTO_SOP: { struct scsi_transportid_sop *sop; struct scsi_sop_routing_id_norm *rid; sop = (struct scsi_transportid_sop *)hdr; rid = (struct scsi_sop_routing_id_norm *)sop->routing_id; /* * Note that there is no alternate format specified in SPC-4 * for the PCIe routing ID, so we don't really have a way * to know whether the second byte of the routing ID is * a device and function or just a function. So we just * assume bus,device,function. */ sbuf_printf(sb, "SOP Routing ID: %u,%u,%u", rid->bus, rid->devfunc >> SCSI_TRN_SOP_DEV_SHIFT, rid->devfunc & SCSI_TRN_SOP_FUNC_NORM_MAX); break; } case SCSI_PROTO_NONE: default: sbuf_printf(sb, "Unknown protocol %#x", hdr->format_protocol & SCSI_TRN_PROTO_MASK); break; } return (0); } struct scsi_nv scsi_proto_map[] = { { "fcp", SCSI_PROTO_FC }, { "spi", SCSI_PROTO_SPI }, { "ssa", SCSI_PROTO_SSA }, { "sbp", SCSI_PROTO_1394 }, { "1394", SCSI_PROTO_1394 }, { "srp", SCSI_PROTO_RDMA }, { "rdma", SCSI_PROTO_RDMA }, { "iscsi", SCSI_PROTO_ISCSI }, { "iqn", SCSI_PROTO_ISCSI }, { "sas", SCSI_PROTO_SAS }, { "aditp", SCSI_PROTO_ADITP }, { "ata", SCSI_PROTO_ATA }, { "uas", SCSI_PROTO_UAS }, { "usb", SCSI_PROTO_UAS }, { "sop", SCSI_PROTO_SOP } }; const char * scsi_nv_to_str(struct scsi_nv *table, int num_table_entries, uint64_t value) { int i; for (i = 0; i < num_table_entries; i++) { if (table[i].value == value) return (table[i].name); } return (NULL); } /* * Given a name/value table, find a value matching the given name. * Return values: * SCSI_NV_FOUND - match found * SCSI_NV_AMBIGUOUS - more than one match, none of them exact * SCSI_NV_NOT_FOUND - no match found */ scsi_nv_status scsi_get_nv(struct scsi_nv *table, int num_table_entries, char *name, int *table_entry, scsi_nv_flags flags) { int i, num_matches = 0; for (i = 0; i < num_table_entries; i++) { size_t table_len, name_len; table_len = strlen(table[i].name); name_len = strlen(name); if ((((flags & SCSI_NV_FLAG_IG_CASE) != 0) && (strncasecmp(table[i].name, name, name_len) == 0)) || (((flags & SCSI_NV_FLAG_IG_CASE) == 0) && (strncmp(table[i].name, name, name_len) == 0))) { *table_entry = i; /* * Check for an exact match. If we have the same * number of characters in the table as the argument, * and we already know they're the same, we have * an exact match. */ if (table_len == name_len) return (SCSI_NV_FOUND); /* * Otherwise, bump up the number of matches. We'll * see later how many we have. */ num_matches++; } } if (num_matches > 1) return (SCSI_NV_AMBIGUOUS); else if (num_matches == 1) return (SCSI_NV_FOUND); else return (SCSI_NV_NOT_FOUND); } /* * Parse transport IDs for Fibre Channel, 1394 and SAS. Since these are * all 64-bit numbers, the code is similar. */ int scsi_parse_transportid_64bit(int proto_id, char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len) { uint64_t value; char *endptr; int retval; size_t alloc_size; retval = 0; value = strtouq(id_str, &endptr, 0); if (*endptr != '\0') { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: error " "parsing ID %s, 64-bit number required", __func__, id_str); } retval = 1; goto bailout; } switch (proto_id) { case SCSI_PROTO_FC: alloc_size = sizeof(struct scsi_transportid_fcp); break; case SCSI_PROTO_1394: alloc_size = sizeof(struct scsi_transportid_1394); break; case SCSI_PROTO_SAS: alloc_size = sizeof(struct scsi_transportid_sas); break; default: if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: unsupported " "protocol %d", __func__, proto_id); } retval = 1; goto bailout; break; /* NOTREACHED */ } #ifdef _KERNEL *hdr = malloc(alloc_size, type, flags); #else /* _KERNEL */ *hdr = malloc(alloc_size); #endif /*_KERNEL */ if (*hdr == NULL) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: unable to " "allocate %zu bytes", __func__, alloc_size); } retval = 1; goto bailout; } *alloc_len = alloc_size; bzero(*hdr, alloc_size); switch (proto_id) { case SCSI_PROTO_FC: { struct scsi_transportid_fcp *fcp; fcp = (struct scsi_transportid_fcp *)(*hdr); fcp->format_protocol = SCSI_PROTO_FC | SCSI_TRN_FCP_FORMAT_DEFAULT; scsi_u64to8b(value, fcp->n_port_name); break; } case SCSI_PROTO_1394: { struct scsi_transportid_1394 *sbp; sbp = (struct scsi_transportid_1394 *)(*hdr); sbp->format_protocol = SCSI_PROTO_1394 | SCSI_TRN_1394_FORMAT_DEFAULT; scsi_u64to8b(value, sbp->eui64); break; } case SCSI_PROTO_SAS: { struct scsi_transportid_sas *sas; sas = (struct scsi_transportid_sas *)(*hdr); sas->format_protocol = SCSI_PROTO_SAS | SCSI_TRN_SAS_FORMAT_DEFAULT; scsi_u64to8b(value, sas->sas_address); break; } default: break; } bailout: return (retval); } /* * Parse a SPI (Parallel SCSI) address of the form: id,rel_tgt_port */ int scsi_parse_transportid_spi(char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len) { unsigned long scsi_addr, target_port; struct scsi_transportid_spi *spi; char *tmpstr, *endptr; int retval; retval = 0; tmpstr = strsep(&id_str, ","); if (tmpstr == NULL) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: no ID found", __func__); } retval = 1; goto bailout; } scsi_addr = strtoul(tmpstr, &endptr, 0); if (*endptr != '\0') { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: error " "parsing SCSI ID %s, number required", __func__, tmpstr); } retval = 1; goto bailout; } if (id_str == NULL) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: no relative " "target port found", __func__); } retval = 1; goto bailout; } target_port = strtoul(id_str, &endptr, 0); if (*endptr != '\0') { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: error " "parsing relative target port %s, number " "required", __func__, id_str); } retval = 1; goto bailout; } #ifdef _KERNEL spi = malloc(sizeof(*spi), type, flags); #else spi = malloc(sizeof(*spi)); #endif if (spi == NULL) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: unable to " "allocate %zu bytes", __func__, sizeof(*spi)); } retval = 1; goto bailout; } *alloc_len = sizeof(*spi); bzero(spi, sizeof(*spi)); spi->format_protocol = SCSI_PROTO_SPI | SCSI_TRN_SPI_FORMAT_DEFAULT; scsi_ulto2b(scsi_addr, spi->scsi_addr); scsi_ulto2b(target_port, spi->rel_trgt_port_id); *hdr = (struct scsi_transportid_header *)spi; bailout: return (retval); } /* * Parse an RDMA/SRP Initiator Port ID string. This is 32 hexadecimal digits, * optionally prefixed by "0x" or "0X". */ int scsi_parse_transportid_rdma(char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len) { struct scsi_transportid_rdma *rdma; int retval; size_t id_len, rdma_id_size; uint8_t rdma_id[SCSI_TRN_RDMA_PORT_LEN]; char *tmpstr; unsigned int i, j; retval = 0; id_len = strlen(id_str); rdma_id_size = SCSI_TRN_RDMA_PORT_LEN; /* * Check the size. It needs to be either 32 or 34 characters long. */ if ((id_len != (rdma_id_size * 2)) && (id_len != ((rdma_id_size * 2) + 2))) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: RDMA ID " "must be 32 hex digits (0x prefix " "optional), only %zu seen", __func__, id_len); } retval = 1; goto bailout; } tmpstr = id_str; /* * If the user gave us 34 characters, the string needs to start * with '0x'. */ if (id_len == ((rdma_id_size * 2) + 2)) { if ((tmpstr[0] == '0') && ((tmpstr[1] == 'x') || (tmpstr[1] == 'X'))) { tmpstr += 2; } else { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: RDMA " "ID prefix, if used, must be \"0x\", " "got %s", __func__, tmpstr); } retval = 1; goto bailout; } } bzero(rdma_id, sizeof(rdma_id)); /* * Convert ASCII hex into binary bytes. There is no standard * 128-bit integer type, and so no strtou128t() routine to convert * from hex into a large integer. In the end, we're not going to * an integer, but rather to a byte array, so that and the fact * that we require the user to give us 32 hex digits simplifies the * logic. */ for (i = 0; i < (rdma_id_size * 2); i++) { int cur_shift; unsigned char c; /* Increment the byte array one for every 2 hex digits */ j = i >> 1; /* * The first digit in every pair is the most significant * 4 bits. The second is the least significant 4 bits. */ if ((i % 2) == 0) cur_shift = 4; else cur_shift = 0; c = tmpstr[i]; /* Convert the ASCII hex character into a number */ if (isdigit(c)) c -= '0'; else if (isalpha(c)) c -= isupper(c) ? 'A' - 10 : 'a' - 10; else { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: " "RDMA ID must be hex digits, got " "invalid character %c", __func__, tmpstr[i]); } retval = 1; goto bailout; } /* * The converted number can't be less than 0; the type is * unsigned, and the subtraction logic will not give us * a negative number. So we only need to make sure that * the value is not greater than 0xf. (i.e. make sure the * user didn't give us a value like "0x12jklmno"). */ if (c > 0xf) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: " "RDMA ID must be hex digits, got " "invalid character %c", __func__, tmpstr[i]); } retval = 1; goto bailout; } rdma_id[j] |= c << cur_shift; } #ifdef _KERNEL rdma = malloc(sizeof(*rdma), type, flags); #else rdma = malloc(sizeof(*rdma)); #endif if (rdma == NULL) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: unable to " "allocate %zu bytes", __func__, sizeof(*rdma)); } retval = 1; goto bailout; } *alloc_len = sizeof(*rdma); bzero(rdma, *alloc_len); rdma->format_protocol = SCSI_PROTO_RDMA | SCSI_TRN_RDMA_FORMAT_DEFAULT; bcopy(rdma_id, rdma->initiator_port_id, SCSI_TRN_RDMA_PORT_LEN); *hdr = (struct scsi_transportid_header *)rdma; bailout: return (retval); } /* * Parse an iSCSI name. The format is either just the name: * * iqn.2012-06.com.example:target0 * or the name, separator and initiator session ID: * * iqn.2012-06.com.example:target0,i,0x123 * * The separator format is exact. */ int scsi_parse_transportid_iscsi(char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len) { size_t id_len, sep_len, id_size, name_len; int retval; unsigned int i, sep_pos, sep_found; const char *sep_template = ",i,0x"; const char *iqn_prefix = "iqn."; struct scsi_transportid_iscsi_device *iscsi; retval = 0; sep_found = 0; id_len = strlen(id_str); sep_len = strlen(sep_template); /* * The separator is defined as exactly ',i,0x'. Any other commas, * or any other form, is an error. So look for a comma, and once * we find that, the next few characters must match the separator * exactly. Once we get through the separator, there should be at * least one character. */ for (i = 0, sep_pos = 0; i < id_len; i++) { if (sep_pos == 0) { if (id_str[i] == sep_template[sep_pos]) sep_pos++; continue; } if (sep_pos < sep_len) { if (id_str[i] == sep_template[sep_pos]) { sep_pos++; continue; } if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: " "invalid separator in iSCSI name " "\"%s\"", __func__, id_str); } retval = 1; goto bailout; } else { sep_found = 1; break; } } /* * Check to see whether we have a separator but no digits after it. */ if ((sep_pos != 0) && (sep_found == 0)) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: no digits " "found after separator in iSCSI name \"%s\"", __func__, id_str); } retval = 1; goto bailout; } /* * The incoming ID string has the "iqn." prefix stripped off. We * need enough space for the base structure (the structures are the * same for the two iSCSI forms), the prefix, the ID string and a * terminating NUL. */ id_size = sizeof(*iscsi) + strlen(iqn_prefix) + id_len + 1; #ifdef _KERNEL iscsi = malloc(id_size, type, flags); #else iscsi = malloc(id_size); #endif if (iscsi == NULL) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: unable to " "allocate %zu bytes", __func__, id_size); } retval = 1; goto bailout; } *alloc_len = id_size; bzero(iscsi, id_size); iscsi->format_protocol = SCSI_PROTO_ISCSI; if (sep_found == 0) iscsi->format_protocol |= SCSI_TRN_ISCSI_FORMAT_DEVICE; else iscsi->format_protocol |= SCSI_TRN_ISCSI_FORMAT_PORT; name_len = id_size - sizeof(*iscsi); scsi_ulto2b(name_len, iscsi->additional_length); snprintf(iscsi->iscsi_name, name_len, "%s%s", iqn_prefix, id_str); *hdr = (struct scsi_transportid_header *)iscsi; bailout: return (retval); } /* * Parse a SCSI over PCIe (SOP) identifier. The Routing ID can either be * of the form 'bus,device,function' or 'bus,function'. */ int scsi_parse_transportid_sop(char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len) { struct scsi_transportid_sop *sop; unsigned long bus, device, function; char *tmpstr, *endptr; int retval, device_spec; retval = 0; device_spec = 0; device = 0; tmpstr = strsep(&id_str, ","); if ((tmpstr == NULL) || (*tmpstr == '\0')) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: no ID found", __func__); } retval = 1; goto bailout; } bus = strtoul(tmpstr, &endptr, 0); if (*endptr != '\0') { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: error " "parsing PCIe bus %s, number required", __func__, tmpstr); } retval = 1; goto bailout; } if ((id_str == NULL) || (*id_str == '\0')) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: no PCIe " "device or function found", __func__); } retval = 1; goto bailout; } tmpstr = strsep(&id_str, ","); function = strtoul(tmpstr, &endptr, 0); if (*endptr != '\0') { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: error " "parsing PCIe device/function %s, number " "required", __func__, tmpstr); } retval = 1; goto bailout; } /* * Check to see whether the user specified a third value. If so, * the second is the device. */ if (id_str != NULL) { if (*id_str == '\0') { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: " "no PCIe function found", __func__); } retval = 1; goto bailout; } device = function; device_spec = 1; function = strtoul(id_str, &endptr, 0); if (*endptr != '\0') { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: " "error parsing PCIe function %s, " "number required", __func__, id_str); } retval = 1; goto bailout; } } if (bus > SCSI_TRN_SOP_BUS_MAX) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: bus value " "%lu greater than maximum %u", __func__, bus, SCSI_TRN_SOP_BUS_MAX); } retval = 1; goto bailout; } if ((device_spec != 0) && (device > SCSI_TRN_SOP_DEV_MASK)) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: device value " "%lu greater than maximum %u", __func__, device, SCSI_TRN_SOP_DEV_MAX); } retval = 1; goto bailout; } if (((device_spec != 0) && (function > SCSI_TRN_SOP_FUNC_NORM_MAX)) || ((device_spec == 0) && (function > SCSI_TRN_SOP_FUNC_ALT_MAX))) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: function value " "%lu greater than maximum %u", __func__, function, (device_spec == 0) ? SCSI_TRN_SOP_FUNC_ALT_MAX : SCSI_TRN_SOP_FUNC_NORM_MAX); } retval = 1; goto bailout; } #ifdef _KERNEL sop = malloc(sizeof(*sop), type, flags); #else sop = malloc(sizeof(*sop)); #endif if (sop == NULL) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: unable to " "allocate %zu bytes", __func__, sizeof(*sop)); } retval = 1; goto bailout; } *alloc_len = sizeof(*sop); bzero(sop, sizeof(*sop)); sop->format_protocol = SCSI_PROTO_SOP | SCSI_TRN_SOP_FORMAT_DEFAULT; if (device_spec != 0) { struct scsi_sop_routing_id_norm rid; rid.bus = bus; rid.devfunc = (device << SCSI_TRN_SOP_DEV_SHIFT) | function; bcopy(&rid, sop->routing_id, MIN(sizeof(rid), sizeof(sop->routing_id))); } else { struct scsi_sop_routing_id_alt rid; rid.bus = bus; rid.function = function; bcopy(&rid, sop->routing_id, MIN(sizeof(rid), sizeof(sop->routing_id))); } *hdr = (struct scsi_transportid_header *)sop; bailout: return (retval); } /* * transportid_str: NUL-terminated string with format: protcol,id * The ID is protocol specific. * hdr: Storage will be allocated for the transport ID. * alloc_len: The amount of memory allocated is returned here. * type: Malloc bucket (kernel only). * flags: Malloc flags (kernel only). * error_str: If non-NULL, it will contain error information (without * a terminating newline) if an error is returned. * error_str_len: Allocated length of the error string. * * Returns 0 for success, non-zero for failure. */ int scsi_parse_transportid(char *transportid_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len) { char *tmpstr; scsi_nv_status status; u_int num_proto_entries; int retval, table_entry; retval = 0; table_entry = 0; /* * We do allow a period as well as a comma to separate the protocol * from the ID string. This is to accommodate iSCSI names, which * start with "iqn.". */ tmpstr = strsep(&transportid_str, ",."); if (tmpstr == NULL) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: transportid_str is NULL", __func__); } retval = 1; goto bailout; } num_proto_entries = nitems(scsi_proto_map); status = scsi_get_nv(scsi_proto_map, num_proto_entries, tmpstr, &table_entry, SCSI_NV_FLAG_IG_CASE); if (status != SCSI_NV_FOUND) { if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: %s protocol " "name %s", __func__, (status == SCSI_NV_AMBIGUOUS) ? "ambiguous" : "invalid", tmpstr); } retval = 1; goto bailout; } switch (scsi_proto_map[table_entry].value) { case SCSI_PROTO_FC: case SCSI_PROTO_1394: case SCSI_PROTO_SAS: retval = scsi_parse_transportid_64bit( scsi_proto_map[table_entry].value, transportid_str, hdr, alloc_len, #ifdef _KERNEL type, flags, #endif error_str, error_str_len); break; case SCSI_PROTO_SPI: retval = scsi_parse_transportid_spi(transportid_str, hdr, alloc_len, #ifdef _KERNEL type, flags, #endif error_str, error_str_len); break; case SCSI_PROTO_RDMA: retval = scsi_parse_transportid_rdma(transportid_str, hdr, alloc_len, #ifdef _KERNEL type, flags, #endif error_str, error_str_len); break; case SCSI_PROTO_ISCSI: retval = scsi_parse_transportid_iscsi(transportid_str, hdr, alloc_len, #ifdef _KERNEL type, flags, #endif error_str, error_str_len); break; case SCSI_PROTO_SOP: retval = scsi_parse_transportid_sop(transportid_str, hdr, alloc_len, #ifdef _KERNEL type, flags, #endif error_str, error_str_len); break; case SCSI_PROTO_SSA: case SCSI_PROTO_ADITP: case SCSI_PROTO_ATA: case SCSI_PROTO_UAS: case SCSI_PROTO_NONE: default: /* * There is no format defined for a Transport ID for these * protocols. So even if the user gives us something, we * have no way to turn it into a standard SCSI Transport ID. */ retval = 1; if (error_str != NULL) { snprintf(error_str, error_str_len, "%s: no Transport " "ID format exists for protocol %s", __func__, tmpstr); } goto bailout; break; /* NOTREACHED */ } bailout: return (retval); } struct scsi_attrib_table_entry scsi_mam_attr_table[] = { { SMA_ATTR_REM_CAP_PARTITION, SCSI_ATTR_FLAG_NONE, "Remaining Capacity in Partition", /*suffix*/ "MB", /*to_str*/ scsi_attrib_int_sbuf,/*parse_str*/ NULL }, { SMA_ATTR_MAX_CAP_PARTITION, SCSI_ATTR_FLAG_NONE, "Maximum Capacity in Partition", /*suffix*/"MB", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_TAPEALERT_FLAGS, SCSI_ATTR_FLAG_HEX, "TapeAlert Flags", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_LOAD_COUNT, SCSI_ATTR_FLAG_NONE, "Load Count", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MAM_SPACE_REMAINING, SCSI_ATTR_FLAG_NONE, "MAM Space Remaining", /*suffix*/"bytes", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_DEV_ASSIGNING_ORG, SCSI_ATTR_FLAG_NONE, "Assigning Organization", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_FORMAT_DENSITY_CODE, SCSI_ATTR_FLAG_HEX, "Format Density Code", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_INITIALIZATION_COUNT, SCSI_ATTR_FLAG_NONE, "Initialization Count", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_VOLUME_ID, SCSI_ATTR_FLAG_NONE, "Volume Identifier", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_VOLUME_CHANGE_REF, SCSI_ATTR_FLAG_HEX, "Volume Change Reference", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_DEV_SERIAL_LAST_LOAD, SCSI_ATTR_FLAG_NONE, "Device Vendor/Serial at Last Load", /*suffix*/NULL, /*to_str*/ scsi_attrib_vendser_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_DEV_SERIAL_LAST_LOAD_1, SCSI_ATTR_FLAG_NONE, "Device Vendor/Serial at Last Load - 1", /*suffix*/NULL, /*to_str*/ scsi_attrib_vendser_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_DEV_SERIAL_LAST_LOAD_2, SCSI_ATTR_FLAG_NONE, "Device Vendor/Serial at Last Load - 2", /*suffix*/NULL, /*to_str*/ scsi_attrib_vendser_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_DEV_SERIAL_LAST_LOAD_3, SCSI_ATTR_FLAG_NONE, "Device Vendor/Serial at Last Load - 3", /*suffix*/NULL, /*to_str*/ scsi_attrib_vendser_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_TOTAL_MB_WRITTEN_LT, SCSI_ATTR_FLAG_NONE, "Total MB Written in Medium Life", /*suffix*/ "MB", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_TOTAL_MB_READ_LT, SCSI_ATTR_FLAG_NONE, "Total MB Read in Medium Life", /*suffix*/ "MB", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_TOTAL_MB_WRITTEN_CUR, SCSI_ATTR_FLAG_NONE, "Total MB Written in Current/Last Load", /*suffix*/ "MB", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_TOTAL_MB_READ_CUR, SCSI_ATTR_FLAG_NONE, "Total MB Read in Current/Last Load", /*suffix*/ "MB", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_FIRST_ENC_BLOCK, SCSI_ATTR_FLAG_NONE, "Logical Position of First Encrypted Block", /*suffix*/ NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_NEXT_UNENC_BLOCK, SCSI_ATTR_FLAG_NONE, "Logical Position of First Unencrypted Block after First " "Encrypted Block", /*suffix*/ NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MEDIUM_USAGE_HIST, SCSI_ATTR_FLAG_NONE, "Medium Usage History", /*suffix*/ NULL, /*to_str*/ NULL, /*parse_str*/ NULL }, { SMA_ATTR_PART_USAGE_HIST, SCSI_ATTR_FLAG_NONE, "Partition Usage History", /*suffix*/ NULL, /*to_str*/ NULL, /*parse_str*/ NULL }, { SMA_ATTR_MED_MANUF, SCSI_ATTR_FLAG_NONE, "Medium Manufacturer", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MED_SERIAL, SCSI_ATTR_FLAG_NONE, "Medium Serial Number", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MED_LENGTH, SCSI_ATTR_FLAG_NONE, "Medium Length", /*suffix*/"m", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MED_WIDTH, SCSI_ATTR_FLAG_FP | SCSI_ATTR_FLAG_DIV_10 | SCSI_ATTR_FLAG_FP_1DIGIT, "Medium Width", /*suffix*/"mm", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MED_ASSIGNING_ORG, SCSI_ATTR_FLAG_NONE, "Assigning Organization", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MED_DENSITY_CODE, SCSI_ATTR_FLAG_HEX, "Medium Density Code", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MED_MANUF_DATE, SCSI_ATTR_FLAG_NONE, "Medium Manufacture Date", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MAM_CAPACITY, SCSI_ATTR_FLAG_NONE, "MAM Capacity", /*suffix*/"bytes", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MED_TYPE, SCSI_ATTR_FLAG_HEX, "Medium Type", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MED_TYPE_INFO, SCSI_ATTR_FLAG_HEX, "Medium Type Information", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MED_SERIAL_NUM, SCSI_ATTR_FLAG_NONE, "Medium Serial Number", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_APP_VENDOR, SCSI_ATTR_FLAG_NONE, "Application Vendor", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_APP_NAME, SCSI_ATTR_FLAG_NONE, "Application Name", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_APP_VERSION, SCSI_ATTR_FLAG_NONE, "Application Version", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_USER_MED_TEXT_LABEL, SCSI_ATTR_FLAG_NONE, "User Medium Text Label", /*suffix*/NULL, /*to_str*/ scsi_attrib_text_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_LAST_WRITTEN_TIME, SCSI_ATTR_FLAG_NONE, "Date and Time Last Written", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_TEXT_LOCAL_ID, SCSI_ATTR_FLAG_HEX, "Text Localization Identifier", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_BARCODE, SCSI_ATTR_FLAG_NONE, "Barcode", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_HOST_OWNER_NAME, SCSI_ATTR_FLAG_NONE, "Owning Host Textual Name", /*suffix*/NULL, /*to_str*/ scsi_attrib_text_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_MEDIA_POOL, SCSI_ATTR_FLAG_NONE, "Media Pool", /*suffix*/NULL, /*to_str*/ scsi_attrib_text_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_PART_USER_LABEL, SCSI_ATTR_FLAG_NONE, "Partition User Text Label", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_LOAD_UNLOAD_AT_PART, SCSI_ATTR_FLAG_NONE, "Load/Unload at Partition", /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_APP_FORMAT_VERSION, SCSI_ATTR_FLAG_NONE, "Application Format Version", /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf, /*parse_str*/ NULL }, { SMA_ATTR_VOL_COHERENCY_INFO, SCSI_ATTR_FLAG_NONE, "Volume Coherency Information", /*suffix*/NULL, /*to_str*/ scsi_attrib_volcoh_sbuf, /*parse_str*/ NULL }, { 0x0ff1, SCSI_ATTR_FLAG_NONE, "Spectra MLM Creation", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x0ff2, SCSI_ATTR_FLAG_NONE, "Spectra MLM C3", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x0ff3, SCSI_ATTR_FLAG_NONE, "Spectra MLM RW", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x0ff4, SCSI_ATTR_FLAG_NONE, "Spectra MLM SDC List", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x0ff7, SCSI_ATTR_FLAG_NONE, "Spectra MLM Post Scan", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x0ffe, SCSI_ATTR_FLAG_NONE, "Spectra MLM Checksum", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x17f1, SCSI_ATTR_FLAG_NONE, "Spectra MLM Creation", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x17f2, SCSI_ATTR_FLAG_NONE, "Spectra MLM C3", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x17f3, SCSI_ATTR_FLAG_NONE, "Spectra MLM RW", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x17f4, SCSI_ATTR_FLAG_NONE, "Spectra MLM SDC List", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x17f7, SCSI_ATTR_FLAG_NONE, "Spectra MLM Post Scan", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, { 0x17ff, SCSI_ATTR_FLAG_NONE, "Spectra MLM Checksum", /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf, /*parse_str*/ NULL }, }; /* * Print out Volume Coherency Information (Attribute 0x080c). * This field has two variable length members, including one at the * beginning, so it isn't practical to have a fixed structure definition. * This is current as of SSC4r03 (see section 4.2.21.3), dated March 25, * 2013. */ int scsi_attrib_volcoh_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len) { size_t avail_len; uint32_t field_size; uint64_t tmp_val; uint8_t *cur_ptr; int retval; int vcr_len, as_len; retval = 0; tmp_val = 0; field_size = scsi_2btoul(hdr->length); avail_len = valid_len - sizeof(*hdr); if (field_size > avail_len) { if (error_str != NULL) { snprintf(error_str, error_str_len, "Available " "length of attribute ID 0x%.4x %zu < field " "length %u", scsi_2btoul(hdr->id), avail_len, field_size); } retval = 1; goto bailout; } else if (field_size == 0) { /* * It isn't clear from the spec whether a field length of * 0 is invalid here. It probably is, but be lenient here * to avoid inconveniencing the user. */ goto bailout; } cur_ptr = hdr->attribute; vcr_len = *cur_ptr; cur_ptr++; sbuf_printf(sb, "\n\tVolume Change Reference Value:"); switch (vcr_len) { case 0: if (error_str != NULL) { snprintf(error_str, error_str_len, "Volume Change " "Reference value has length of 0"); } retval = 1; goto bailout; break; /*NOTREACHED*/ case 1: tmp_val = *cur_ptr; break; case 2: tmp_val = scsi_2btoul(cur_ptr); break; case 3: tmp_val = scsi_3btoul(cur_ptr); break; case 4: tmp_val = scsi_4btoul(cur_ptr); break; case 8: tmp_val = scsi_8btou64(cur_ptr); break; default: sbuf_printf(sb, "\n"); sbuf_hexdump(sb, cur_ptr, vcr_len, NULL, 0); break; } if (vcr_len <= 8) sbuf_printf(sb, " 0x%jx\n", (uintmax_t)tmp_val); cur_ptr += vcr_len; tmp_val = scsi_8btou64(cur_ptr); sbuf_printf(sb, "\tVolume Coherency Count: %ju\n", (uintmax_t)tmp_val); cur_ptr += sizeof(tmp_val); tmp_val = scsi_8btou64(cur_ptr); sbuf_printf(sb, "\tVolume Coherency Set Identifier: 0x%jx\n", (uintmax_t)tmp_val); /* * Figure out how long the Application Client Specific Information * is and produce a hexdump. */ cur_ptr += sizeof(tmp_val); as_len = scsi_2btoul(cur_ptr); cur_ptr += sizeof(uint16_t); sbuf_printf(sb, "\tApplication Client Specific Information: "); if (((as_len == SCSI_LTFS_VER0_LEN) || (as_len == SCSI_LTFS_VER1_LEN)) && (strncmp(cur_ptr, SCSI_LTFS_STR_NAME, SCSI_LTFS_STR_LEN) == 0)) { sbuf_printf(sb, "LTFS\n"); cur_ptr += SCSI_LTFS_STR_LEN + 1; if (cur_ptr[SCSI_LTFS_UUID_LEN] != '\0') cur_ptr[SCSI_LTFS_UUID_LEN] = '\0'; sbuf_printf(sb, "\tLTFS UUID: %s\n", cur_ptr); cur_ptr += SCSI_LTFS_UUID_LEN + 1; /* XXX KDM check the length */ sbuf_printf(sb, "\tLTFS Version: %d\n", *cur_ptr); } else { sbuf_printf(sb, "Unknown\n"); sbuf_hexdump(sb, cur_ptr, as_len, NULL, 0); } bailout: return (retval); } int scsi_attrib_vendser_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len) { size_t avail_len; uint32_t field_size; struct scsi_attrib_vendser *vendser; cam_strvis_flags strvis_flags; int retval = 0; field_size = scsi_2btoul(hdr->length); avail_len = valid_len - sizeof(*hdr); if (field_size > avail_len) { if (error_str != NULL) { snprintf(error_str, error_str_len, "Available " "length of attribute ID 0x%.4x %zu < field " "length %u", scsi_2btoul(hdr->id), avail_len, field_size); } retval = 1; goto bailout; } else if (field_size == 0) { /* * A field size of 0 doesn't make sense here. The device * can at least give you the vendor ID, even if it can't * give you the serial number. */ if (error_str != NULL) { snprintf(error_str, error_str_len, "The length of " "attribute ID 0x%.4x is 0", scsi_2btoul(hdr->id)); } retval = 1; goto bailout; } vendser = (struct scsi_attrib_vendser *)hdr->attribute; switch (output_flags & SCSI_ATTR_OUTPUT_NONASCII_MASK) { case SCSI_ATTR_OUTPUT_NONASCII_TRIM: strvis_flags = CAM_STRVIS_FLAG_NONASCII_TRIM; break; case SCSI_ATTR_OUTPUT_NONASCII_RAW: strvis_flags = CAM_STRVIS_FLAG_NONASCII_RAW; break; case SCSI_ATTR_OUTPUT_NONASCII_ESC: default: strvis_flags = CAM_STRVIS_FLAG_NONASCII_ESC; break;; } cam_strvis_sbuf(sb, vendser->vendor, sizeof(vendser->vendor), strvis_flags); sbuf_putc(sb, ' '); cam_strvis_sbuf(sb, vendser->serial_num, sizeof(vendser->serial_num), strvis_flags); bailout: return (retval); } int scsi_attrib_hexdump_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len) { uint32_t field_size; ssize_t avail_len; uint32_t print_len; uint8_t *num_ptr; int retval = 0; field_size = scsi_2btoul(hdr->length); avail_len = valid_len - sizeof(*hdr); print_len = MIN(avail_len, field_size); num_ptr = hdr->attribute; if (print_len > 0) { sbuf_printf(sb, "\n"); sbuf_hexdump(sb, num_ptr, print_len, NULL, 0); } return (retval); } int scsi_attrib_int_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len) { uint64_t print_number; size_t avail_len; uint32_t number_size; int retval = 0; number_size = scsi_2btoul(hdr->length); avail_len = valid_len - sizeof(*hdr); if (avail_len < number_size) { if (error_str != NULL) { snprintf(error_str, error_str_len, "Available " "length of attribute ID 0x%.4x %zu < field " "length %u", scsi_2btoul(hdr->id), avail_len, number_size); } retval = 1; goto bailout; } switch (number_size) { case 0: /* * We don't treat this as an error, since there may be * scenarios where a device reports a field but then gives * a length of 0. See the note in scsi_attrib_ascii_sbuf(). */ goto bailout; break; /*NOTREACHED*/ case 1: print_number = hdr->attribute[0]; break; case 2: print_number = scsi_2btoul(hdr->attribute); break; case 3: print_number = scsi_3btoul(hdr->attribute); break; case 4: print_number = scsi_4btoul(hdr->attribute); break; case 8: print_number = scsi_8btou64(hdr->attribute); break; default: /* * If we wind up here, the number is too big to print * normally, so just do a hexdump. */ retval = scsi_attrib_hexdump_sbuf(sb, hdr, valid_len, flags, output_flags, error_str, error_str_len); goto bailout; break; } if (flags & SCSI_ATTR_FLAG_FP) { #ifndef _KERNEL long double num_float; num_float = (long double)print_number; if (flags & SCSI_ATTR_FLAG_DIV_10) num_float /= 10; sbuf_printf(sb, "%.*Lf", (flags & SCSI_ATTR_FLAG_FP_1DIGIT) ? 1 : 0, num_float); #else /* _KERNEL */ sbuf_printf(sb, "%ju", (flags & SCSI_ATTR_FLAG_DIV_10) ? (print_number / 10) : print_number); #endif /* _KERNEL */ } else if (flags & SCSI_ATTR_FLAG_HEX) { sbuf_printf(sb, "0x%jx", (uintmax_t)print_number); } else sbuf_printf(sb, "%ju", (uintmax_t)print_number); bailout: return (retval); } int scsi_attrib_ascii_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len) { size_t avail_len; uint32_t field_size, print_size; int retval = 0; avail_len = valid_len - sizeof(*hdr); field_size = scsi_2btoul(hdr->length); print_size = MIN(avail_len, field_size); if (print_size > 0) { cam_strvis_flags strvis_flags; switch (output_flags & SCSI_ATTR_OUTPUT_NONASCII_MASK) { case SCSI_ATTR_OUTPUT_NONASCII_TRIM: strvis_flags = CAM_STRVIS_FLAG_NONASCII_TRIM; break; case SCSI_ATTR_OUTPUT_NONASCII_RAW: strvis_flags = CAM_STRVIS_FLAG_NONASCII_RAW; break; case SCSI_ATTR_OUTPUT_NONASCII_ESC: default: strvis_flags = CAM_STRVIS_FLAG_NONASCII_ESC; break; } cam_strvis_sbuf(sb, hdr->attribute, print_size, strvis_flags); } else if (avail_len < field_size) { /* * We only report an error if the user didn't allocate * enough space to hold the full value of this field. If * the field length is 0, that is allowed by the spec. * e.g. in SPC-4r37, section 7.4.2.2.5, VOLUME IDENTIFIER * "This attribute indicates the current volume identifier * (see SMC-3) of the medium. If the device server supports * this attribute but does not have access to the volume * identifier, the device server shall report this attribute * with an attribute length value of zero." */ if (error_str != NULL) { snprintf(error_str, error_str_len, "Available " "length of attribute ID 0x%.4x %zu < field " "length %u", scsi_2btoul(hdr->id), avail_len, field_size); } retval = 1; } return (retval); } int scsi_attrib_text_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len) { size_t avail_len; uint32_t field_size, print_size; int retval = 0; int esc_text = 1; avail_len = valid_len - sizeof(*hdr); field_size = scsi_2btoul(hdr->length); print_size = MIN(avail_len, field_size); if ((output_flags & SCSI_ATTR_OUTPUT_TEXT_MASK) == SCSI_ATTR_OUTPUT_TEXT_RAW) esc_text = 0; if (print_size > 0) { uint32_t i; for (i = 0; i < print_size; i++) { if (hdr->attribute[i] == '\0') continue; else if (((unsigned char)hdr->attribute[i] < 0x80) || (esc_text == 0)) sbuf_putc(sb, hdr->attribute[i]); else sbuf_printf(sb, "%%%02x", (unsigned char)hdr->attribute[i]); } } else if (avail_len < field_size) { /* * We only report an error if the user didn't allocate * enough space to hold the full value of this field. */ if (error_str != NULL) { snprintf(error_str, error_str_len, "Available " "length of attribute ID 0x%.4x %zu < field " "length %u", scsi_2btoul(hdr->id), avail_len, field_size); } retval = 1; } return (retval); } struct scsi_attrib_table_entry * scsi_find_attrib_entry(struct scsi_attrib_table_entry *table, size_t num_table_entries, uint32_t id) { uint32_t i; for (i = 0; i < num_table_entries; i++) { if (table[i].id == id) return (&table[i]); } return (NULL); } struct scsi_attrib_table_entry * scsi_get_attrib_entry(uint32_t id) { return (scsi_find_attrib_entry(scsi_mam_attr_table, nitems(scsi_mam_attr_table), id)); } int scsi_attrib_value_sbuf(struct sbuf *sb, uint32_t valid_len, struct scsi_mam_attribute_header *hdr, uint32_t output_flags, char *error_str, size_t error_str_len) { int retval; switch (hdr->byte2 & SMA_FORMAT_MASK) { case SMA_FORMAT_ASCII: retval = scsi_attrib_ascii_sbuf(sb, hdr, valid_len, SCSI_ATTR_FLAG_NONE, output_flags, error_str,error_str_len); break; case SMA_FORMAT_BINARY: if (scsi_2btoul(hdr->length) <= 8) retval = scsi_attrib_int_sbuf(sb, hdr, valid_len, SCSI_ATTR_FLAG_NONE, output_flags, error_str, error_str_len); else retval = scsi_attrib_hexdump_sbuf(sb, hdr, valid_len, SCSI_ATTR_FLAG_NONE, output_flags, error_str, error_str_len); break; case SMA_FORMAT_TEXT: retval = scsi_attrib_text_sbuf(sb, hdr, valid_len, SCSI_ATTR_FLAG_NONE, output_flags, error_str, error_str_len); break; default: if (error_str != NULL) { snprintf(error_str, error_str_len, "Unknown attribute " "format 0x%x", hdr->byte2 & SMA_FORMAT_MASK); } retval = 1; goto bailout; break; /*NOTREACHED*/ } sbuf_trim(sb); bailout: return (retval); } void scsi_attrib_prefix_sbuf(struct sbuf *sb, uint32_t output_flags, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, const char *desc) { int need_space = 0; uint32_t len; uint32_t id; /* * We can't do anything if we don't have enough valid data for the * header. */ if (valid_len < sizeof(*hdr)) return; id = scsi_2btoul(hdr->id); /* * Note that we print out the value of the attribute listed in the * header, regardless of whether we actually got that many bytes * back from the device through the controller. A truncated result * could be the result of a failure to ask for enough data; the * header indicates how many bytes are allocated for this attribute * in the MAM. */ len = scsi_2btoul(hdr->length); if ((output_flags & SCSI_ATTR_OUTPUT_FIELD_MASK) == SCSI_ATTR_OUTPUT_FIELD_NONE) return; if ((output_flags & SCSI_ATTR_OUTPUT_FIELD_DESC) && (desc != NULL)) { sbuf_printf(sb, "%s", desc); need_space = 1; } if (output_flags & SCSI_ATTR_OUTPUT_FIELD_NUM) { sbuf_printf(sb, "%s(0x%.4x)", (need_space) ? " " : "", id); need_space = 0; } if (output_flags & SCSI_ATTR_OUTPUT_FIELD_SIZE) { sbuf_printf(sb, "%s[%d]", (need_space) ? " " : "", len); need_space = 0; } if (output_flags & SCSI_ATTR_OUTPUT_FIELD_RW) { sbuf_printf(sb, "%s(%s)", (need_space) ? " " : "", (hdr->byte2 & SMA_READ_ONLY) ? "RO" : "RW"); } sbuf_printf(sb, ": "); } int scsi_attrib_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, struct scsi_attrib_table_entry *user_table, size_t num_user_entries, int prefer_user_table, uint32_t output_flags, char *error_str, int error_str_len) { int retval; struct scsi_attrib_table_entry *table1 = NULL, *table2 = NULL; struct scsi_attrib_table_entry *entry = NULL; size_t table1_size = 0, table2_size = 0; uint32_t id; retval = 0; if (valid_len < sizeof(*hdr)) { retval = 1; goto bailout; } id = scsi_2btoul(hdr->id); if (user_table != NULL) { if (prefer_user_table != 0) { table1 = user_table; table1_size = num_user_entries; table2 = scsi_mam_attr_table; table2_size = nitems(scsi_mam_attr_table); } else { table1 = scsi_mam_attr_table; table1_size = nitems(scsi_mam_attr_table); table2 = user_table; table2_size = num_user_entries; } } else { table1 = scsi_mam_attr_table; table1_size = nitems(scsi_mam_attr_table); } entry = scsi_find_attrib_entry(table1, table1_size, id); if (entry != NULL) { scsi_attrib_prefix_sbuf(sb, output_flags, hdr, valid_len, entry->desc); if (entry->to_str == NULL) goto print_default; retval = entry->to_str(sb, hdr, valid_len, entry->flags, output_flags, error_str, error_str_len); goto bailout; } if (table2 != NULL) { entry = scsi_find_attrib_entry(table2, table2_size, id); if (entry != NULL) { if (entry->to_str == NULL) goto print_default; scsi_attrib_prefix_sbuf(sb, output_flags, hdr, valid_len, entry->desc); retval = entry->to_str(sb, hdr, valid_len, entry->flags, output_flags, error_str, error_str_len); goto bailout; } } scsi_attrib_prefix_sbuf(sb, output_flags, hdr, valid_len, NULL); print_default: retval = scsi_attrib_value_sbuf(sb, valid_len, hdr, output_flags, error_str, error_str_len); bailout: if (retval == 0) { if ((entry != NULL) && (entry->suffix != NULL)) sbuf_printf(sb, " %s", entry->suffix); sbuf_trim(sb); sbuf_printf(sb, "\n"); } return (retval); } void scsi_test_unit_ready(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t sense_len, u_int32_t timeout) { struct scsi_test_unit_ready *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, CAM_DIR_NONE, tag_action, /*data_ptr*/NULL, /*dxfer_len*/0, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_test_unit_ready *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = TEST_UNIT_READY; } void scsi_request_sense(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), void *data_ptr, u_int8_t dxfer_len, u_int8_t tag_action, u_int8_t sense_len, u_int32_t timeout) { struct scsi_request_sense *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, CAM_DIR_IN, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_request_sense *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = REQUEST_SENSE; scsi_cmd->length = dxfer_len; } void scsi_inquiry(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t *inq_buf, u_int32_t inq_len, int evpd, u_int8_t page_code, u_int8_t sense_len, u_int32_t timeout) { struct scsi_inquiry *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, /*data_ptr*/inq_buf, /*dxfer_len*/inq_len, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_inquiry *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = INQUIRY; if (evpd) { scsi_cmd->byte2 |= SI_EVPD; scsi_cmd->page_code = page_code; } scsi_ulto2b(inq_len, scsi_cmd->length); } void scsi_mode_sense(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int dbd, uint8_t pc, uint8_t page, uint8_t *param_buf, uint32_t param_len, uint8_t sense_len, uint32_t timeout) { scsi_mode_sense_subpage(csio, retries, cbfcnp, tag_action, dbd, pc, page, 0, param_buf, param_len, 0, sense_len, timeout); } void scsi_mode_sense_len(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int dbd, uint8_t pc, uint8_t page, uint8_t *param_buf, uint32_t param_len, int minimum_cmd_size, uint8_t sense_len, uint32_t timeout) { scsi_mode_sense_subpage(csio, retries, cbfcnp, tag_action, dbd, pc, page, 0, param_buf, param_len, minimum_cmd_size, sense_len, timeout); } void scsi_mode_sense_subpage(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int dbd, uint8_t pc, uint8_t page, uint8_t subpage, uint8_t *param_buf, uint32_t param_len, int minimum_cmd_size, uint8_t sense_len, uint32_t timeout) { u_int8_t cdb_len; /* * Use the smallest possible command to perform the operation. */ if ((param_len < 256) && (minimum_cmd_size < 10)) { /* * We can fit in a 6 byte cdb. */ struct scsi_mode_sense_6 *scsi_cmd; scsi_cmd = (struct scsi_mode_sense_6 *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = MODE_SENSE_6; if (dbd != 0) scsi_cmd->byte2 |= SMS_DBD; scsi_cmd->page = pc | page; scsi_cmd->subpage = subpage; scsi_cmd->length = param_len; cdb_len = sizeof(*scsi_cmd); } else { /* * Need a 10 byte cdb. */ struct scsi_mode_sense_10 *scsi_cmd; scsi_cmd = (struct scsi_mode_sense_10 *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = MODE_SENSE_10; if (dbd != 0) scsi_cmd->byte2 |= SMS_DBD; scsi_cmd->page = pc | page; scsi_cmd->subpage = subpage; scsi_ulto2b(param_len, scsi_cmd->length); cdb_len = sizeof(*scsi_cmd); } cam_fill_csio(csio, retries, cbfcnp, CAM_DIR_IN, tag_action, param_buf, param_len, sense_len, cdb_len, timeout); } void scsi_mode_select(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int scsi_page_fmt, int save_pages, u_int8_t *param_buf, u_int32_t param_len, u_int8_t sense_len, u_int32_t timeout) { scsi_mode_select_len(csio, retries, cbfcnp, tag_action, scsi_page_fmt, save_pages, param_buf, param_len, 0, sense_len, timeout); } void scsi_mode_select_len(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int scsi_page_fmt, int save_pages, u_int8_t *param_buf, u_int32_t param_len, int minimum_cmd_size, u_int8_t sense_len, u_int32_t timeout) { u_int8_t cdb_len; /* * Use the smallest possible command to perform the operation. */ if ((param_len < 256) && (minimum_cmd_size < 10)) { /* * We can fit in a 6 byte cdb. */ struct scsi_mode_select_6 *scsi_cmd; scsi_cmd = (struct scsi_mode_select_6 *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = MODE_SELECT_6; if (scsi_page_fmt != 0) scsi_cmd->byte2 |= SMS_PF; if (save_pages != 0) scsi_cmd->byte2 |= SMS_SP; scsi_cmd->length = param_len; cdb_len = sizeof(*scsi_cmd); } else { /* * Need a 10 byte cdb. */ struct scsi_mode_select_10 *scsi_cmd; scsi_cmd = (struct scsi_mode_select_10 *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = MODE_SELECT_10; if (scsi_page_fmt != 0) scsi_cmd->byte2 |= SMS_PF; if (save_pages != 0) scsi_cmd->byte2 |= SMS_SP; scsi_ulto2b(param_len, scsi_cmd->length); cdb_len = sizeof(*scsi_cmd); } cam_fill_csio(csio, retries, cbfcnp, CAM_DIR_OUT, tag_action, param_buf, param_len, sense_len, cdb_len, timeout); } void scsi_log_sense(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t page_code, u_int8_t page, int save_pages, int ppc, u_int32_t paramptr, u_int8_t *param_buf, u_int32_t param_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_log_sense *scsi_cmd; u_int8_t cdb_len; scsi_cmd = (struct scsi_log_sense *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = LOG_SENSE; scsi_cmd->page = page_code | page; if (save_pages != 0) scsi_cmd->byte2 |= SLS_SP; if (ppc != 0) scsi_cmd->byte2 |= SLS_PPC; scsi_ulto2b(paramptr, scsi_cmd->paramptr); scsi_ulto2b(param_len, scsi_cmd->length); cdb_len = sizeof(*scsi_cmd); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, /*data_ptr*/param_buf, /*dxfer_len*/param_len, sense_len, cdb_len, timeout); } void scsi_log_select(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t page_code, int save_pages, int pc_reset, u_int8_t *param_buf, u_int32_t param_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_log_select *scsi_cmd; u_int8_t cdb_len; scsi_cmd = (struct scsi_log_select *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = LOG_SELECT; scsi_cmd->page = page_code & SLS_PAGE_CODE; if (save_pages != 0) scsi_cmd->byte2 |= SLS_SP; if (pc_reset != 0) scsi_cmd->byte2 |= SLS_PCR; scsi_ulto2b(param_len, scsi_cmd->length); cdb_len = sizeof(*scsi_cmd); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action, /*data_ptr*/param_buf, /*dxfer_len*/param_len, sense_len, cdb_len, timeout); } /* * Prevent or allow the user to remove the media */ void scsi_prevent(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t action, u_int8_t sense_len, u_int32_t timeout) { struct scsi_prevent *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_NONE, tag_action, /*data_ptr*/NULL, /*dxfer_len*/0, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_prevent *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = PREVENT_ALLOW; scsi_cmd->how = action; } /* XXX allow specification of address and PMI bit and LBA */ void scsi_read_capacity(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, struct scsi_read_capacity_data *rcap_buf, u_int8_t sense_len, u_int32_t timeout) { struct scsi_read_capacity *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, /*data_ptr*/(u_int8_t *)rcap_buf, /*dxfer_len*/sizeof(*rcap_buf), sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_read_capacity *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = READ_CAPACITY; } void scsi_read_capacity_16(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint64_t lba, int reladr, int pmi, uint8_t *rcap_buf, int rcap_buf_len, uint8_t sense_len, uint32_t timeout) { struct scsi_read_capacity_16 *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, /*data_ptr*/(u_int8_t *)rcap_buf, /*dxfer_len*/rcap_buf_len, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_read_capacity_16 *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = SERVICE_ACTION_IN; scsi_cmd->service_action = SRC16_SERVICE_ACTION; scsi_u64to8b(lba, scsi_cmd->addr); scsi_ulto4b(rcap_buf_len, scsi_cmd->alloc_len); if (pmi) reladr |= SRC16_PMI; if (reladr) reladr |= SRC16_RELADR; } void scsi_report_luns(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t select_report, struct scsi_report_luns_data *rpl_buf, u_int32_t alloc_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_report_luns *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, /*data_ptr*/(u_int8_t *)rpl_buf, /*dxfer_len*/alloc_len, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_report_luns *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = REPORT_LUNS; scsi_cmd->select_report = select_report; scsi_ulto4b(alloc_len, scsi_cmd->length); } void scsi_report_target_group(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t pdf, void *buf, u_int32_t alloc_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_target_group *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, /*data_ptr*/(u_int8_t *)buf, /*dxfer_len*/alloc_len, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_target_group *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = MAINTENANCE_IN; scsi_cmd->service_action = REPORT_TARGET_PORT_GROUPS | pdf; scsi_ulto4b(alloc_len, scsi_cmd->length); } void scsi_report_timestamp(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t pdf, void *buf, u_int32_t alloc_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_timestamp *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, /*data_ptr*/(u_int8_t *)buf, /*dxfer_len*/alloc_len, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_timestamp *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = MAINTENANCE_IN; scsi_cmd->service_action = REPORT_TIMESTAMP | pdf; scsi_ulto4b(alloc_len, scsi_cmd->length); } void scsi_set_target_group(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, void *buf, u_int32_t alloc_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_target_group *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action, /*data_ptr*/(u_int8_t *)buf, /*dxfer_len*/alloc_len, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_target_group *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = MAINTENANCE_OUT; scsi_cmd->service_action = SET_TARGET_PORT_GROUPS; scsi_ulto4b(alloc_len, scsi_cmd->length); } void scsi_create_timestamp(uint8_t *timestamp_6b_buf, uint64_t timestamp) { uint8_t buf[8]; scsi_u64to8b(timestamp, buf); /* * Using memcopy starting at buf[2] because the set timestamp parameters * only has six bytes for the timestamp to fit into, and we don't have a * scsi_u64to6b function. */ memcpy(timestamp_6b_buf, &buf[2], 6); } void scsi_set_timestamp(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, void *buf, u_int32_t alloc_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_timestamp *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action, /*data_ptr*/(u_int8_t *) buf, /*dxfer_len*/alloc_len, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_timestamp *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = MAINTENANCE_OUT; scsi_cmd->service_action = SET_TIMESTAMP; scsi_ulto4b(alloc_len, scsi_cmd->length); } /* * Syncronize the media to the contents of the cache for * the given lba/count pair. Specifying 0/0 means sync * the whole cache. */ void scsi_synchronize_cache(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int32_t begin_lba, u_int16_t lb_count, u_int8_t sense_len, u_int32_t timeout) { struct scsi_sync_cache *scsi_cmd; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_NONE, tag_action, /*data_ptr*/NULL, /*dxfer_len*/0, sense_len, sizeof(*scsi_cmd), timeout); scsi_cmd = (struct scsi_sync_cache *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = SYNCHRONIZE_CACHE; scsi_ulto4b(begin_lba, scsi_cmd->begin_lba); scsi_ulto2b(lb_count, scsi_cmd->lb_count); } void scsi_read_write(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int readop, u_int8_t byte2, int minimum_cmd_size, u_int64_t lba, u_int32_t block_count, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { int read; u_int8_t cdb_len; read = (readop & SCSI_RW_DIRMASK) == SCSI_RW_READ; /* * Use the smallest possible command to perform the operation * as some legacy hardware does not support the 10 byte commands. * If any of the bits in byte2 is set, we have to go with a larger * command. */ if ((minimum_cmd_size < 10) && ((lba & 0x1fffff) == lba) && ((block_count & 0xff) == block_count) && (byte2 == 0)) { /* * We can fit in a 6 byte cdb. */ struct scsi_rw_6 *scsi_cmd; scsi_cmd = (struct scsi_rw_6 *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = read ? READ_6 : WRITE_6; scsi_ulto3b(lba, scsi_cmd->addr); scsi_cmd->length = block_count & 0xff; scsi_cmd->control = 0; cdb_len = sizeof(*scsi_cmd); CAM_DEBUG(csio->ccb_h.path, CAM_DEBUG_SUBTRACE, ("6byte: %x%x%x:%d:%d\n", scsi_cmd->addr[0], scsi_cmd->addr[1], scsi_cmd->addr[2], scsi_cmd->length, dxfer_len)); } else if ((minimum_cmd_size < 12) && ((block_count & 0xffff) == block_count) && ((lba & 0xffffffff) == lba)) { /* * Need a 10 byte cdb. */ struct scsi_rw_10 *scsi_cmd; scsi_cmd = (struct scsi_rw_10 *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = read ? READ_10 : WRITE_10; scsi_cmd->byte2 = byte2; scsi_ulto4b(lba, scsi_cmd->addr); scsi_cmd->reserved = 0; scsi_ulto2b(block_count, scsi_cmd->length); scsi_cmd->control = 0; cdb_len = sizeof(*scsi_cmd); CAM_DEBUG(csio->ccb_h.path, CAM_DEBUG_SUBTRACE, ("10byte: %x%x%x%x:%x%x: %d\n", scsi_cmd->addr[0], scsi_cmd->addr[1], scsi_cmd->addr[2], scsi_cmd->addr[3], scsi_cmd->length[0], scsi_cmd->length[1], dxfer_len)); } else if ((minimum_cmd_size < 16) && ((block_count & 0xffffffff) == block_count) && ((lba & 0xffffffff) == lba)) { /* * The block count is too big for a 10 byte CDB, use a 12 * byte CDB. */ struct scsi_rw_12 *scsi_cmd; scsi_cmd = (struct scsi_rw_12 *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = read ? READ_12 : WRITE_12; scsi_cmd->byte2 = byte2; scsi_ulto4b(lba, scsi_cmd->addr); scsi_cmd->reserved = 0; scsi_ulto4b(block_count, scsi_cmd->length); scsi_cmd->control = 0; cdb_len = sizeof(*scsi_cmd); CAM_DEBUG(csio->ccb_h.path, CAM_DEBUG_SUBTRACE, ("12byte: %x%x%x%x:%x%x%x%x: %d\n", scsi_cmd->addr[0], scsi_cmd->addr[1], scsi_cmd->addr[2], scsi_cmd->addr[3], scsi_cmd->length[0], scsi_cmd->length[1], scsi_cmd->length[2], scsi_cmd->length[3], dxfer_len)); } else { /* * 16 byte CDB. We'll only get here if the LBA is larger * than 2^32, or if the user asks for a 16 byte command. */ struct scsi_rw_16 *scsi_cmd; scsi_cmd = (struct scsi_rw_16 *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = read ? READ_16 : WRITE_16; scsi_cmd->byte2 = byte2; scsi_u64to8b(lba, scsi_cmd->addr); scsi_cmd->reserved = 0; scsi_ulto4b(block_count, scsi_cmd->length); scsi_cmd->control = 0; cdb_len = sizeof(*scsi_cmd); } cam_fill_csio(csio, retries, cbfcnp, (read ? CAM_DIR_IN : CAM_DIR_OUT) | ((readop & SCSI_RW_BIO) != 0 ? CAM_DATA_BIO : 0), tag_action, data_ptr, dxfer_len, sense_len, cdb_len, timeout); } void scsi_write_same(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t byte2, int minimum_cmd_size, u_int64_t lba, u_int32_t block_count, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { u_int8_t cdb_len; if ((minimum_cmd_size < 16) && ((block_count & 0xffff) == block_count) && ((lba & 0xffffffff) == lba)) { /* * Need a 10 byte cdb. */ struct scsi_write_same_10 *scsi_cmd; scsi_cmd = (struct scsi_write_same_10 *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = WRITE_SAME_10; scsi_cmd->byte2 = byte2; scsi_ulto4b(lba, scsi_cmd->addr); scsi_cmd->group = 0; scsi_ulto2b(block_count, scsi_cmd->length); scsi_cmd->control = 0; cdb_len = sizeof(*scsi_cmd); CAM_DEBUG(csio->ccb_h.path, CAM_DEBUG_SUBTRACE, ("10byte: %x%x%x%x:%x%x: %d\n", scsi_cmd->addr[0], scsi_cmd->addr[1], scsi_cmd->addr[2], scsi_cmd->addr[3], scsi_cmd->length[0], scsi_cmd->length[1], dxfer_len)); } else { /* * 16 byte CDB. We'll only get here if the LBA is larger * than 2^32, or if the user asks for a 16 byte command. */ struct scsi_write_same_16 *scsi_cmd; scsi_cmd = (struct scsi_write_same_16 *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = WRITE_SAME_16; scsi_cmd->byte2 = byte2; scsi_u64to8b(lba, scsi_cmd->addr); scsi_ulto4b(block_count, scsi_cmd->length); scsi_cmd->group = 0; scsi_cmd->control = 0; cdb_len = sizeof(*scsi_cmd); CAM_DEBUG(csio->ccb_h.path, CAM_DEBUG_SUBTRACE, ("16byte: %x%x%x%x%x%x%x%x:%x%x%x%x: %d\n", scsi_cmd->addr[0], scsi_cmd->addr[1], scsi_cmd->addr[2], scsi_cmd->addr[3], scsi_cmd->addr[4], scsi_cmd->addr[5], scsi_cmd->addr[6], scsi_cmd->addr[7], scsi_cmd->length[0], scsi_cmd->length[1], scsi_cmd->length[2], scsi_cmd->length[3], dxfer_len)); } cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action, data_ptr, dxfer_len, sense_len, cdb_len, timeout); } void scsi_ata_identify(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t *data_ptr, u_int16_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { scsi_ata_pass(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, /*protocol*/AP_PROTO_PIO_IN, /*ata_flags*/AP_FLAG_TDIR_FROM_DEV | AP_FLAG_BYT_BLOK_BYTES | AP_FLAG_TLEN_SECT_CNT, /*features*/0, /*sector_count*/dxfer_len, /*lba*/0, /*command*/ATA_ATA_IDENTIFY, /*device*/ 0, /*icc*/ 0, /*auxiliary*/ 0, /*control*/0, data_ptr, dxfer_len, /*cdb_storage*/ NULL, /*cdb_storage_len*/ 0, /*minimum_cmd_size*/ 0, sense_len, timeout); } void scsi_ata_trim(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int16_t block_count, u_int8_t *data_ptr, u_int16_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { scsi_ata_pass_16(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action, /*protocol*/AP_EXTEND|AP_PROTO_DMA, /*ata_flags*/AP_FLAG_TLEN_SECT_CNT|AP_FLAG_BYT_BLOK_BLOCKS, /*features*/ATA_DSM_TRIM, /*sector_count*/block_count, /*lba*/0, /*command*/ATA_DATA_SET_MANAGEMENT, /*control*/0, data_ptr, dxfer_len, sense_len, timeout); } int scsi_ata_read_log(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint32_t log_address, uint32_t page_number, uint16_t block_count, uint8_t protocol, uint8_t *data_ptr, uint32_t dxfer_len, uint8_t sense_len, uint32_t timeout) { uint8_t command, protocol_out; uint16_t count_out; uint64_t lba; int retval; retval = 0; switch (protocol) { case AP_PROTO_DMA: count_out = block_count; command = ATA_READ_LOG_DMA_EXT; protocol_out = AP_PROTO_DMA; break; case AP_PROTO_PIO_IN: default: count_out = block_count; command = ATA_READ_LOG_EXT; protocol_out = AP_PROTO_PIO_IN; break; } lba = (((uint64_t)page_number & 0xff00) << 32) | ((page_number & 0x00ff) << 8) | (log_address & 0xff); protocol_out |= AP_EXTEND; retval = scsi_ata_pass(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, /*protocol*/ protocol_out, /*ata_flags*/AP_FLAG_TLEN_SECT_CNT | AP_FLAG_BYT_BLOK_BLOCKS | AP_FLAG_TDIR_FROM_DEV, /*feature*/ 0, /*sector_count*/ count_out, /*lba*/ lba, /*command*/ command, /*device*/ 0, /*icc*/ 0, /*auxiliary*/ 0, /*control*/0, data_ptr, dxfer_len, /*cdb_storage*/ NULL, /*cdb_storage_len*/ 0, /*minimum_cmd_size*/ 0, sense_len, timeout); return (retval); } int scsi_ata_setfeatures(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint8_t feature, uint64_t lba, uint32_t count, uint8_t sense_len, uint32_t timeout) { return (scsi_ata_pass(csio, retries, cbfcnp, /*flags*/CAM_DIR_NONE, tag_action, /*protocol*/AP_PROTO_PIO_IN, /*ata_flags*/AP_FLAG_TDIR_FROM_DEV | AP_FLAG_BYT_BLOK_BYTES | AP_FLAG_TLEN_SECT_CNT, /*features*/feature, /*sector_count*/count, /*lba*/lba, /*command*/ATA_SETFEATURES, /*device*/ 0, /*icc*/ 0, /*auxiliary*/0, /*control*/0, /*data_ptr*/NULL, /*dxfer_len*/0, /*cdb_storage*/NULL, /*cdb_storage_len*/0, /*minimum_cmd_size*/0, sense_len, timeout)); } /* * Note! This is an unusual CDB building function because it can return * an error in the event that the command in question requires a variable * length CDB, but the caller has not given storage space for one or has not * given enough storage space. If there is enough space available in the * standard SCSI CCB CDB bytes, we'll prefer that over passed in storage. */ int scsi_ata_pass(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint32_t flags, uint8_t tag_action, uint8_t protocol, uint8_t ata_flags, uint16_t features, uint16_t sector_count, uint64_t lba, uint8_t command, uint8_t device, uint8_t icc, uint32_t auxiliary, uint8_t control, u_int8_t *data_ptr, uint32_t dxfer_len, uint8_t *cdb_storage, size_t cdb_storage_len, int minimum_cmd_size, u_int8_t sense_len, u_int32_t timeout) { uint32_t cam_flags; uint8_t *cdb_ptr; int cmd_size; int retval; uint8_t cdb_len; retval = 0; cam_flags = flags; /* * Round the user's request to the nearest command size that is at * least as big as what he requested. */ if (minimum_cmd_size <= 12) cmd_size = 12; else if (minimum_cmd_size > 16) cmd_size = 32; else cmd_size = 16; /* * If we have parameters that require a 48-bit ATA command, we have to * use the 16 byte ATA PASS-THROUGH command at least. */ if (((lba > ATA_MAX_28BIT_LBA) || (sector_count > 255) || (features > 255) || (protocol & AP_EXTEND)) && ((cmd_size < 16) || ((protocol & AP_EXTEND) == 0))) { if (cmd_size < 16) cmd_size = 16; protocol |= AP_EXTEND; } /* * The icc and auxiliary ATA registers are only supported in the * 32-byte version of the ATA PASS-THROUGH command. */ if ((icc != 0) || (auxiliary != 0)) { cmd_size = 32; protocol |= AP_EXTEND; } if ((cmd_size > sizeof(csio->cdb_io.cdb_bytes)) && ((cdb_storage == NULL) || (cdb_storage_len < cmd_size))) { retval = 1; goto bailout; } /* * At this point we know we have enough space to store the command * in one place or another. We prefer the built-in array, but used * the passed in storage if necessary. */ if (cmd_size <= sizeof(csio->cdb_io.cdb_bytes)) cdb_ptr = csio->cdb_io.cdb_bytes; else { cdb_ptr = cdb_storage; cam_flags |= CAM_CDB_POINTER; } if (cmd_size <= 12) { struct ata_pass_12 *cdb; cdb = (struct ata_pass_12 *)cdb_ptr; cdb_len = sizeof(*cdb); bzero(cdb, cdb_len); cdb->opcode = ATA_PASS_12; cdb->protocol = protocol; cdb->flags = ata_flags; cdb->features = features; cdb->sector_count = sector_count; cdb->lba_low = lba & 0xff; cdb->lba_mid = (lba >> 8) & 0xff; cdb->lba_high = (lba >> 16) & 0xff; cdb->device = ((lba >> 24) & 0xf) | ATA_DEV_LBA; cdb->command = command; cdb->control = control; } else if (cmd_size <= 16) { struct ata_pass_16 *cdb; cdb = (struct ata_pass_16 *)cdb_ptr; cdb_len = sizeof(*cdb); bzero(cdb, cdb_len); cdb->opcode = ATA_PASS_16; cdb->protocol = protocol; cdb->flags = ata_flags; cdb->features = features & 0xff; cdb->sector_count = sector_count & 0xff; cdb->lba_low = lba & 0xff; cdb->lba_mid = (lba >> 8) & 0xff; cdb->lba_high = (lba >> 16) & 0xff; /* * If AP_EXTEND is set, we're sending a 48-bit command. * Otherwise it's a 28-bit command. */ if (protocol & AP_EXTEND) { cdb->lba_low_ext = (lba >> 24) & 0xff; cdb->lba_mid_ext = (lba >> 32) & 0xff; cdb->lba_high_ext = (lba >> 40) & 0xff; cdb->features_ext = (features >> 8) & 0xff; cdb->sector_count_ext = (sector_count >> 8) & 0xff; cdb->device = device | ATA_DEV_LBA; } else { cdb->lba_low_ext = (lba >> 24) & 0xf; cdb->device = ((lba >> 24) & 0xf) | ATA_DEV_LBA; } cdb->command = command; cdb->control = control; } else { struct ata_pass_32 *cdb; uint8_t tmp_lba[8]; cdb = (struct ata_pass_32 *)cdb_ptr; cdb_len = sizeof(*cdb); bzero(cdb, cdb_len); cdb->opcode = VARIABLE_LEN_CDB; cdb->control = control; cdb->length = sizeof(*cdb) - __offsetof(struct ata_pass_32, service_action); scsi_ulto2b(ATA_PASS_32_SA, cdb->service_action); cdb->protocol = protocol; cdb->flags = ata_flags; if ((protocol & AP_EXTEND) == 0) { lba &= 0x0fffffff; cdb->device = ((lba >> 24) & 0xf) | ATA_DEV_LBA; features &= 0xff; sector_count &= 0xff; } else { cdb->device = device | ATA_DEV_LBA; } scsi_u64to8b(lba, tmp_lba); bcopy(&tmp_lba[2], cdb->lba, sizeof(cdb->lba)); scsi_ulto2b(features, cdb->features); scsi_ulto2b(sector_count, cdb->count); cdb->command = command; cdb->icc = icc; scsi_ulto4b(auxiliary, cdb->auxiliary); } cam_fill_csio(csio, retries, cbfcnp, cam_flags, tag_action, data_ptr, dxfer_len, sense_len, cmd_size, timeout); bailout: return (retval); } void scsi_ata_pass_16(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int32_t flags, u_int8_t tag_action, u_int8_t protocol, u_int8_t ata_flags, u_int16_t features, u_int16_t sector_count, uint64_t lba, u_int8_t command, u_int8_t control, u_int8_t *data_ptr, u_int16_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { struct ata_pass_16 *ata_cmd; ata_cmd = (struct ata_pass_16 *)&csio->cdb_io.cdb_bytes; ata_cmd->opcode = ATA_PASS_16; ata_cmd->protocol = protocol; ata_cmd->flags = ata_flags; ata_cmd->features_ext = features >> 8; ata_cmd->features = features; ata_cmd->sector_count_ext = sector_count >> 8; ata_cmd->sector_count = sector_count; ata_cmd->lba_low = lba; ata_cmd->lba_mid = lba >> 8; ata_cmd->lba_high = lba >> 16; ata_cmd->device = ATA_DEV_LBA; if (protocol & AP_EXTEND) { ata_cmd->lba_low_ext = lba >> 24; ata_cmd->lba_mid_ext = lba >> 32; ata_cmd->lba_high_ext = lba >> 40; } else ata_cmd->device |= (lba >> 24) & 0x0f; ata_cmd->command = command; ata_cmd->control = control; cam_fill_csio(csio, retries, cbfcnp, flags, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*ata_cmd), timeout); } void scsi_unmap(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t byte2, u_int8_t *data_ptr, u_int16_t dxfer_len, u_int8_t sense_len, u_int32_t timeout) { struct scsi_unmap *scsi_cmd; scsi_cmd = (struct scsi_unmap *)&csio->cdb_io.cdb_bytes; scsi_cmd->opcode = UNMAP; scsi_cmd->byte2 = byte2; scsi_ulto4b(0, scsi_cmd->reserved); scsi_cmd->group = 0; scsi_ulto2b(dxfer_len, scsi_cmd->length); scsi_cmd->control = 0; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_receive_diagnostic_results(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb*), uint8_t tag_action, int pcv, uint8_t page_code, uint8_t *data_ptr, uint16_t allocation_length, uint8_t sense_len, uint32_t timeout) { struct scsi_receive_diag *scsi_cmd; scsi_cmd = (struct scsi_receive_diag *)&csio->cdb_io.cdb_bytes; memset(scsi_cmd, 0, sizeof(*scsi_cmd)); scsi_cmd->opcode = RECEIVE_DIAGNOSTIC; if (pcv) { scsi_cmd->byte2 |= SRD_PCV; scsi_cmd->page_code = page_code; } scsi_ulto2b(allocation_length, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, data_ptr, allocation_length, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_send_diagnostic(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int unit_offline, int device_offline, int self_test, int page_format, int self_test_code, uint8_t *data_ptr, uint16_t param_list_length, uint8_t sense_len, uint32_t timeout) { struct scsi_send_diag *scsi_cmd; scsi_cmd = (struct scsi_send_diag *)&csio->cdb_io.cdb_bytes; memset(scsi_cmd, 0, sizeof(*scsi_cmd)); scsi_cmd->opcode = SEND_DIAGNOSTIC; /* * The default self-test mode control and specific test * control are mutually exclusive. */ if (self_test) self_test_code = SSD_SELF_TEST_CODE_NONE; scsi_cmd->byte2 = ((self_test_code << SSD_SELF_TEST_CODE_SHIFT) & SSD_SELF_TEST_CODE_MASK) | (unit_offline ? SSD_UNITOFFL : 0) | (device_offline ? SSD_DEVOFFL : 0) | (self_test ? SSD_SELFTEST : 0) | (page_format ? SSD_PF : 0); scsi_ulto2b(param_list_length, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/param_list_length ? CAM_DIR_OUT : CAM_DIR_NONE, tag_action, data_ptr, param_list_length, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_read_buffer(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb*), uint8_t tag_action, int mode, uint8_t buffer_id, u_int32_t offset, uint8_t *data_ptr, uint32_t allocation_length, uint8_t sense_len, uint32_t timeout) { struct scsi_read_buffer *scsi_cmd; scsi_cmd = (struct scsi_read_buffer *)&csio->cdb_io.cdb_bytes; memset(scsi_cmd, 0, sizeof(*scsi_cmd)); scsi_cmd->opcode = READ_BUFFER; scsi_cmd->byte2 = mode; scsi_cmd->buffer_id = buffer_id; scsi_ulto3b(offset, scsi_cmd->offset); scsi_ulto3b(allocation_length, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, data_ptr, allocation_length, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_write_buffer(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int mode, uint8_t buffer_id, u_int32_t offset, uint8_t *data_ptr, uint32_t param_list_length, uint8_t sense_len, uint32_t timeout) { struct scsi_write_buffer *scsi_cmd; scsi_cmd = (struct scsi_write_buffer *)&csio->cdb_io.cdb_bytes; memset(scsi_cmd, 0, sizeof(*scsi_cmd)); scsi_cmd->opcode = WRITE_BUFFER; scsi_cmd->byte2 = mode; scsi_cmd->buffer_id = buffer_id; scsi_ulto3b(offset, scsi_cmd->offset); scsi_ulto3b(param_list_length, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/param_list_length ? CAM_DIR_OUT : CAM_DIR_NONE, tag_action, data_ptr, param_list_length, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_start_stop(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int start, int load_eject, int immediate, u_int8_t sense_len, u_int32_t timeout) { struct scsi_start_stop_unit *scsi_cmd; int extra_flags = 0; scsi_cmd = (struct scsi_start_stop_unit *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = START_STOP_UNIT; if (start != 0) { scsi_cmd->how |= SSS_START; /* it takes a lot of power to start a drive */ extra_flags |= CAM_HIGH_POWER; } if (load_eject != 0) scsi_cmd->how |= SSS_LOEJ; if (immediate != 0) scsi_cmd->byte2 |= SSS_IMMED; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_NONE | extra_flags, tag_action, /*data_ptr*/NULL, /*dxfer_len*/0, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_read_attribute(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t service_action, uint32_t element, u_int8_t elem_type, int logical_volume, int partition, u_int32_t first_attribute, int cache, u_int8_t *data_ptr, u_int32_t length, int sense_len, u_int32_t timeout) { struct scsi_read_attribute *scsi_cmd; scsi_cmd = (struct scsi_read_attribute *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = READ_ATTRIBUTE; scsi_cmd->service_action = service_action; scsi_ulto2b(element, scsi_cmd->element); scsi_cmd->elem_type = elem_type; scsi_cmd->logical_volume = logical_volume; scsi_cmd->partition = partition; scsi_ulto2b(first_attribute, scsi_cmd->first_attribute); scsi_ulto4b(length, scsi_cmd->length); if (cache != 0) scsi_cmd->cache |= SRA_CACHE; cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, /*data_ptr*/data_ptr, /*dxfer_len*/length, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_write_attribute(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, uint32_t element, int logical_volume, int partition, int wtc, u_int8_t *data_ptr, u_int32_t length, int sense_len, u_int32_t timeout) { struct scsi_write_attribute *scsi_cmd; scsi_cmd = (struct scsi_write_attribute *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = WRITE_ATTRIBUTE; if (wtc != 0) scsi_cmd->byte2 = SWA_WTC; scsi_ulto3b(element, scsi_cmd->element); scsi_cmd->logical_volume = logical_volume; scsi_cmd->partition = partition; scsi_ulto4b(length, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action, /*data_ptr*/data_ptr, /*dxfer_len*/length, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_persistent_reserve_in(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int service_action, uint8_t *data_ptr, uint32_t dxfer_len, int sense_len, int timeout) { struct scsi_per_res_in *scsi_cmd; scsi_cmd = (struct scsi_per_res_in *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = PERSISTENT_RES_IN; scsi_cmd->action = service_action; scsi_ulto2b(dxfer_len, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_persistent_reserve_out(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int service_action, int scope, int res_type, uint8_t *data_ptr, uint32_t dxfer_len, int sense_len, int timeout) { struct scsi_per_res_out *scsi_cmd; scsi_cmd = (struct scsi_per_res_out *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = PERSISTENT_RES_OUT; scsi_cmd->action = service_action; scsi_cmd->scope_type = scope | res_type; scsi_ulto4b(dxfer_len, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action, /*data_ptr*/data_ptr, /*dxfer_len*/dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_security_protocol_in(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint32_t security_protocol, uint32_t security_protocol_specific, int byte4, uint8_t *data_ptr, uint32_t dxfer_len, int sense_len, int timeout) { struct scsi_security_protocol_in *scsi_cmd; scsi_cmd = (struct scsi_security_protocol_in *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = SECURITY_PROTOCOL_IN; scsi_cmd->security_protocol = security_protocol; scsi_ulto2b(security_protocol_specific, scsi_cmd->security_protocol_specific); scsi_cmd->byte4 = byte4; scsi_ulto4b(dxfer_len, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_security_protocol_out(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint32_t security_protocol, uint32_t security_protocol_specific, int byte4, uint8_t *data_ptr, uint32_t dxfer_len, int sense_len, int timeout) { struct scsi_security_protocol_out *scsi_cmd; scsi_cmd = (struct scsi_security_protocol_out *)&csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = SECURITY_PROTOCOL_OUT; scsi_cmd->security_protocol = security_protocol; scsi_ulto2b(security_protocol_specific, scsi_cmd->security_protocol_specific); scsi_cmd->byte4 = byte4; scsi_ulto4b(dxfer_len, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } void scsi_report_supported_opcodes(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int options, int req_opcode, int req_service_action, uint8_t *data_ptr, uint32_t dxfer_len, int sense_len, int timeout) { struct scsi_report_supported_opcodes *scsi_cmd; scsi_cmd = (struct scsi_report_supported_opcodes *) &csio->cdb_io.cdb_bytes; bzero(scsi_cmd, sizeof(*scsi_cmd)); scsi_cmd->opcode = MAINTENANCE_IN; scsi_cmd->service_action = REPORT_SUPPORTED_OPERATION_CODES; scsi_cmd->options = options; scsi_cmd->requested_opcode = req_opcode; scsi_ulto2b(req_service_action, scsi_cmd->requested_service_action); scsi_ulto4b(dxfer_len, scsi_cmd->length); cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action, data_ptr, dxfer_len, sense_len, sizeof(*scsi_cmd), timeout); } /* * Try make as good a match as possible with * available sub drivers */ int scsi_inquiry_match(caddr_t inqbuffer, caddr_t table_entry) { struct scsi_inquiry_pattern *entry; struct scsi_inquiry_data *inq; entry = (struct scsi_inquiry_pattern *)table_entry; inq = (struct scsi_inquiry_data *)inqbuffer; if (((SID_TYPE(inq) == entry->type) || (entry->type == T_ANY)) && (SID_IS_REMOVABLE(inq) ? entry->media_type & SIP_MEDIA_REMOVABLE : entry->media_type & SIP_MEDIA_FIXED) && (cam_strmatch(inq->vendor, entry->vendor, sizeof(inq->vendor)) == 0) && (cam_strmatch(inq->product, entry->product, sizeof(inq->product)) == 0) && (cam_strmatch(inq->revision, entry->revision, sizeof(inq->revision)) == 0)) { return (0); } return (-1); } /* * Try make as good a match as possible with * available sub drivers */ int scsi_static_inquiry_match(caddr_t inqbuffer, caddr_t table_entry) { struct scsi_static_inquiry_pattern *entry; struct scsi_inquiry_data *inq; entry = (struct scsi_static_inquiry_pattern *)table_entry; inq = (struct scsi_inquiry_data *)inqbuffer; if (((SID_TYPE(inq) == entry->type) || (entry->type == T_ANY)) && (SID_IS_REMOVABLE(inq) ? entry->media_type & SIP_MEDIA_REMOVABLE : entry->media_type & SIP_MEDIA_FIXED) && (cam_strmatch(inq->vendor, entry->vendor, sizeof(inq->vendor)) == 0) && (cam_strmatch(inq->product, entry->product, sizeof(inq->product)) == 0) && (cam_strmatch(inq->revision, entry->revision, sizeof(inq->revision)) == 0)) { return (0); } return (-1); } /** * Compare two buffers of vpd device descriptors for a match. * * \param lhs Pointer to first buffer of descriptors to compare. * \param lhs_len The length of the first buffer. * \param rhs Pointer to second buffer of descriptors to compare. * \param rhs_len The length of the second buffer. * * \return 0 on a match, -1 otherwise. * * Treat rhs and lhs as arrays of vpd device id descriptors. Walk lhs matching * against each element in rhs until all data are exhausted or we have found * a match. */ int scsi_devid_match(uint8_t *lhs, size_t lhs_len, uint8_t *rhs, size_t rhs_len) { struct scsi_vpd_id_descriptor *lhs_id; struct scsi_vpd_id_descriptor *lhs_last; struct scsi_vpd_id_descriptor *rhs_last; uint8_t *lhs_end; uint8_t *rhs_end; lhs_end = lhs + lhs_len; rhs_end = rhs + rhs_len; /* * rhs_last and lhs_last are the last posible position of a valid * descriptor assuming it had a zero length identifier. We use * these variables to insure we can safely dereference the length * field in our loop termination tests. */ lhs_last = (struct scsi_vpd_id_descriptor *) (lhs_end - __offsetof(struct scsi_vpd_id_descriptor, identifier)); rhs_last = (struct scsi_vpd_id_descriptor *) (rhs_end - __offsetof(struct scsi_vpd_id_descriptor, identifier)); lhs_id = (struct scsi_vpd_id_descriptor *)lhs; while (lhs_id <= lhs_last && (lhs_id->identifier + lhs_id->length) <= lhs_end) { struct scsi_vpd_id_descriptor *rhs_id; rhs_id = (struct scsi_vpd_id_descriptor *)rhs; while (rhs_id <= rhs_last && (rhs_id->identifier + rhs_id->length) <= rhs_end) { if ((rhs_id->id_type & (SVPD_ID_ASSOC_MASK | SVPD_ID_TYPE_MASK)) == (lhs_id->id_type & (SVPD_ID_ASSOC_MASK | SVPD_ID_TYPE_MASK)) && rhs_id->length == lhs_id->length && memcmp(rhs_id->identifier, lhs_id->identifier, rhs_id->length) == 0) return (0); rhs_id = (struct scsi_vpd_id_descriptor *) (rhs_id->identifier + rhs_id->length); } lhs_id = (struct scsi_vpd_id_descriptor *) (lhs_id->identifier + lhs_id->length); } return (-1); } #ifdef _KERNEL int scsi_vpd_supported_page(struct cam_periph *periph, uint8_t page_id) { struct cam_ed *device; struct scsi_vpd_supported_pages *vpds; int i, num_pages; device = periph->path->device; vpds = (struct scsi_vpd_supported_pages *)device->supported_vpds; if (vpds != NULL) { num_pages = device->supported_vpds_len - SVPD_SUPPORTED_PAGES_HDR_LEN; for (i = 0; i < num_pages; i++) { if (vpds->page_list[i] == page_id) return (1); } } return (0); } static void init_scsi_delay(void) { int delay; delay = SCSI_DELAY; TUNABLE_INT_FETCH("kern.cam.scsi_delay", &delay); if (set_scsi_delay(delay) != 0) { printf("cam: invalid value for tunable kern.cam.scsi_delay\n"); set_scsi_delay(SCSI_DELAY); } } SYSINIT(scsi_delay, SI_SUB_TUNABLES, SI_ORDER_ANY, init_scsi_delay, NULL); static int sysctl_scsi_delay(SYSCTL_HANDLER_ARGS) { int error, delay; delay = scsi_delay; error = sysctl_handle_int(oidp, &delay, 0, req); if (error != 0 || req->newptr == NULL) return (error); return (set_scsi_delay(delay)); } SYSCTL_PROC(_kern_cam, OID_AUTO, scsi_delay, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_scsi_delay, "I", "Delay to allow devices to settle after a SCSI bus reset (ms)"); static int set_scsi_delay(int delay) { /* * If someone sets this to 0, we assume that they want the * minimum allowable bus settle delay. */ if (delay == 0) { printf("cam: using minimum scsi_delay (%dms)\n", SCSI_MIN_DELAY); delay = SCSI_MIN_DELAY; } if (delay < SCSI_MIN_DELAY) return (EINVAL); scsi_delay = delay; return (0); } #endif /* _KERNEL */ Index: projects/nfsv42/sys/cam/scsi/scsi_all.h =================================================================== --- projects/nfsv42/sys/cam/scsi/scsi_all.h (revision 350367) +++ projects/nfsv42/sys/cam/scsi/scsi_all.h (revision 350368) @@ -1,4423 +1,4465 @@ /*- * Largely written by Julian Elischer (julian@tfs.com) * for TRW Financial Systems. * * TRW Financial Systems, in accordance with their agreement with Carnegie * Mellon University, makes this software available to CMU to distribute * or use in any manner that they see fit as long as this message is kept with * the software. For this reason TFS also grants any other persons or * organisations permission to use or modify this software. * * TFS supplies this software to be publicly redistributed * on the understanding that TFS is not responsible for the correct * functioning of this software in any circumstances. * * Ported to run under 386BSD by Julian Elischer (julian@tfs.com) Sept 1992 * * $FreeBSD$ */ /* * SCSI general interface description */ #ifndef _SCSI_SCSI_ALL_H #define _SCSI_SCSI_ALL_H 1 #include #ifdef _KERNEL #include #else #include #endif #ifdef _KERNEL /* * This is the number of seconds we wait for devices to settle after a SCSI * bus reset. */ extern int scsi_delay; #endif /* _KERNEL */ /* * SCSI command format */ /* * Define dome bits that are in ALL (or a lot of) scsi commands */ #define SCSI_CTL_LINK 0x01 #define SCSI_CTL_FLAG 0x02 #define SCSI_CTL_VENDOR 0xC0 #define SCSI_CMD_LUN 0xA0 /* these two should not be needed */ #define SCSI_CMD_LUN_SHIFT 5 /* LUN in the cmd is no longer SCSI */ #define SCSI_MAX_CDBLEN 16 /* * 16 byte commands are in the * SCSI-3 spec */ #if defined(CAM_MAX_CDBLEN) && (CAM_MAX_CDBLEN < SCSI_MAX_CDBLEN) #error "CAM_MAX_CDBLEN cannot be less than SCSI_MAX_CDBLEN" #endif /* 6byte CDBs special case 0 length to be 256 */ #define SCSI_CDB6_LEN(len) ((len) == 0 ? 256 : len) /* * This type defines actions to be taken when a particular sense code is * received. Right now, these flags are only defined to take up 16 bits, * but can be expanded in the future if necessary. */ typedef enum { SS_NOP = 0x000000, /* Do nothing */ SS_RETRY = 0x010000, /* Retry the command */ SS_FAIL = 0x020000, /* Bail out */ SS_START = 0x030000, /* Send a Start Unit command to the device, * then retry the original command. */ SS_TUR = 0x040000, /* Send a Test Unit Ready command to the * device, then retry the original command. */ SS_MASK = 0xff0000 } scsi_sense_action; typedef enum { SSQ_NONE = 0x0000, SSQ_DECREMENT_COUNT = 0x0100, /* Decrement the retry count */ SSQ_MANY = 0x0200, /* send lots of recovery commands */ SSQ_RANGE = 0x0400, /* * This table entry represents the * end of a range of ASCQs that * have identical error actions * and text. */ SSQ_PRINT_SENSE = 0x0800, SSQ_UA = 0x1000, /* Broadcast UA. */ SSQ_RESCAN = 0x2000, /* Rescan target for LUNs. */ SSQ_LOST = 0x4000, /* Destroy the LUNs. */ SSQ_MASK = 0xff00 } scsi_sense_action_qualifier; /* Mask for error status values */ #define SS_ERRMASK 0xff /* The default, retyable, error action */ #define SS_RDEF SS_RETRY|SSQ_DECREMENT_COUNT|SSQ_PRINT_SENSE|EIO /* The retyable, error action, with table specified error code */ #define SS_RET SS_RETRY|SSQ_DECREMENT_COUNT|SSQ_PRINT_SENSE /* Wait for transient error status to change */ #define SS_WAIT SS_TUR|SSQ_MANY|SSQ_DECREMENT_COUNT|SSQ_PRINT_SENSE /* Fatal error action, with table specified error code */ #define SS_FATAL SS_FAIL|SSQ_PRINT_SENSE struct scsi_generic { u_int8_t opcode; u_int8_t bytes[11]; }; struct scsi_request_sense { u_int8_t opcode; u_int8_t byte2; #define SRS_DESC 0x01 u_int8_t unused[2]; u_int8_t length; u_int8_t control; }; struct scsi_test_unit_ready { u_int8_t opcode; u_int8_t byte2; u_int8_t unused[3]; u_int8_t control; }; struct scsi_receive_diag { uint8_t opcode; uint8_t byte2; #define SRD_PCV 0x01 uint8_t page_code; uint8_t length[2]; uint8_t control; }; struct scsi_send_diag { uint8_t opcode; uint8_t byte2; #define SSD_UNITOFFL 0x01 #define SSD_DEVOFFL 0x02 #define SSD_SELFTEST 0x04 #define SSD_PF 0x10 #define SSD_SELF_TEST_CODE_MASK 0xE0 #define SSD_SELF_TEST_CODE_SHIFT 5 #define SSD_SELF_TEST_CODE_NONE 0x00 #define SSD_SELF_TEST_CODE_BG_SHORT 0x01 #define SSD_SELF_TEST_CODE_BG_EXTENDED 0x02 #define SSD_SELF_TEST_CODE_BG_ABORT 0x04 #define SSD_SELF_TEST_CODE_FG_SHORT 0x05 #define SSD_SELF_TEST_CODE_FG_EXTENDED 0x06 uint8_t reserved; uint8_t length[2]; uint8_t control; }; struct scsi_sense { u_int8_t opcode; u_int8_t byte2; u_int8_t unused[2]; u_int8_t length; u_int8_t control; }; struct scsi_inquiry { u_int8_t opcode; u_int8_t byte2; #define SI_EVPD 0x01 #define SI_CMDDT 0x02 u_int8_t page_code; u_int8_t length[2]; u_int8_t control; }; struct scsi_mode_sense_6 { u_int8_t opcode; u_int8_t byte2; #define SMS_DBD 0x08 u_int8_t page; #define SMS_PAGE_CODE 0x3F #define SMS_VENDOR_SPECIFIC_PAGE 0x00 #define SMS_DISCONNECT_RECONNECT_PAGE 0x02 #define SMS_FORMAT_DEVICE_PAGE 0x03 #define SMS_GEOMETRY_PAGE 0x04 #define SMS_CACHE_PAGE 0x08 #define SMS_PERIPHERAL_DEVICE_PAGE 0x09 #define SMS_CONTROL_MODE_PAGE 0x0A #define SMS_PROTO_SPECIFIC_PAGE 0x19 #define SMS_INFO_EXCEPTIONS_PAGE 0x1C #define SMS_ALL_PAGES_PAGE 0x3F #define SMS_PAGE_CTRL_MASK 0xC0 #define SMS_PAGE_CTRL_CURRENT 0x00 #define SMS_PAGE_CTRL_CHANGEABLE 0x40 #define SMS_PAGE_CTRL_DEFAULT 0x80 #define SMS_PAGE_CTRL_SAVED 0xC0 u_int8_t subpage; #define SMS_SUBPAGE_PAGE_0 0x00 #define SMS_SUBPAGE_ALL 0xff u_int8_t length; u_int8_t control; }; struct scsi_mode_sense_10 { u_int8_t opcode; u_int8_t byte2; /* same bits as small version */ #define SMS10_LLBAA 0x10 u_int8_t page; /* same bits as small version */ u_int8_t subpage; u_int8_t unused[3]; u_int8_t length[2]; u_int8_t control; }; struct scsi_mode_select_6 { u_int8_t opcode; u_int8_t byte2; #define SMS_SP 0x01 #define SMS_RTD 0x02 #define SMS_PF 0x10 u_int8_t unused[2]; u_int8_t length; u_int8_t control; }; struct scsi_mode_select_10 { u_int8_t opcode; u_int8_t byte2; /* same bits as small version */ u_int8_t unused[5]; u_int8_t length[2]; u_int8_t control; }; /* * When sending a mode select to a tape drive, the medium type must be 0. */ struct scsi_mode_hdr_6 { u_int8_t datalen; u_int8_t medium_type; u_int8_t dev_specific; u_int8_t block_descr_len; }; struct scsi_mode_hdr_10 { u_int8_t datalen[2]; u_int8_t medium_type; u_int8_t dev_specific; - u_int8_t reserved[2]; + u_int8_t flags; +#define SMH_LONGLBA 0x01 + u_int8_t reserved; u_int8_t block_descr_len[2]; }; struct scsi_mode_block_descr { u_int8_t density_code; u_int8_t num_blocks[3]; u_int8_t reserved; u_int8_t block_len[3]; }; +struct scsi_mode_block_descr_dshort +{ + u_int8_t num_blocks[4]; + u_int8_t reserved; + u_int8_t block_len[3]; +}; + +struct scsi_mode_block_descr_dlong +{ + u_int8_t num_blocks[8]; + u_int8_t reserved[4]; + u_int8_t block_len[4]; +}; + struct scsi_per_res_in { u_int8_t opcode; u_int8_t action; #define SPRI_RK 0x00 #define SPRI_RR 0x01 #define SPRI_RC 0x02 #define SPRI_RS 0x03 u_int8_t reserved[5]; u_int8_t length[2]; #define SPRI_MAX_LEN 0xffff u_int8_t control; }; struct scsi_per_res_in_header { u_int8_t generation[4]; u_int8_t length[4]; }; struct scsi_per_res_key { u_int8_t key[8]; }; struct scsi_per_res_in_keys { struct scsi_per_res_in_header header; struct scsi_per_res_key keys[0]; }; struct scsi_per_res_cap { uint8_t length[2]; uint8_t flags1; #define SPRI_RLR_C 0x80 #define SPRI_CRH 0x10 #define SPRI_SIP_C 0x08 #define SPRI_ATP_C 0x04 #define SPRI_PTPL_C 0x01 uint8_t flags2; #define SPRI_TMV 0x80 #define SPRI_ALLOW_CMD_MASK 0x70 #define SPRI_ALLOW_CMD_SHIFT 4 #define SPRI_ALLOW_NA 0x00 #define SPRI_ALLOW_1 0x10 #define SPRI_ALLOW_2 0x20 #define SPRI_ALLOW_3 0x30 #define SPRI_ALLOW_4 0x40 #define SPRI_ALLOW_5 0x50 #define SPRI_PTPL_A 0x01 uint8_t type_mask[2]; #define SPRI_TM_WR_EX_AR 0x8000 #define SPRI_TM_EX_AC_RO 0x4000 #define SPRI_TM_WR_EX_RO 0x2000 #define SPRI_TM_EX_AC 0x0800 #define SPRI_TM_WR_EX 0x0200 #define SPRI_TM_EX_AC_AR 0x0001 uint8_t reserved[2]; }; struct scsi_per_res_in_rsrv_data { uint8_t reservation[8]; uint8_t scope_addr[4]; uint8_t reserved; uint8_t scopetype; #define SPRT_WE 0x01 #define SPRT_EA 0x03 #define SPRT_WERO 0x05 #define SPRT_EARO 0x06 #define SPRT_WEAR 0x07 #define SPRT_EAAR 0x08 uint8_t extent_length[2]; }; struct scsi_per_res_in_rsrv { struct scsi_per_res_in_header header; struct scsi_per_res_in_rsrv_data data; }; struct scsi_per_res_in_full_desc { struct scsi_per_res_key res_key; uint8_t reserved1[4]; uint8_t flags; #define SPRI_FULL_ALL_TG_PT 0x02 #define SPRI_FULL_R_HOLDER 0x01 uint8_t scopetype; uint8_t reserved2[4]; uint8_t rel_trgt_port_id[2]; uint8_t additional_length[4]; uint8_t transport_id[]; }; struct scsi_per_res_in_full { struct scsi_per_res_in_header header; struct scsi_per_res_in_full_desc desc[]; }; struct scsi_per_res_out { u_int8_t opcode; u_int8_t action; #define SPRO_REGISTER 0x00 #define SPRO_RESERVE 0x01 #define SPRO_RELEASE 0x02 #define SPRO_CLEAR 0x03 #define SPRO_PREEMPT 0x04 #define SPRO_PRE_ABO 0x05 #define SPRO_REG_IGNO 0x06 #define SPRO_REG_MOVE 0x07 #define SPRO_REPL_LOST_RES 0x08 #define SPRO_ACTION_MASK 0x1f u_int8_t scope_type; #define SPR_SCOPE_MASK 0xf0 #define SPR_SCOPE_SHIFT 4 #define SPR_LU_SCOPE 0x00 #define SPR_EXTENT_SCOPE 0x10 #define SPR_ELEMENT_SCOPE 0x20 #define SPR_TYPE_MASK 0x0f #define SPR_TYPE_RD_SHARED 0x00 #define SPR_TYPE_WR_EX 0x01 #define SPR_TYPE_RD_EX 0x02 #define SPR_TYPE_EX_AC 0x03 #define SPR_TYPE_SHARED 0x04 #define SPR_TYPE_WR_EX_RO 0x05 #define SPR_TYPE_EX_AC_RO 0x06 #define SPR_TYPE_WR_EX_AR 0x07 #define SPR_TYPE_EX_AC_AR 0x08 u_int8_t reserved[2]; u_int8_t length[4]; u_int8_t control; }; struct scsi_per_res_out_parms { struct scsi_per_res_key res_key; u_int8_t serv_act_res_key[8]; u_int8_t scope_spec_address[4]; u_int8_t flags; #define SPR_SPEC_I_PT 0x08 #define SPR_ALL_TG_PT 0x04 #define SPR_APTPL 0x01 u_int8_t reserved1; u_int8_t extent_length[2]; u_int8_t transport_id_list[]; }; struct scsi_per_res_out_trans_ids { u_int8_t additional_length[4]; u_int8_t transport_ids[]; }; /* * Used with REGISTER AND MOVE serivce action of the PERSISTENT RESERVE OUT * command. */ struct scsi_per_res_reg_move { struct scsi_per_res_key res_key; u_int8_t serv_act_res_key[8]; u_int8_t reserved; u_int8_t flags; #define SPR_REG_MOVE_UNREG 0x02 #define SPR_REG_MOVE_APTPL 0x01 u_int8_t rel_trgt_port_id[2]; u_int8_t transport_id_length[4]; u_int8_t transport_id[]; }; struct scsi_transportid_header { uint8_t format_protocol; #define SCSI_TRN_FORMAT_MASK 0xc0 #define SCSI_TRN_FORMAT_SHIFT 6 #define SCSI_TRN_PROTO_MASK 0x0f }; struct scsi_transportid_fcp { uint8_t format_protocol; #define SCSI_TRN_FCP_FORMAT_DEFAULT 0x00 uint8_t reserved1[7]; uint8_t n_port_name[8]; uint8_t reserved2[8]; }; struct scsi_transportid_spi { uint8_t format_protocol; #define SCSI_TRN_SPI_FORMAT_DEFAULT 0x00 uint8_t reserved1; uint8_t scsi_addr[2]; uint8_t obsolete[2]; uint8_t rel_trgt_port_id[2]; uint8_t reserved2[16]; }; struct scsi_transportid_1394 { uint8_t format_protocol; #define SCSI_TRN_1394_FORMAT_DEFAULT 0x00 uint8_t reserved1[7]; uint8_t eui64[8]; uint8_t reserved2[8]; }; struct scsi_transportid_rdma { uint8_t format_protocol; #define SCSI_TRN_RDMA_FORMAT_DEFAULT 0x00 uint8_t reserved[7]; #define SCSI_TRN_RDMA_PORT_LEN 16 uint8_t initiator_port_id[SCSI_TRN_RDMA_PORT_LEN]; }; struct scsi_transportid_iscsi_device { uint8_t format_protocol; #define SCSI_TRN_ISCSI_FORMAT_DEVICE 0x00 uint8_t reserved; uint8_t additional_length[2]; uint8_t iscsi_name[]; }; struct scsi_transportid_iscsi_port { uint8_t format_protocol; #define SCSI_TRN_ISCSI_FORMAT_PORT 0x40 uint8_t reserved; uint8_t additional_length[2]; uint8_t iscsi_name[]; /* * Followed by a separator and iSCSI initiator session ID */ }; struct scsi_transportid_sas { uint8_t format_protocol; #define SCSI_TRN_SAS_FORMAT_DEFAULT 0x00 uint8_t reserved1[3]; uint8_t sas_address[8]; uint8_t reserved2[12]; }; struct scsi_sop_routing_id_norm { uint8_t bus; uint8_t devfunc; #define SCSI_TRN_SOP_BUS_MAX 0xff #define SCSI_TRN_SOP_DEV_MAX 0x1f #define SCSI_TRN_SOP_DEV_MASK 0xf8 #define SCSI_TRN_SOP_DEV_SHIFT 3 #define SCSI_TRN_SOP_FUNC_NORM_MASK 0x07 #define SCSI_TRN_SOP_FUNC_NORM_MAX 0x07 }; struct scsi_sop_routing_id_alt { uint8_t bus; uint8_t function; #define SCSI_TRN_SOP_FUNC_ALT_MAX 0xff }; struct scsi_transportid_sop { uint8_t format_protocol; #define SCSI_TRN_SOP_FORMAT_DEFAULT 0x00 uint8_t reserved1; uint8_t routing_id[2]; uint8_t reserved2[20]; }; struct scsi_log_sense { u_int8_t opcode; u_int8_t byte2; #define SLS_SP 0x01 #define SLS_PPC 0x02 u_int8_t page; #define SLS_PAGE_CODE 0x3F #define SLS_SUPPORTED_PAGES_PAGE 0x00 #define SLS_OVERRUN_PAGE 0x01 #define SLS_ERROR_WRITE_PAGE 0x02 #define SLS_ERROR_READ_PAGE 0x03 #define SLS_ERROR_READREVERSE_PAGE 0x04 #define SLS_ERROR_VERIFY_PAGE 0x05 #define SLS_ERROR_NONMEDIUM_PAGE 0x06 #define SLS_ERROR_LASTN_PAGE 0x07 #define SLS_LOGICAL_BLOCK_PROVISIONING 0x0c +#define SLS_TEMPERATURE 0x0d #define SLS_SELF_TEST_PAGE 0x10 #define SLS_SOLID_STATE_MEDIA 0x11 #define SLS_STAT_AND_PERF 0x19 #define SLS_IE_PAGE 0x2f #define SLS_PAGE_CTRL_MASK 0xC0 #define SLS_PAGE_CTRL_THRESHOLD 0x00 #define SLS_PAGE_CTRL_CUMULATIVE 0x40 #define SLS_PAGE_CTRL_THRESH_DEFAULT 0x80 #define SLS_PAGE_CTRL_CUMUL_DEFAULT 0xC0 u_int8_t subpage; #define SLS_SUPPORTED_SUBPAGES_SUBPAGE 0xff u_int8_t reserved; u_int8_t paramptr[2]; u_int8_t length[2]; u_int8_t control; }; struct scsi_log_select { u_int8_t opcode; u_int8_t byte2; /* SLS_SP 0x01 */ #define SLS_PCR 0x02 u_int8_t page; /* SLS_PAGE_CTRL_MASK 0xC0 */ /* SLS_PAGE_CTRL_THRESHOLD 0x00 */ /* SLS_PAGE_CTRL_CUMULATIVE 0x40 */ /* SLS_PAGE_CTRL_THRESH_DEFAULT 0x80 */ /* SLS_PAGE_CTRL_CUMUL_DEFAULT 0xC0 */ u_int8_t reserved[4]; u_int8_t length[2]; u_int8_t control; }; struct scsi_log_header { u_int8_t page; #define SL_PAGE_CODE 0x3F #define SL_SPF 0x40 #define SL_DS 0x80 u_int8_t subpage; u_int8_t datalen[2]; }; struct scsi_log_param_header { u_int8_t param_code[2]; u_int8_t param_control; #define SLP_LP 0x01 #define SLP_LBIN 0x02 #define SLP_TMC_MASK 0x0C #define SLP_TMC_ALWAYS 0x00 #define SLP_TMC_EQUAL 0x04 #define SLP_TMC_NOTEQUAL 0x08 #define SLP_TMC_GREATER 0x0C #define SLP_ETC 0x10 #define SLP_TSD 0x20 #define SLP_DS 0x40 #define SLP_DU 0x80 u_int8_t param_len; }; struct scsi_log_media_pct_used { struct scsi_log_param_header hdr; #define SLP_SS_MEDIA_PCT_USED 0x0001 uint8_t reserved[3]; uint8_t pct_used; }; struct scsi_log_stat_and_perf { struct scsi_log_param_header hdr; #define SLP_SAP 0x0001 uint8_t read_num[8]; uint8_t write_num[8]; uint8_t recvieved_lba[8]; uint8_t transmitted_lba[8]; uint8_t read_int[8]; uint8_t write_int[8]; uint8_t weighted_num[8]; uint8_t weighted_int[8]; }; struct scsi_log_idle_time { struct scsi_log_param_header hdr; #define SLP_IT 0x0002 uint8_t idle_int[8]; }; struct scsi_log_time_interval { struct scsi_log_param_header hdr; #define SLP_TI 0x0003 uint8_t exponent[4]; uint8_t integer[4]; }; struct scsi_log_fua_stat_and_perf { struct scsi_log_param_header hdr; #define SLP_FUA_SAP 0x0004 uint8_t fua_read_num[8]; uint8_t fua_write_num[8]; uint8_t fuanv_read_num[8]; uint8_t fuanv_write_num[8]; uint8_t fua_read_int[8]; uint8_t fua_write_int[8]; uint8_t fuanv_read_int[8]; uint8_t fuanv_write_int[8]; }; struct scsi_log_informational_exceptions { struct scsi_log_param_header hdr; #define SLP_IE_GEN 0x0000 uint8_t ie_asc; uint8_t ie_ascq; uint8_t temperature; }; +struct scsi_log_temperature { + struct scsi_log_param_header hdr; +#define SLP_TEMPERATURE 0x0000 +#define SLP_REFTEMPERATURE 0x0001 + uint8_t reserved; + uint8_t temperature; +}; + struct scsi_control_page { u_int8_t page_code; u_int8_t page_length; u_int8_t rlec; #define SCP_RLEC 0x01 /*Report Log Exception Cond*/ #define SCP_GLTSD 0x02 /*Global Logging target save disable */ #define SCP_DSENSE 0x04 /*Descriptor Sense */ #define SCP_DPICZ 0x08 /*Disable Prot. Info Check if Prot. Field is Zero */ #define SCP_TMF_ONLY 0x10 /*TM Functions Only*/ #define SCP_TST_MASK 0xE0 /*Task Set Type Mask*/ #define SCP_TST_ONE 0x00 /*One Task Set*/ #define SCP_TST_SEPARATE 0x20 /*Separate Task Sets*/ u_int8_t queue_flags; #define SCP_QUEUE_ALG_MASK 0xF0 #define SCP_QUEUE_ALG_RESTRICTED 0x00 #define SCP_QUEUE_ALG_UNRESTRICTED 0x10 #define SCP_NUAR 0x08 /*No UA on release*/ #define SCP_QUEUE_ERR 0x02 /*Queued I/O aborted for CACs*/ #define SCP_QUEUE_DQUE 0x01 /*Queued I/O disabled*/ u_int8_t eca_and_aen; #define SCP_EECA 0x80 /*Enable Extended CA*/ #define SCP_RAC 0x40 /*Report a check*/ #define SCP_SWP 0x08 /*Software Write Protect*/ #define SCP_RAENP 0x04 /*Ready AEN Permission*/ #define SCP_UAAENP 0x02 /*UA AEN Permission*/ #define SCP_EAENP 0x01 /*Error AEN Permission*/ u_int8_t flags4; #define SCP_ATO 0x80 /*Application tag owner*/ #define SCP_TAS 0x40 /*Task aborted status*/ #define SCP_ATMPE 0x20 /*Application tag mode page*/ #define SCP_RWWP 0x10 /*Reject write without prot*/ u_int8_t aen_holdoff_period[2]; u_int8_t busy_timeout_period[2]; u_int8_t extended_selftest_completion_time[2]; }; struct scsi_control_ext_page { uint8_t page_code; #define SCEP_PAGE_CODE 0x0a uint8_t subpage_code; #define SCEP_SUBPAGE_CODE 0x01 uint8_t page_length[2]; uint8_t flags; #define SCEP_TCMOS 0x04 /* Timestamp Changeable by */ #define SCEP_SCSIP 0x02 /* SCSI Precedence (clock) */ #define SCEP_IALUAE 0x01 /* Implicit ALUA Enabled */ uint8_t prio; uint8_t max_sense; uint8_t reserve[25]; }; struct scsi_cache_page { u_int8_t page_code; #define SCHP_PAGE_SAVABLE 0x80 /* Page is savable */ u_int8_t page_length; u_int8_t cache_flags; #define SCHP_FLAGS_WCE 0x04 /* Write Cache Enable */ #define SCHP_FLAGS_MF 0x02 /* Multiplication factor */ #define SCHP_FLAGS_RCD 0x01 /* Read Cache Disable */ u_int8_t rw_cache_policy; u_int8_t dis_prefetch[2]; u_int8_t min_prefetch[2]; u_int8_t max_prefetch[2]; u_int8_t max_prefetch_ceil[2]; }; /* * XXX KDM * Updated version of the cache page, as of SBC. Update this to SBC-3 and * rationalize the two. */ struct scsi_caching_page { uint8_t page_code; #define SMS_CACHING_PAGE 0x08 uint8_t page_length; uint8_t flags1; #define SCP_IC 0x80 #define SCP_ABPF 0x40 #define SCP_CAP 0x20 #define SCP_DISC 0x10 #define SCP_SIZE 0x08 #define SCP_WCE 0x04 #define SCP_MF 0x02 #define SCP_RCD 0x01 uint8_t ret_priority; uint8_t disable_pf_transfer_len[2]; uint8_t min_prefetch[2]; uint8_t max_prefetch[2]; uint8_t max_pf_ceiling[2]; uint8_t flags2; #define SCP_FSW 0x80 #define SCP_LBCSS 0x40 #define SCP_DRA 0x20 #define SCP_VS1 0x10 #define SCP_VS2 0x08 uint8_t cache_segments; uint8_t cache_seg_size[2]; uint8_t reserved; uint8_t non_cache_seg_size[3]; }; struct scsi_info_exceptions_page { u_int8_t page_code; #define SIEP_PAGE_SAVABLE 0x80 /* Page is savable */ u_int8_t page_length; u_int8_t info_flags; #define SIEP_FLAGS_PERF 0x80 #define SIEP_FLAGS_EBF 0x20 #define SIEP_FLAGS_EWASC 0x10 #define SIEP_FLAGS_DEXCPT 0x08 #define SIEP_FLAGS_TEST 0x04 #define SIEP_FLAGS_EBACKERR 0x02 #define SIEP_FLAGS_LOGERR 0x01 u_int8_t mrie; #define SIEP_MRIE_NO 0x00 #define SIEP_MRIE_UA 0x02 #define SIEP_MRIE_REC_COND 0x03 #define SIEP_MRIE_REC_UNCOND 0x04 #define SIEP_MRIE_NO_SENSE 0x05 #define SIEP_MRIE_ON_REQ 0x06 u_int8_t interval_timer[4]; u_int8_t report_count[4]; }; struct scsi_logical_block_provisioning_page_descr { uint8_t flags; #define SLBPPD_ENABLED 0x80 #define SLBPPD_TYPE_MASK 0x38 #define SLBPPD_ARMING_MASK 0x07 #define SLBPPD_ARMING_DEC 0x02 #define SLBPPD_ARMING_INC 0x01 uint8_t resource; uint8_t reserved[2]; uint8_t count[4]; }; struct scsi_logical_block_provisioning_page { uint8_t page_code; uint8_t subpage_code; uint8_t page_length[2]; uint8_t flags; #define SLBPP_SITUA 0x01 uint8_t reserved[11]; struct scsi_logical_block_provisioning_page_descr descr[0]; }; /* * SCSI protocol identifier values, current as of SPC4r36l. */ #define SCSI_PROTO_FC 0x00 /* Fibre Channel */ #define SCSI_PROTO_SPI 0x01 /* Parallel SCSI */ #define SCSI_PROTO_SSA 0x02 /* Serial Storage Arch. */ #define SCSI_PROTO_1394 0x03 /* IEEE 1394 (Firewire) */ #define SCSI_PROTO_RDMA 0x04 /* SCSI RDMA Protocol */ #define SCSI_PROTO_ISCSI 0x05 /* Internet SCSI */ #define SCSI_PROTO_iSCSI 0x05 /* Internet SCSI */ #define SCSI_PROTO_SAS 0x06 /* SAS Serial SCSI Protocol */ #define SCSI_PROTO_ADT 0x07 /* Automation/Drive Int. Trans. Prot.*/ #define SCSI_PROTO_ADITP 0x07 /* Automation/Drive Int. Trans. Prot.*/ #define SCSI_PROTO_ATA 0x08 /* AT Attachment Interface */ #define SCSI_PROTO_UAS 0x09 /* USB Atached SCSI */ #define SCSI_PROTO_SOP 0x0a /* SCSI over PCI Express */ #define SCSI_PROTO_NONE 0x0f /* No specific protocol */ struct scsi_proto_specific_page { u_int8_t page_code; #define SPSP_PAGE_SAVABLE 0x80 /* Page is savable */ u_int8_t page_length; u_int8_t protocol; #define SPSP_PROTO_FC SCSI_PROTO_FC #define SPSP_PROTO_SPI SCSI_PROTO_SPI #define SPSP_PROTO_SSA SCSI_PROTO_SSA #define SPSP_PROTO_1394 SCSI_PROTO_1394 #define SPSP_PROTO_RDMA SCSI_PROTO_RDMA #define SPSP_PROTO_ISCSI SCSI_PROTO_ISCSI #define SPSP_PROTO_SAS SCSI_PROTO_SAS #define SPSP_PROTO_ADT SCSI_PROTO_ADITP #define SPSP_PROTO_ATA SCSI_PROTO_ATA #define SPSP_PROTO_UAS SCSI_PROTO_UAS #define SPSP_PROTO_SOP SCSI_PROTO_SOP #define SPSP_PROTO_NONE SCSI_PROTO_NONE }; struct scsi_reserve { u_int8_t opcode; u_int8_t byte2; #define SR_EXTENT 0x01 #define SR_ID_MASK 0x0e #define SR_3RDPTY 0x10 #define SR_LUN_MASK 0xe0 u_int8_t resv_id; u_int8_t length[2]; u_int8_t control; }; struct scsi_reserve_10 { uint8_t opcode; uint8_t byte2; #define SR10_3RDPTY 0x10 #define SR10_LONGID 0x02 #define SR10_EXTENT 0x01 uint8_t resv_id; uint8_t thirdparty_id; uint8_t reserved[3]; uint8_t length[2]; uint8_t control; }; struct scsi_release { u_int8_t opcode; u_int8_t byte2; u_int8_t resv_id; u_int8_t unused[1]; u_int8_t length; u_int8_t control; }; struct scsi_release_10 { uint8_t opcode; uint8_t byte2; uint8_t resv_id; uint8_t thirdparty_id; uint8_t reserved[3]; uint8_t length[2]; uint8_t control; }; struct scsi_prevent { u_int8_t opcode; u_int8_t byte2; u_int8_t unused[2]; u_int8_t how; u_int8_t control; }; #define PR_PREVENT 0x01 #define PR_ALLOW 0x00 struct scsi_sync_cache { u_int8_t opcode; u_int8_t byte2; #define SSC_IMMED 0x02 #define SSC_RELADR 0x01 u_int8_t begin_lba[4]; u_int8_t reserved; u_int8_t lb_count[2]; u_int8_t control; }; struct scsi_sync_cache_16 { uint8_t opcode; uint8_t byte2; uint8_t begin_lba[8]; uint8_t lb_count[4]; uint8_t reserved; uint8_t control; }; struct scsi_format { uint8_t opcode; uint8_t byte2; #define SF_LONGLIST 0x20 #define SF_FMTDATA 0x10 #define SF_CMPLIST 0x08 #define SF_FORMAT_MASK 0x07 #define SF_FORMAT_BLOCK 0x00 #define SF_FORMAT_LONG_BLOCK 0x03 #define SF_FORMAT_BFI 0x04 #define SF_FORMAT_PHYS 0x05 uint8_t vendor; uint8_t interleave[2]; uint8_t control; }; struct scsi_format_header_short { uint8_t reserved; #define SF_DATA_FOV 0x80 #define SF_DATA_DPRY 0x40 #define SF_DATA_DCRT 0x20 #define SF_DATA_STPF 0x10 #define SF_DATA_IP 0x08 #define SF_DATA_DSP 0x04 #define SF_DATA_IMMED 0x02 #define SF_DATA_VS 0x01 uint8_t byte2; uint8_t defect_list_len[2]; }; struct scsi_format_header_long { uint8_t reserved; uint8_t byte2; uint8_t reserved2[2]; uint8_t defect_list_len[4]; }; struct scsi_changedef { u_int8_t opcode; u_int8_t byte2; u_int8_t unused1; u_int8_t how; u_int8_t unused[4]; u_int8_t datalen; u_int8_t control; }; struct scsi_read_buffer { u_int8_t opcode; u_int8_t byte2; #define RWB_MODE 0x1F #define RWB_MODE_HDR_DATA 0x00 #define RWB_MODE_VENDOR 0x01 #define RWB_MODE_DATA 0x02 #define RWB_MODE_DESCR 0x03 #define RWB_MODE_DOWNLOAD 0x04 #define RWB_MODE_DOWNLOAD_SAVE 0x05 #define RWB_MODE_ECHO 0x0A #define RWB_MODE_ECHO_DESCR 0x0B #define RWB_MODE_ERROR_HISTORY 0x1C u_int8_t buffer_id; u_int8_t offset[3]; u_int8_t length[3]; u_int8_t control; }; struct scsi_read_buffer_16 { uint8_t opcode; uint8_t byte2; uint8_t offset[8]; uint8_t length[4]; uint8_t buffer_id; uint8_t control; }; struct scsi_write_buffer { u_int8_t opcode; u_int8_t byte2; u_int8_t buffer_id; u_int8_t offset[3]; u_int8_t length[3]; u_int8_t control; }; struct scsi_read_attribute { u_int8_t opcode; u_int8_t service_action; #define SRA_SA_ATTR_VALUES 0x00 #define SRA_SA_ATTR_LIST 0x01 #define SRA_SA_LOG_VOL_LIST 0x02 #define SRA_SA_PART_LIST 0x03 #define SRA_SA_RESTRICTED 0x04 #define SRA_SA_SUPPORTED_ATTRS 0x05 #define SRA_SA_MASK 0x1f u_int8_t element[2]; u_int8_t elem_type; u_int8_t logical_volume; u_int8_t reserved1; u_int8_t partition; u_int8_t first_attribute[2]; u_int8_t length[4]; u_int8_t cache; #define SRA_CACHE 0x01 u_int8_t control; }; struct scsi_write_attribute { u_int8_t opcode; u_int8_t byte2; #define SWA_WTC 0x01 u_int8_t element[3]; u_int8_t logical_volume; u_int8_t reserved1; u_int8_t partition; u_int8_t reserved2[2]; u_int8_t length[4]; u_int8_t reserved3; u_int8_t control; }; struct scsi_read_attribute_values { u_int8_t length[4]; u_int8_t attribute_0[0]; }; struct scsi_mam_attribute_header { u_int8_t id[2]; /* * Attributes obtained from SPC-4r36g (section 7.4.2.2) and * SSC-4r03 (section 4.2.21). */ #define SMA_ATTR_ID_DEVICE_MIN 0x0000 #define SMA_ATTR_REM_CAP_PARTITION 0x0000 #define SMA_ATTR_MAX_CAP_PARTITION 0x0001 #define SMA_ATTR_TAPEALERT_FLAGS 0x0002 #define SMA_ATTR_LOAD_COUNT 0x0003 #define SMA_ATTR_MAM_SPACE_REMAINING 0x0004 #define SMA_ATTR_DEV_ASSIGNING_ORG 0x0005 #define SMA_ATTR_FORMAT_DENSITY_CODE 0x0006 #define SMA_ATTR_INITIALIZATION_COUNT 0x0007 #define SMA_ATTR_VOLUME_ID 0x0008 #define SMA_ATTR_VOLUME_CHANGE_REF 0x0009 #define SMA_ATTR_DEV_SERIAL_LAST_LOAD 0x020a #define SMA_ATTR_DEV_SERIAL_LAST_LOAD_1 0x020b #define SMA_ATTR_DEV_SERIAL_LAST_LOAD_2 0x020c #define SMA_ATTR_DEV_SERIAL_LAST_LOAD_3 0x020d #define SMA_ATTR_TOTAL_MB_WRITTEN_LT 0x0220 #define SMA_ATTR_TOTAL_MB_READ_LT 0x0221 #define SMA_ATTR_TOTAL_MB_WRITTEN_CUR 0x0222 #define SMA_ATTR_TOTAL_MB_READ_CUR 0x0223 #define SMA_ATTR_FIRST_ENC_BLOCK 0x0224 #define SMA_ATTR_NEXT_UNENC_BLOCK 0x0225 #define SMA_ATTR_MEDIUM_USAGE_HIST 0x0340 #define SMA_ATTR_PART_USAGE_HIST 0x0341 #define SMA_ATTR_ID_DEVICE_MAX 0x03ff #define SMA_ATTR_ID_MEDIUM_MIN 0x0400 #define SMA_ATTR_MED_MANUF 0x0400 #define SMA_ATTR_MED_SERIAL 0x0401 #define SMA_ATTR_MED_LENGTH 0x0402 #define SMA_ATTR_MED_WIDTH 0x0403 #define SMA_ATTR_MED_ASSIGNING_ORG 0x0404 #define SMA_ATTR_MED_DENSITY_CODE 0x0405 #define SMA_ATTR_MED_MANUF_DATE 0x0406 #define SMA_ATTR_MAM_CAPACITY 0x0407 #define SMA_ATTR_MED_TYPE 0x0408 #define SMA_ATTR_MED_TYPE_INFO 0x0409 #define SMA_ATTR_MED_SERIAL_NUM 0x040a #define SMA_ATTR_ID_MEDIUM_MAX 0x07ff #define SMA_ATTR_ID_HOST_MIN 0x0800 #define SMA_ATTR_APP_VENDOR 0x0800 #define SMA_ATTR_APP_NAME 0x0801 #define SMA_ATTR_APP_VERSION 0x0802 #define SMA_ATTR_USER_MED_TEXT_LABEL 0x0803 #define SMA_ATTR_LAST_WRITTEN_TIME 0x0804 #define SMA_ATTR_TEXT_LOCAL_ID 0x0805 #define SMA_ATTR_BARCODE 0x0806 #define SMA_ATTR_HOST_OWNER_NAME 0x0807 #define SMA_ATTR_MEDIA_POOL 0x0808 #define SMA_ATTR_PART_USER_LABEL 0x0809 #define SMA_ATTR_LOAD_UNLOAD_AT_PART 0x080a #define SMA_ATTR_APP_FORMAT_VERSION 0x080b #define SMA_ATTR_VOL_COHERENCY_INFO 0x080c #define SMA_ATTR_ID_HOST_MAX 0x0bff #define SMA_ATTR_VENDOR_DEVICE_MIN 0x0c00 #define SMA_ATTR_VENDOR_DEVICE_MAX 0x0fff #define SMA_ATTR_VENDOR_MEDIUM_MIN 0x1000 #define SMA_ATTR_VENDOR_MEDIUM_MAX 0x13ff #define SMA_ATTR_VENDOR_HOST_MIN 0x1400 #define SMA_ATTR_VENDOR_HOST_MAX 0x17ff u_int8_t byte2; #define SMA_FORMAT_BINARY 0x00 #define SMA_FORMAT_ASCII 0x01 #define SMA_FORMAT_TEXT 0x02 #define SMA_FORMAT_MASK 0x03 #define SMA_READ_ONLY 0x80 u_int8_t length[2]; u_int8_t attribute[0]; }; struct scsi_attrib_list_header { u_int8_t length[4]; u_int8_t first_attr_0[0]; }; struct scsi_attrib_lv_list { u_int8_t length[2]; u_int8_t first_lv_number; u_int8_t num_logical_volumes; }; struct scsi_attrib_vendser { uint8_t vendor[8]; uint8_t serial_num[32]; }; /* * These values are used to decode the Volume Coherency Information * Attribute (0x080c) for LTFS-format coherency information. * Although the Application Client Specific lengths are different for * Version 0 and Version 1, the data is in fact the same. The length * difference was due to a code bug. */ #define SCSI_LTFS_VER0_LEN 42 #define SCSI_LTFS_VER1_LEN 43 #define SCSI_LTFS_UUID_LEN 36 #define SCSI_LTFS_STR_NAME "LTFS" #define SCSI_LTFS_STR_LEN 4 typedef enum { SCSI_ATTR_FLAG_NONE = 0x00, SCSI_ATTR_FLAG_HEX = 0x01, SCSI_ATTR_FLAG_FP = 0x02, SCSI_ATTR_FLAG_DIV_10 = 0x04, SCSI_ATTR_FLAG_FP_1DIGIT = 0x08 } scsi_attrib_flags; typedef enum { SCSI_ATTR_OUTPUT_NONE = 0x00, SCSI_ATTR_OUTPUT_TEXT_MASK = 0x03, SCSI_ATTR_OUTPUT_TEXT_RAW = 0x00, SCSI_ATTR_OUTPUT_TEXT_ESC = 0x01, SCSI_ATTR_OUTPUT_TEXT_RSV1 = 0x02, SCSI_ATTR_OUTPUT_TEXT_RSV2 = 0x03, SCSI_ATTR_OUTPUT_NONASCII_MASK = 0x0c, SCSI_ATTR_OUTPUT_NONASCII_TRIM = 0x00, SCSI_ATTR_OUTPUT_NONASCII_ESC = 0x04, SCSI_ATTR_OUTPUT_NONASCII_RAW = 0x08, SCSI_ATTR_OUTPUT_NONASCII_RSV1 = 0x0c, SCSI_ATTR_OUTPUT_FIELD_MASK = 0xf0, SCSI_ATTR_OUTPUT_FIELD_ALL = 0xf0, SCSI_ATTR_OUTPUT_FIELD_NONE = 0x00, SCSI_ATTR_OUTPUT_FIELD_DESC = 0x10, SCSI_ATTR_OUTPUT_FIELD_NUM = 0x20, SCSI_ATTR_OUTPUT_FIELD_SIZE = 0x40, SCSI_ATTR_OUTPUT_FIELD_RW = 0x80 } scsi_attrib_output_flags; struct sbuf; struct scsi_attrib_table_entry { u_int32_t id; u_int32_t flags; const char *desc; const char *suffix; int (*to_str)(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len); int (*parse_str)(char *str, struct scsi_mam_attribute_header *hdr, uint32_t alloc_len, uint32_t flags, char *error_str, int error_str_len); }; struct scsi_rw_6 { u_int8_t opcode; u_int8_t addr[3]; /* only 5 bits are valid in the MSB address byte */ #define SRW_TOPADDR 0x1F u_int8_t length; u_int8_t control; }; struct scsi_rw_10 { u_int8_t opcode; #define SRW10_RELADDR 0x01 /* EBP defined for WRITE(10) only */ #define SRW10_EBP 0x04 #define SRW10_FUA 0x08 #define SRW10_DPO 0x10 u_int8_t byte2; u_int8_t addr[4]; u_int8_t reserved; u_int8_t length[2]; u_int8_t control; }; struct scsi_rw_12 { u_int8_t opcode; #define SRW12_RELADDR 0x01 #define SRW12_FUA 0x08 #define SRW12_DPO 0x10 u_int8_t byte2; u_int8_t addr[4]; u_int8_t length[4]; u_int8_t reserved; u_int8_t control; }; struct scsi_rw_16 { u_int8_t opcode; #define SRW16_RELADDR 0x01 #define SRW16_FUA 0x08 #define SRW16_DPO 0x10 u_int8_t byte2; u_int8_t addr[8]; u_int8_t length[4]; u_int8_t reserved; u_int8_t control; }; struct scsi_write_atomic_16 { uint8_t opcode; uint8_t byte2; uint8_t addr[8]; uint8_t boundary[2]; uint8_t length[2]; uint8_t group; uint8_t control; }; struct scsi_write_same_10 { uint8_t opcode; uint8_t byte2; #define SWS_LBDATA 0x02 #define SWS_PBDATA 0x04 #define SWS_UNMAP 0x08 #define SWS_ANCHOR 0x10 uint8_t addr[4]; uint8_t group; uint8_t length[2]; uint8_t control; }; struct scsi_write_same_16 { uint8_t opcode; uint8_t byte2; #define SWS_NDOB 0x01 uint8_t addr[8]; uint8_t length[4]; uint8_t group; uint8_t control; }; struct scsi_unmap { uint8_t opcode; uint8_t byte2; #define SU_ANCHOR 0x01 uint8_t reserved[4]; uint8_t group; uint8_t length[2]; uint8_t control; }; struct scsi_unmap_header { uint8_t length[2]; uint8_t desc_length[2]; uint8_t reserved[4]; }; struct scsi_unmap_desc { uint8_t lba[8]; uint8_t length[4]; uint8_t reserved[4]; }; struct scsi_write_verify_10 { uint8_t opcode; uint8_t byte2; #define SWV_BYTCHK 0x02 #define SWV_DPO 0x10 #define SWV_WRPROECT_MASK 0xe0 uint8_t addr[4]; uint8_t group; uint8_t length[2]; uint8_t control; }; struct scsi_write_verify_12 { uint8_t opcode; uint8_t byte2; uint8_t addr[4]; uint8_t length[4]; uint8_t group; uint8_t control; }; struct scsi_write_verify_16 { uint8_t opcode; uint8_t byte2; uint8_t addr[8]; uint8_t length[4]; uint8_t group; uint8_t control; }; struct scsi_start_stop_unit { u_int8_t opcode; u_int8_t byte2; #define SSS_IMMED 0x01 u_int8_t reserved[2]; u_int8_t how; #define SSS_START 0x01 #define SSS_LOEJ 0x02 #define SSS_PC_MASK 0xf0 #define SSS_PC_START_VALID 0x00 #define SSS_PC_ACTIVE 0x10 #define SSS_PC_IDLE 0x20 #define SSS_PC_STANDBY 0x30 #define SSS_PC_LU_CONTROL 0x70 #define SSS_PC_FORCE_IDLE_0 0xa0 #define SSS_PC_FORCE_STANDBY_0 0xb0 u_int8_t control; }; struct ata_pass_12 { u_int8_t opcode; u_int8_t protocol; #define AP_PROTO_HARD_RESET (0x00 << 1) #define AP_PROTO_SRST (0x01 << 1) #define AP_PROTO_NON_DATA (0x03 << 1) #define AP_PROTO_PIO_IN (0x04 << 1) #define AP_PROTO_PIO_OUT (0x05 << 1) #define AP_PROTO_DMA (0x06 << 1) #define AP_PROTO_DMA_QUEUED (0x07 << 1) #define AP_PROTO_DEVICE_DIAG (0x08 << 1) #define AP_PROTO_DEVICE_RESET (0x09 << 1) #define AP_PROTO_UDMA_IN (0x0a << 1) #define AP_PROTO_UDMA_OUT (0x0b << 1) #define AP_PROTO_FPDMA (0x0c << 1) #define AP_PROTO_RESP_INFO (0x0f << 1) #define AP_PROTO_MASK 0x1e #define AP_MULTI 0xe0 u_int8_t flags; #define AP_T_LEN 0x03 #define AP_BB 0x04 #define AP_T_DIR 0x08 #define AP_CK_COND 0x20 #define AP_OFFLINE 0x60 u_int8_t features; u_int8_t sector_count; u_int8_t lba_low; u_int8_t lba_mid; u_int8_t lba_high; u_int8_t device; u_int8_t command; u_int8_t reserved; u_int8_t control; }; struct scsi_maintenance_in { uint8_t opcode; uint8_t byte2; #define SERVICE_ACTION_MASK 0x1f #define SA_RPRT_TRGT_GRP 0x0a uint8_t reserved[4]; uint8_t length[4]; uint8_t reserved1; uint8_t control; }; struct scsi_report_supported_opcodes { uint8_t opcode; uint8_t service_action; uint8_t options; #define RSO_RCTD 0x80 #define RSO_OPTIONS_MASK 0x07 #define RSO_OPTIONS_ALL 0x00 #define RSO_OPTIONS_OC 0x01 #define RSO_OPTIONS_OC_SA 0x02 #define RSO_OPTIONS_OC_ASA 0x03 uint8_t requested_opcode; uint8_t requested_service_action[2]; uint8_t length[4]; uint8_t reserved1; uint8_t control; }; struct scsi_report_supported_opcodes_timeout { uint8_t length[2]; uint8_t reserved; uint8_t cmd_specific; uint8_t nominal_time[4]; uint8_t recommended_time[4]; }; struct scsi_report_supported_opcodes_descr { uint8_t opcode; uint8_t reserved; uint8_t service_action[2]; uint8_t reserved2; uint8_t flags; #define RSO_SERVACTV 0x01 #define RSO_CTDP 0x02 #define RSO_CDLP_MASK 0x0c #define RSO_CDLP_NO 0x00 #define RSO_CDLP_A 0x04 #define RSO_CDLP_B 0x08 uint8_t cdb_length[2]; struct scsi_report_supported_opcodes_timeout timeout[0]; }; struct scsi_report_supported_opcodes_all { uint8_t length[4]; struct scsi_report_supported_opcodes_descr descr[0]; }; struct scsi_report_supported_opcodes_one { uint8_t reserved; uint8_t support; #define RSO_ONE_CTDP 0x80 #define RSO_ONE_CDLP_MASK 0x18 #define RSO_ONE_CDLP_NO 0x00 #define RSO_ONE_CDLP_A 0x08 #define RSO_ONE_CDLP_B 0x10 #define RSO_ONE_SUP_MASK 0x07 #define RSO_ONE_SUP_UNAVAIL 0x00 #define RSO_ONE_SUP_NOT_SUP 0x01 #define RSO_ONE_SUP_AVAIL 0x03 #define RSO_ONE_SUP_VENDOR 0x05 uint8_t cdb_length[2]; uint8_t cdb_usage[]; }; struct scsi_report_supported_tmf { uint8_t opcode; uint8_t service_action; uint8_t options; #define RST_REPD 0x80 uint8_t reserved[3]; uint8_t length[4]; uint8_t reserved1; uint8_t control; }; struct scsi_report_supported_tmf_data { uint8_t byte1; #define RST_WAKES 0x01 #define RST_TRS 0x02 #define RST_QTS 0x04 #define RST_LURS 0x08 #define RST_CTSS 0x10 #define RST_CACAS 0x20 #define RST_ATSS 0x40 #define RST_ATS 0x80 uint8_t byte2; #define RST_ITNRS 0x01 #define RST_QTSS 0x02 #define RST_QAES 0x04 uint8_t reserved; uint8_t length; }; struct scsi_report_supported_tmf_ext_data { uint8_t byte1; uint8_t byte2; uint8_t reserved; uint8_t length; uint8_t byte5; #define RST_TMFTMOV 0x01 uint8_t reserved2; uint8_t byte7; #define RST_WAKETS 0x01 #define RST_TRTS 0x02 #define RST_QTTS 0x04 #define RST_LURTS 0x08 #define RST_CTSTS 0x10 #define RST_CACATS 0x20 #define RST_ATSTS 0x40 #define RST_ATTS 0x80 uint8_t byte8; #define RST_ITNRTS 0x01 #define RST_QTSTS 0x02 #define RST_QAETS 0x04 uint8_t long_timeout[4]; uint8_t short_timeout[4]; }; struct scsi_report_timestamp { uint8_t opcode; uint8_t service_action; uint8_t reserved[4]; uint8_t length[4]; uint8_t reserved1; uint8_t control; }; struct scsi_report_timestamp_data { uint8_t length[2]; uint8_t origin; #define RTS_ORIG_MASK 0x00 #define RTS_ORIG_ZERO 0x00 #define RTS_ORIG_SET 0x02 #define RTS_ORIG_OUTSIDE 0x03 uint8_t reserved; uint8_t timestamp[6]; uint8_t reserve2[2]; }; struct scsi_receive_copy_status_lid1 { uint8_t opcode; uint8_t service_action; #define RCS_RCS_LID1 0x00 uint8_t list_identifier; uint8_t reserved[7]; uint8_t length[4]; uint8_t reserved1; uint8_t control; }; struct scsi_receive_copy_status_lid1_data { uint8_t available_data[4]; uint8_t copy_command_status; #define RCS_CCS_INPROG 0x00 #define RCS_CCS_COMPLETED 0x01 #define RCS_CCS_ERROR 0x02 uint8_t segments_processed[2]; uint8_t transfer_count_units; #define RCS_TC_BYTES 0x00 #define RCS_TC_KBYTES 0x01 #define RCS_TC_MBYTES 0x02 #define RCS_TC_GBYTES 0x03 #define RCS_TC_TBYTES 0x04 #define RCS_TC_PBYTES 0x05 #define RCS_TC_EBYTES 0x06 #define RCS_TC_LBAS 0xf1 uint8_t transfer_count[4]; }; struct scsi_receive_copy_failure_details { uint8_t opcode; uint8_t service_action; #define RCS_RCFD 0x04 uint8_t list_identifier; uint8_t reserved[7]; uint8_t length[4]; uint8_t reserved1; uint8_t control; }; struct scsi_receive_copy_failure_details_data { uint8_t available_data[4]; uint8_t reserved[52]; uint8_t copy_command_status; uint8_t reserved2; uint8_t sense_data_length[2]; uint8_t sense_data[]; }; struct scsi_receive_copy_status_lid4 { uint8_t opcode; uint8_t service_action; #define RCS_RCS_LID4 0x05 uint8_t list_identifier[4]; uint8_t reserved[4]; uint8_t length[4]; uint8_t reserved1; uint8_t control; }; struct scsi_receive_copy_status_lid4_data { uint8_t available_data[4]; uint8_t response_to_service_action; uint8_t copy_command_status; #define RCS_CCS_COMPLETED_PROD 0x03 #define RCS_CCS_COMPLETED_RESID 0x04 #define RCS_CCS_INPROG_FGBG 0x10 #define RCS_CCS_INPROG_FG 0x11 #define RCS_CCS_INPROG_BG 0x12 #define RCS_CCS_ABORTED 0x60 uint8_t operation_counter[2]; uint8_t estimated_status_update_delay[4]; uint8_t extended_copy_completion_status; uint8_t length_of_the_sense_data_field; uint8_t sense_data_length; uint8_t transfer_count_units; uint8_t transfer_count[8]; uint8_t segments_processed[2]; uint8_t reserved[6]; uint8_t sense_data[]; }; struct scsi_receive_copy_operating_parameters { uint8_t opcode; uint8_t service_action; #define RCS_RCOP 0x03 uint8_t reserved[8]; uint8_t length[4]; uint8_t reserved1; uint8_t control; }; struct scsi_receive_copy_operating_parameters_data { uint8_t length[4]; uint8_t snlid; #define RCOP_SNLID 0x01 uint8_t reserved[3]; uint8_t maximum_cscd_descriptor_count[2]; uint8_t maximum_segment_descriptor_count[2]; uint8_t maximum_descriptor_list_length[4]; uint8_t maximum_segment_length[4]; uint8_t maximum_inline_data_length[4]; uint8_t held_data_limit[4]; uint8_t maximum_stream_device_transfer_size[4]; uint8_t reserved2[2]; uint8_t total_concurrent_copies[2]; uint8_t maximum_concurrent_copies; uint8_t data_segment_granularity; uint8_t inline_data_granularity; uint8_t held_data_granularity; uint8_t reserved3[3]; uint8_t implemented_descriptor_list_length; uint8_t list_of_implemented_descriptor_type_codes[0]; }; struct scsi_extended_copy { uint8_t opcode; uint8_t service_action; #define EC_EC_LID1 0x00 #define EC_EC_LID4 0x01 uint8_t reserved[8]; uint8_t length[4]; uint8_t reserved1; uint8_t control; }; struct scsi_ec_cscd_dtsp { uint8_t flags; #define EC_CSCD_FIXED 0x01 #define EC_CSCD_PAD 0x04 uint8_t block_length[3]; }; struct scsi_ec_cscd { uint8_t type_code; #define EC_CSCD_EXT 0xff uint8_t luidt_pdt; #define EC_NUL 0x20 #define EC_LUIDT_MASK 0xc0 #define EC_LUIDT_LUN 0x00 #define EC_LUIDT_PROXY_TOKEN 0x40 uint8_t relative_initiator_port[2]; uint8_t cscd_params[24]; struct scsi_ec_cscd_dtsp dtsp; }; struct scsi_ec_cscd_id { uint8_t type_code; #define EC_CSCD_ID 0xe4 uint8_t luidt_pdt; uint8_t relative_initiator_port[2]; uint8_t codeset; uint8_t id_type; uint8_t reserved; uint8_t length; uint8_t designator[20]; struct scsi_ec_cscd_dtsp dtsp; }; struct scsi_ec_segment { uint8_t type_code; uint8_t flags; #define EC_SEG_DC 0x02 #define EC_SEG_CAT 0x01 uint8_t descr_length[2]; uint8_t params[]; }; struct scsi_ec_segment_b2b { uint8_t type_code; #define EC_SEG_B2B 0x02 uint8_t flags; uint8_t descr_length[2]; uint8_t src_cscd[2]; uint8_t dst_cscd[2]; uint8_t reserved[2]; uint8_t number_of_blocks[2]; uint8_t src_lba[8]; uint8_t dst_lba[8]; }; struct scsi_ec_segment_verify { uint8_t type_code; #define EC_SEG_VERIFY 0x07 uint8_t reserved; uint8_t descr_length[2]; uint8_t src_cscd[2]; uint8_t reserved2[2]; uint8_t tur; uint8_t reserved3[3]; }; struct scsi_ec_segment_register_key { uint8_t type_code; #define EC_SEG_REGISTER_KEY 0x14 uint8_t reserved; uint8_t descr_length[2]; uint8_t reserved2[2]; uint8_t dst_cscd[2]; uint8_t res_key[8]; uint8_t sa_res_key[8]; uint8_t reserved3[4]; }; struct scsi_extended_copy_lid1_data { uint8_t list_identifier; uint8_t flags; #define EC_PRIORITY 0x07 #define EC_LIST_ID_USAGE_MASK 0x18 #define EC_LIST_ID_USAGE_FULL 0x08 #define EC_LIST_ID_USAGE_NOHOLD 0x10 #define EC_LIST_ID_USAGE_NONE 0x18 #define EC_STR 0x20 uint8_t cscd_list_length[2]; uint8_t reserved[4]; uint8_t segment_list_length[4]; uint8_t inline_data_length[4]; uint8_t data[]; }; struct scsi_extended_copy_lid4_data { uint8_t list_format; #define EC_LIST_FORMAT 0x01 uint8_t flags; uint8_t header_cscd_list_length[2]; uint8_t reserved[11]; uint8_t flags2; #define EC_IMMED 0x01 #define EC_G_SENSE 0x02 uint8_t header_cscd_type_code; uint8_t reserved2[3]; uint8_t list_identifier[4]; uint8_t reserved3[18]; uint8_t cscd_list_length[2]; uint8_t segment_list_length[2]; uint8_t inline_data_length[2]; uint8_t data[]; }; struct scsi_copy_operation_abort { uint8_t opcode; uint8_t service_action; #define EC_COA 0x1c uint8_t list_identifier[4]; uint8_t reserved[9]; uint8_t control; }; struct scsi_populate_token { uint8_t opcode; uint8_t service_action; #define EC_PT 0x10 uint8_t reserved[4]; uint8_t list_identifier[4]; uint8_t length[4]; uint8_t group_number; uint8_t control; }; struct scsi_range_desc { uint8_t lba[8]; uint8_t length[4]; uint8_t reserved[4]; }; struct scsi_populate_token_data { uint8_t length[2]; uint8_t flags; #define EC_PT_IMMED 0x01 #define EC_PT_RTV 0x02 uint8_t reserved; uint8_t inactivity_timeout[4]; uint8_t rod_type[4]; uint8_t reserved2[2]; uint8_t range_descriptor_length[2]; struct scsi_range_desc desc[]; }; struct scsi_write_using_token { uint8_t opcode; uint8_t service_action; #define EC_WUT 0x11 uint8_t reserved[4]; uint8_t list_identifier[4]; uint8_t length[4]; uint8_t group_number; uint8_t control; }; struct scsi_write_using_token_data { uint8_t length[2]; uint8_t flags; #define EC_WUT_IMMED 0x01 #define EC_WUT_DEL_TKN 0x02 uint8_t reserved[5]; uint8_t offset_into_rod[8]; uint8_t rod_token[512]; uint8_t reserved2[6]; uint8_t range_descriptor_length[2]; struct scsi_range_desc desc[]; }; struct scsi_receive_rod_token_information { uint8_t opcode; uint8_t service_action; #define RCS_RRTI 0x07 uint8_t list_identifier[4]; uint8_t reserved[4]; uint8_t length[4]; uint8_t reserved2; uint8_t control; }; struct scsi_token { uint8_t type[4]; #define ROD_TYPE_INTERNAL 0x00000000 #define ROD_TYPE_AUR 0x00010000 #define ROD_TYPE_PIT_DEF 0x00800000 #define ROD_TYPE_PIT_VULN 0x00800001 #define ROD_TYPE_PIT_PERS 0x00800002 #define ROD_TYPE_PIT_ANY 0x0080FFFF #define ROD_TYPE_BLOCK_ZERO 0xFFFF0001 uint8_t reserved[2]; uint8_t length[2]; uint8_t body[0]; }; struct scsi_report_all_rod_tokens { uint8_t opcode; uint8_t service_action; #define RCS_RART 0x08 uint8_t reserved[8]; uint8_t length[4]; uint8_t reserved2; uint8_t control; }; struct scsi_report_all_rod_tokens_data { uint8_t available_data[4]; uint8_t reserved[4]; uint8_t rod_management_token_list[]; }; struct ata_pass_16 { u_int8_t opcode; u_int8_t protocol; #define AP_EXTEND 0x01 u_int8_t flags; #define AP_FLAG_TLEN_NO_DATA (0 << 0) #define AP_FLAG_TLEN_FEAT (1 << 0) #define AP_FLAG_TLEN_SECT_CNT (2 << 0) #define AP_FLAG_TLEN_STPSIU (3 << 0) #define AP_FLAG_BYT_BLOK_BYTES (0 << 2) #define AP_FLAG_BYT_BLOK_BLOCKS (1 << 2) #define AP_FLAG_TDIR_TO_DEV (0 << 3) #define AP_FLAG_TDIR_FROM_DEV (1 << 3) #define AP_FLAG_CHK_COND (1 << 5) u_int8_t features_ext; u_int8_t features; u_int8_t sector_count_ext; u_int8_t sector_count; u_int8_t lba_low_ext; u_int8_t lba_low; u_int8_t lba_mid_ext; u_int8_t lba_mid; u_int8_t lba_high_ext; u_int8_t lba_high; u_int8_t device; u_int8_t command; u_int8_t control; }; struct ata_pass_32 { uint8_t opcode; uint8_t control; uint8_t reserved1[5]; uint8_t length; uint8_t service_action[2]; #define ATA_PASS_32_SA 0x1ff0 uint8_t protocol; uint8_t flags; uint8_t reserved2[2]; uint8_t lba[6]; uint8_t features[2]; uint8_t count[2]; uint8_t device; uint8_t command; uint8_t reserved3; uint8_t icc; uint8_t auxiliary[4]; }; #define SC_SCSI_1 0x01 #define SC_SCSI_2 0x03 /* * Opcodes */ #define TEST_UNIT_READY 0x00 #define REQUEST_SENSE 0x03 #define READ_6 0x08 #define WRITE_6 0x0A #define INQUIRY 0x12 #define MODE_SELECT_6 0x15 #define MODE_SENSE_6 0x1A #define START_STOP_UNIT 0x1B #define START_STOP 0x1B #define RESERVE 0x16 #define RELEASE 0x17 #define RECEIVE_DIAGNOSTIC 0x1C #define SEND_DIAGNOSTIC 0x1D #define PREVENT_ALLOW 0x1E #define READ_CAPACITY 0x25 #define READ_10 0x28 #define WRITE_10 0x2A #define POSITION_TO_ELEMENT 0x2B #define WRITE_VERIFY_10 0x2E #define VERIFY_10 0x2F #define SYNCHRONIZE_CACHE 0x35 #define READ_DEFECT_DATA_10 0x37 #define WRITE_BUFFER 0x3B #define READ_BUFFER 0x3C #define CHANGE_DEFINITION 0x40 #define WRITE_SAME_10 0x41 #define UNMAP 0x42 #define LOG_SELECT 0x4C #define LOG_SENSE 0x4D #define MODE_SELECT_10 0x55 #define RESERVE_10 0x56 #define RELEASE_10 0x57 #define MODE_SENSE_10 0x5A #define PERSISTENT_RES_IN 0x5E #define PERSISTENT_RES_OUT 0x5F #define EXTENDED_CDB 0x7E #define VARIABLE_LEN_CDB 0x7F #define EXTENDED_COPY 0x83 #define RECEIVE_COPY_STATUS 0x84 #define ATA_PASS_16 0x85 #define READ_16 0x88 #define COMPARE_AND_WRITE 0x89 #define WRITE_16 0x8A #define READ_ATTRIBUTE 0x8C #define WRITE_ATTRIBUTE 0x8D #define WRITE_VERIFY_16 0x8E #define VERIFY_16 0x8F #define SYNCHRONIZE_CACHE_16 0x91 #define WRITE_SAME_16 0x93 #define READ_BUFFER_16 0x9B #define WRITE_ATOMIC_16 0x9C #define SERVICE_ACTION_IN 0x9E #define REPORT_LUNS 0xA0 #define ATA_PASS_12 0xA1 #define SECURITY_PROTOCOL_IN 0xA2 #define MAINTENANCE_IN 0xA3 #define MAINTENANCE_OUT 0xA4 #define MOVE_MEDIUM 0xA5 #define READ_12 0xA8 #define WRITE_12 0xAA #define WRITE_VERIFY_12 0xAE #define VERIFY_12 0xAF #define SECURITY_PROTOCOL_OUT 0xB5 #define READ_ELEMENT_STATUS 0xB8 #define READ_CD 0xBE /* Maintenance In Service Action Codes */ #define REPORT_IDENTIFYING_INFRMATION 0x05 #define REPORT_TARGET_PORT_GROUPS 0x0A #define REPORT_ALIASES 0x0B #define REPORT_SUPPORTED_OPERATION_CODES 0x0C #define REPORT_SUPPORTED_TASK_MANAGEMENT_FUNCTIONS 0x0D #define REPORT_PRIORITY 0x0E #define REPORT_TIMESTAMP 0x0F #define MANAGEMENT_PROTOCOL_IN 0x10 /* Maintenance Out Service Action Codes */ #define SET_IDENTIFY_INFORMATION 0x06 #define SET_TARGET_PORT_GROUPS 0x0A #define CHANGE_ALIASES 0x0B #define SET_PRIORITY 0x0E #define SET_TIMESTAMP 0x0F #define MANGAEMENT_PROTOCOL_OUT 0x10 /* * Device Types */ #define T_DIRECT 0x00 #define T_SEQUENTIAL 0x01 #define T_PRINTER 0x02 #define T_PROCESSOR 0x03 #define T_WORM 0x04 #define T_CDROM 0x05 #define T_SCANNER 0x06 #define T_OPTICAL 0x07 #define T_CHANGER 0x08 #define T_COMM 0x09 #define T_ASC0 0x0a #define T_ASC1 0x0b #define T_STORARRAY 0x0c #define T_ENCLOSURE 0x0d #define T_RBC 0x0e #define T_OCRW 0x0f #define T_OSD 0x11 #define T_ADC 0x12 #define T_ZBC_HM 0x14 #define T_NODEVICE 0x1f #define T_ANY 0xff /* Used in Quirk table matches */ #define T_REMOV 1 #define T_FIXED 0 /* * This length is the initial inquiry length used by the probe code, as * well as the length necessary for scsi_print_inquiry() to function * correctly. If either use requires a different length in the future, * the two values should be de-coupled. */ #define SHORT_INQUIRY_LENGTH 36 struct scsi_inquiry_data { u_int8_t device; #define SID_TYPE(inq_data) ((inq_data)->device & 0x1f) #define SID_QUAL(inq_data) (((inq_data)->device & 0xE0) >> 5) #define SID_QUAL_LU_CONNECTED 0x00 /* * The specified peripheral device * type is currently connected to * logical unit. If the target cannot * determine whether or not a physical * device is currently connected, it * shall also use this peripheral * qualifier when returning the INQUIRY * data. This peripheral qualifier * does not mean that the device is * ready for access by the initiator. */ #define SID_QUAL_LU_OFFLINE 0x01 /* * The target is capable of supporting * the specified peripheral device type * on this logical unit; however, the * physical device is not currently * connected to this logical unit. */ #define SID_QUAL_RSVD 0x02 #define SID_QUAL_BAD_LU 0x03 /* * The target is not capable of * supporting a physical device on * this logical unit. For this * peripheral qualifier the peripheral * device type shall be set to 1Fh to * provide compatibility with previous * versions of SCSI. All other * peripheral device type values are * reserved for this peripheral * qualifier. */ #define SID_QUAL_IS_VENDOR_UNIQUE(inq_data) ((SID_QUAL(inq_data) & 0x04) != 0) u_int8_t dev_qual2; #define SID_QUAL2 0x7F #define SID_LU_CONG 0x40 #define SID_RMB 0x80 #define SID_IS_REMOVABLE(inq_data) (((inq_data)->dev_qual2 & SID_RMB) != 0) u_int8_t version; #define SID_ANSI_REV(inq_data) ((inq_data)->version & 0x07) #define SCSI_REV_0 0 #define SCSI_REV_CCS 1 #define SCSI_REV_2 2 #define SCSI_REV_SPC 3 #define SCSI_REV_SPC2 4 #define SCSI_REV_SPC3 5 #define SCSI_REV_SPC4 6 #define SCSI_REV_SPC5 7 #define SID_ECMA 0x38 #define SID_ISO 0xC0 u_int8_t response_format; #define SID_AENC 0x80 #define SID_TrmIOP 0x40 #define SID_NormACA 0x20 #define SID_HiSup 0x10 u_int8_t additional_length; #define SID_ADDITIONAL_LENGTH(iqd) \ ((iqd)->additional_length + \ __offsetof(struct scsi_inquiry_data, additional_length) + 1) u_int8_t spc3_flags; #define SPC3_SID_PROTECT 0x01 #define SPC3_SID_3PC 0x08 #define SPC3_SID_TPGS_MASK 0x30 #define SPC3_SID_TPGS_IMPLICIT 0x10 #define SPC3_SID_TPGS_EXPLICIT 0x20 #define SPC3_SID_ACC 0x40 #define SPC3_SID_SCCS 0x80 u_int8_t spc2_flags; #define SPC2_SID_ADDR16 0x01 #define SPC2_SID_MChngr 0x08 #define SPC2_SID_MultiP 0x10 #define SPC2_SID_EncServ 0x40 #define SPC2_SID_BQueue 0x80 #define INQ_DATA_TQ_ENABLED(iqd) \ ((SID_ANSI_REV(iqd) < SCSI_REV_SPC2)? ((iqd)->flags & SID_CmdQue) : \ (((iqd)->flags & SID_CmdQue) && !((iqd)->spc2_flags & SPC2_SID_BQueue)) || \ (!((iqd)->flags & SID_CmdQue) && ((iqd)->spc2_flags & SPC2_SID_BQueue))) u_int8_t flags; #define SID_SftRe 0x01 #define SID_CmdQue 0x02 #define SID_Linked 0x08 #define SID_Sync 0x10 #define SID_WBus16 0x20 #define SID_WBus32 0x40 #define SID_RelAdr 0x80 #define SID_VENDOR_SIZE 8 char vendor[SID_VENDOR_SIZE]; #define SID_PRODUCT_SIZE 16 char product[SID_PRODUCT_SIZE]; #define SID_REVISION_SIZE 4 char revision[SID_REVISION_SIZE]; /* * The following fields were taken from SCSI Primary Commands - 2 * (SPC-2) Revision 14, Dated 11 November 1999 */ #define SID_VENDOR_SPECIFIC_0_SIZE 20 u_int8_t vendor_specific0[SID_VENDOR_SPECIFIC_0_SIZE]; /* * An extension of SCSI Parallel Specific Values */ #define SID_SPI_IUS 0x01 #define SID_SPI_QAS 0x02 #define SID_SPI_CLOCK_ST 0x00 #define SID_SPI_CLOCK_DT 0x04 #define SID_SPI_CLOCK_DT_ST 0x0C #define SID_SPI_MASK 0x0F u_int8_t spi3data; u_int8_t reserved2; /* * Version Descriptors, stored 2 byte values. */ u_int8_t version1[2]; u_int8_t version2[2]; u_int8_t version3[2]; u_int8_t version4[2]; u_int8_t version5[2]; u_int8_t version6[2]; u_int8_t version7[2]; u_int8_t version8[2]; u_int8_t reserved3[22]; #define SID_VENDOR_SPECIFIC_1_SIZE 160 u_int8_t vendor_specific1[SID_VENDOR_SPECIFIC_1_SIZE]; }; /* * This structure is more suited to initiator operation, because the * maximum number of supported pages is already allocated. */ struct scsi_vpd_supported_page_list { u_int8_t device; u_int8_t page_code; #define SVPD_SUPPORTED_PAGE_LIST 0x00 #define SVPD_SUPPORTED_PAGES_HDR_LEN 4 u_int8_t reserved; u_int8_t length; /* number of VPD entries */ #define SVPD_SUPPORTED_PAGES_SIZE 251 u_int8_t list[SVPD_SUPPORTED_PAGES_SIZE]; }; /* * This structure is more suited to target operation, because the * number of supported pages is left to the user to allocate. */ struct scsi_vpd_supported_pages { u_int8_t device; u_int8_t page_code; u_int8_t reserved; #define SVPD_SUPPORTED_PAGES 0x00 u_int8_t length; u_int8_t page_list[0]; }; struct scsi_vpd_unit_serial_number { u_int8_t device; u_int8_t page_code; #define SVPD_UNIT_SERIAL_NUMBER 0x80 u_int8_t reserved; u_int8_t length; /* serial number length */ #define SVPD_SERIAL_NUM_SIZE 251 u_int8_t serial_num[SVPD_SERIAL_NUM_SIZE]; }; struct scsi_vpd_device_id { u_int8_t device; u_int8_t page_code; #define SVPD_DEVICE_ID 0x83 #define SVPD_DEVICE_ID_MAX_SIZE 252 #define SVPD_DEVICE_ID_HDR_LEN \ __offsetof(struct scsi_vpd_device_id, desc_list) u_int8_t length[2]; u_int8_t desc_list[]; }; struct scsi_vpd_id_descriptor { u_int8_t proto_codeset; /* * See the SCSI_PROTO definitions above for the protocols. */ #define SVPD_ID_PROTO_SHIFT 4 #define SVPD_ID_CODESET_BINARY 0x01 #define SVPD_ID_CODESET_ASCII 0x02 #define SVPD_ID_CODESET_UTF8 0x03 #define SVPD_ID_CODESET_MASK 0x0f u_int8_t id_type; #define SVPD_ID_PIV 0x80 #define SVPD_ID_ASSOC_LUN 0x00 #define SVPD_ID_ASSOC_PORT 0x10 #define SVPD_ID_ASSOC_TARGET 0x20 #define SVPD_ID_ASSOC_MASK 0x30 #define SVPD_ID_TYPE_VENDOR 0x00 #define SVPD_ID_TYPE_T10 0x01 #define SVPD_ID_TYPE_EUI64 0x02 #define SVPD_ID_TYPE_NAA 0x03 #define SVPD_ID_TYPE_RELTARG 0x04 #define SVPD_ID_TYPE_TPORTGRP 0x05 #define SVPD_ID_TYPE_LUNGRP 0x06 #define SVPD_ID_TYPE_MD5_LUN_ID 0x07 #define SVPD_ID_TYPE_SCSI_NAME 0x08 #define SVPD_ID_TYPE_PROTO 0x09 #define SVPD_ID_TYPE_UUID 0x0a #define SVPD_ID_TYPE_MASK 0x0f u_int8_t reserved; u_int8_t length; #define SVPD_DEVICE_ID_DESC_HDR_LEN \ __offsetof(struct scsi_vpd_id_descriptor, identifier) u_int8_t identifier[]; }; struct scsi_vpd_id_t10 { u_int8_t vendor[8]; u_int8_t vendor_spec_id[0]; }; struct scsi_vpd_id_eui64 { u_int8_t ieee_company_id[3]; u_int8_t extension_id[5]; }; struct scsi_vpd_id_naa_basic { uint8_t naa; /* big endian, packed: uint8_t naa : 4; uint8_t naa_desig : 4; */ #define SVPD_ID_NAA_NAA_SHIFT 4 #define SVPD_ID_NAA_IEEE_EXT 0x02 #define SVPD_ID_NAA_LOCAL_REG 0x03 #define SVPD_ID_NAA_IEEE_REG 0x05 #define SVPD_ID_NAA_IEEE_REG_EXT 0x06 uint8_t naa_data[]; }; struct scsi_vpd_id_naa_ieee_extended_id { uint8_t naa; uint8_t vendor_specific_id_a; uint8_t ieee_company_id[3]; uint8_t vendor_specific_id_b[4]; }; struct scsi_vpd_id_naa_local_reg { uint8_t naa; uint8_t local_value[7]; }; struct scsi_vpd_id_naa_ieee_reg { uint8_t naa; uint8_t reg_value[7]; /* big endian, packed: uint8_t naa_basic : 4; uint8_t ieee_company_id_0 : 4; uint8_t ieee_company_id_1[2]; uint8_t ieee_company_id_2 : 4; uint8_t vendor_specific_id_0 : 4; uint8_t vendor_specific_id_1[4]; */ }; struct scsi_vpd_id_naa_ieee_reg_extended { uint8_t naa; uint8_t reg_value[15]; /* big endian, packed: uint8_t naa_basic : 4; uint8_t ieee_company_id_0 : 4; uint8_t ieee_company_id_1[2]; uint8_t ieee_company_id_2 : 4; uint8_t vendor_specific_id_0 : 4; uint8_t vendor_specific_id_1[4]; uint8_t vendor_specific_id_ext[8]; */ }; struct scsi_vpd_id_rel_trgt_port_id { uint8_t obsolete[2]; uint8_t rel_trgt_port_id[2]; }; struct scsi_vpd_id_trgt_port_grp_id { uint8_t reserved[2]; uint8_t trgt_port_grp[2]; }; struct scsi_vpd_id_lun_grp_id { uint8_t reserved[2]; uint8_t log_unit_grp[2]; }; struct scsi_vpd_id_md5_lun_id { uint8_t lun_id[16]; }; struct scsi_vpd_id_scsi_name { uint8_t name_string[256]; }; struct scsi_service_action_in { uint8_t opcode; uint8_t service_action; uint8_t action_dependent[13]; uint8_t control; }; struct scsi_vpd_extended_inquiry_data { uint8_t device; uint8_t page_code; #define SVPD_EXTENDED_INQUIRY_DATA 0x86 uint8_t page_length[2]; uint8_t flags1; /* These values are for direct access devices */ #define SVPD_EID_AM_MASK 0xC0 #define SVPD_EID_AM_DEFER 0x80 #define SVPD_EID_AM_IMMED 0x40 #define SVPD_EID_AM_UNDEFINED 0x00 #define SVPD_EID_AM_RESERVED 0xc0 #define SVPD_EID_SPT 0x38 #define SVPD_EID_SPT_1 0x00 #define SVPD_EID_SPT_12 0x08 #define SVPD_EID_SPT_2 0x10 #define SVPD_EID_SPT_13 0x18 #define SVPD_EID_SPT_3 0x20 #define SVPD_EID_SPT_23 0x28 #define SVPD_EID_SPT_123 0x38 /* These values are for sequential access devices */ #define SVPD_EID_SA_SPT_LBP 0x08 #define SVPD_EID_GRD_CHK 0x04 #define SVPD_EID_APP_CHK 0x02 #define SVPD_EID_REF_CHK 0x01 uint8_t flags2; #define SVPD_EID_UASK_SUP 0x20 #define SVPD_EID_GROUP_SUP 0x10 #define SVPD_EID_PRIOR_SUP 0x08 #define SVPD_EID_HEADSUP 0x04 #define SVPD_EID_ORDSUP 0x02 #define SVPD_EID_SIMPSUP 0x01 uint8_t flags3; #define SVPD_EID_WU_SUP 0x08 #define SVPD_EID_CRD_SUP 0x04 #define SVPD_EID_NV_SUP 0x02 #define SVPD_EID_V_SUP 0x01 uint8_t flags4; #define SVPD_EID_NO_PI_CHK 0x20 #define SVPD_EID_P_I_I_SUP 0x10 #define SVPD_EID_LUICLR 0x01 uint8_t flags5; #define SVPD_EID_LUCT_MASK 0xe0 #define SVPD_EID_LUCT_NOT_REP 0x00 #define SVPD_EID_LUCT_CONGL 0x20 #define SVPD_EID_LUCT_GROUP 0x40 #define SVPD_EID_R_SUP 0x10 #define SVPD_EID_RTD_SUP 0x08 #define SVPD_EID_HSSRELEF 0x02 #define SVPD_EID_CBCS 0x01 uint8_t flags6; #define SVPD_EID_MULTI_I_T_FW 0x0F #define SVPD_EID_MC_VENDOR_SPEC 0x00 #define SVPD_EID_MC_MODE_1 0x01 #define SVPD_EID_MC_MODE_2 0x02 #define SVPD_EID_MC_MODE_3 0x03 uint8_t est[2]; uint8_t flags7; #define SVPD_EID_POA_SUP 0x80 #define SVPD_EID_HRA_SUP 0x40 #define SVPD_EID_VSA_SUP 0x20 uint8_t max_sense_length; uint8_t bind_flags; #define SVPD_EID_IBS 0x80 #define SVPD_EID_IAS 0x40 #define SVPD_EID_SAC 0x04 #define SVPD_EID_NRD1 0x02 #define SVPD_EID_NRD0 0x01 uint8_t reserved2[49]; }; struct scsi_vpd_mode_page_policy_descr { uint8_t page_code; uint8_t subpage_code; uint8_t policy; #define SVPD_MPP_SHARED 0x00 #define SVPD_MPP_PORT 0x01 #define SVPD_MPP_I_T 0x03 #define SVPD_MPP_MLUS 0x80 uint8_t reserved; }; struct scsi_vpd_mode_page_policy { uint8_t device; uint8_t page_code; #define SVPD_MODE_PAGE_POLICY 0x87 uint8_t page_length[2]; struct scsi_vpd_mode_page_policy_descr descr[0]; }; struct scsi_diag_page { uint8_t page_code; uint8_t page_specific_flags; uint8_t length[2]; uint8_t params[0]; }; struct scsi_vpd_port_designation { uint8_t reserved[2]; uint8_t relative_port_id[2]; uint8_t reserved2[2]; uint8_t initiator_transportid_length[2]; uint8_t initiator_transportid[0]; }; struct scsi_vpd_port_designation_cont { uint8_t reserved[2]; uint8_t target_port_descriptors_length[2]; struct scsi_vpd_id_descriptor target_port_descriptors[0]; }; struct scsi_vpd_scsi_ports { u_int8_t device; u_int8_t page_code; #define SVPD_SCSI_PORTS 0x88 u_int8_t page_length[2]; struct scsi_vpd_port_designation design[]; }; /* * ATA Information VPD Page based on * T10/2126-D Revision 04 */ #define SVPD_ATA_INFORMATION 0x89 struct scsi_vpd_tpc_descriptor { uint8_t desc_type[2]; uint8_t desc_length[2]; uint8_t parameters[]; }; struct scsi_vpd_tpc_descriptor_bdrl { uint8_t desc_type[2]; #define SVPD_TPC_BDRL 0x0000 uint8_t desc_length[2]; uint8_t vendor_specific[6]; uint8_t maximum_ranges[2]; uint8_t maximum_inactivity_timeout[4]; uint8_t default_inactivity_timeout[4]; uint8_t maximum_token_transfer_size[8]; uint8_t optimal_transfer_count[8]; }; struct scsi_vpd_tpc_descriptor_sc_descr { uint8_t opcode; uint8_t sa_length; uint8_t supported_service_actions[0]; }; struct scsi_vpd_tpc_descriptor_sc { uint8_t desc_type[2]; #define SVPD_TPC_SC 0x0001 uint8_t desc_length[2]; uint8_t list_length; struct scsi_vpd_tpc_descriptor_sc_descr descr[]; }; struct scsi_vpd_tpc_descriptor_pd { uint8_t desc_type[2]; #define SVPD_TPC_PD 0x0004 uint8_t desc_length[2]; uint8_t reserved[4]; uint8_t maximum_cscd_descriptor_count[2]; uint8_t maximum_segment_descriptor_count[2]; uint8_t maximum_descriptor_list_length[4]; uint8_t maximum_inline_data_length[4]; uint8_t reserved2[12]; }; struct scsi_vpd_tpc_descriptor_sd { uint8_t desc_type[2]; #define SVPD_TPC_SD 0x0008 uint8_t desc_length[2]; uint8_t list_length; uint8_t supported_descriptor_codes[]; }; struct scsi_vpd_tpc_descriptor_sdid { uint8_t desc_type[2]; #define SVPD_TPC_SDID 0x000C uint8_t desc_length[2]; uint8_t list_length[2]; uint8_t supported_descriptor_ids[]; }; struct scsi_vpd_tpc_descriptor_rtf_block { uint8_t type_format; #define SVPD_TPC_RTF_BLOCK 0x00 uint8_t reserved; uint8_t desc_length[2]; uint8_t reserved2[2]; uint8_t optimal_length_granularity[2]; uint8_t maximum_bytes[8]; uint8_t optimal_bytes[8]; uint8_t optimal_bytes_to_token_per_segment[8]; uint8_t optimal_bytes_from_token_per_segment[8]; uint8_t reserved3[8]; }; struct scsi_vpd_tpc_descriptor_rtf { uint8_t desc_type[2]; #define SVPD_TPC_RTF 0x0106 uint8_t desc_length[2]; uint8_t remote_tokens; uint8_t reserved[11]; uint8_t minimum_token_lifetime[4]; uint8_t maximum_token_lifetime[4]; uint8_t maximum_token_inactivity_timeout[4]; uint8_t reserved2[18]; uint8_t type_specific_features_length[2]; uint8_t type_specific_features[0]; }; struct scsi_vpd_tpc_descriptor_srtd { uint8_t rod_type[4]; uint8_t flags; #define SVPD_TPC_SRTD_TOUT 0x01 #define SVPD_TPC_SRTD_TIN 0x02 #define SVPD_TPC_SRTD_ECPY 0x80 uint8_t reserved; uint8_t preference_indicator[2]; uint8_t reserved2[56]; }; struct scsi_vpd_tpc_descriptor_srt { uint8_t desc_type[2]; #define SVPD_TPC_SRT 0x0108 uint8_t desc_length[2]; uint8_t reserved[2]; uint8_t rod_type_descriptors_length[2]; uint8_t rod_type_descriptors[0]; }; struct scsi_vpd_tpc_descriptor_gco { uint8_t desc_type[2]; #define SVPD_TPC_GCO 0x8001 uint8_t desc_length[2]; uint8_t total_concurrent_copies[4]; uint8_t maximum_identified_concurrent_copies[4]; uint8_t maximum_segment_length[4]; uint8_t data_segment_granularity; uint8_t inline_data_granularity; uint8_t reserved[18]; }; struct scsi_vpd_tpc { uint8_t device; uint8_t page_code; #define SVPD_SCSI_TPC 0x8F uint8_t page_length[2]; struct scsi_vpd_tpc_descriptor descr[]; }; /* + * SCSI Feature Sets VPD Page + */ +struct scsi_vpd_sfs +{ + uint8_t device; + uint8_t page_code; +#define SVPD_SCSI_SFS 0x92 + uint8_t page_length[2]; + uint8_t reserved[4]; + uint8_t codes[]; +}; + +/* * Block Device Characteristics VPD Page based on * T10/1799-D Revision 31 */ struct scsi_vpd_block_characteristics { u_int8_t device; u_int8_t page_code; #define SVPD_BDC 0xB1 u_int8_t page_length[2]; u_int8_t medium_rotation_rate[2]; #define SVPD_BDC_RATE_NOT_REPORTED 0x00 #define SVPD_BDC_RATE_NON_ROTATING 0x01 u_int8_t reserved1; u_int8_t nominal_form_factor; #define SVPD_BDC_FORM_NOT_REPORTED 0x00 #define SVPD_BDC_FORM_5_25INCH 0x01 #define SVPD_BDC_FORM_3_5INCH 0x02 #define SVPD_BDC_FORM_2_5INCH 0x03 #define SVPD_BDC_FORM_1_5INCH 0x04 #define SVPD_BDC_FORM_LESSTHAN_1_5INCH 0x05 u_int8_t reserved2[56]; }; /* * Block Device Characteristics VPD Page */ struct scsi_vpd_block_device_characteristics { uint8_t device; uint8_t page_code; #define SVPD_BDC 0xB1 uint8_t page_length[2]; uint8_t medium_rotation_rate[2]; #define SVPD_NOT_REPORTED 0x0000 #define SVPD_NON_ROTATING 0x0001 uint8_t product_type; uint8_t wab_wac_ff; uint8_t flags; #define SVPD_VBULS 0x01 #define SVPD_FUAB 0x02 +#define SVPD_BOCS 0x04 +#define SVPD_RBWZ 0x08 #define SVPD_ZBC_NR 0x00 /* Not Reported */ #define SVPD_HAW_ZBC 0x10 /* Host Aware */ #define SVPD_DM_ZBC 0x20 /* Drive Managed */ #define SVPD_ZBC_MASK 0x30 /* Zoned mask */ - uint8_t reserved[55]; + uint8_t reserved[3]; + uint8_t depopulation_time[4]; + uint8_t reserved2[48]; }; #define SBDC_IS_PRESENT(bdc, length, field) \ ((length >= offsetof(struct scsi_vpd_block_device_characteristics, \ field) + sizeof(bdc->field)) ? 1 : 0) /* * Logical Block Provisioning VPD Page based on * T10/1799-D Revision 31 */ struct scsi_vpd_logical_block_prov { u_int8_t device; u_int8_t page_code; #define SVPD_LBP 0xB2 u_int8_t page_length[2]; #define SVPD_LBP_PL_BASIC 0x04 u_int8_t threshold_exponent; u_int8_t flags; #define SVPD_LBP_UNMAP 0x80 #define SVPD_LBP_WS16 0x40 #define SVPD_LBP_WS10 0x20 #define SVPD_LBP_RZ 0x04 #define SVPD_LBP_ANC_SUP 0x02 #define SVPD_LBP_DP 0x01 u_int8_t prov_type; #define SVPD_LBP_RESOURCE 0x01 #define SVPD_LBP_THIN 0x02 u_int8_t reserved; /* * Provisioning Group Descriptor can be here if SVPD_LBP_DP is set * Its size can be determined from page_length - 4 */ }; /* * Block Limits VDP Page based on SBC-4 Revision 2 */ struct scsi_vpd_block_limits { u_int8_t device; u_int8_t page_code; #define SVPD_BLOCK_LIMITS 0xB0 u_int8_t page_length[2]; #define SVPD_BL_PL_BASIC 0x10 #define SVPD_BL_PL_TP 0x3C u_int8_t reserved1; u_int8_t max_cmp_write_len; u_int8_t opt_txfer_len_grain[2]; u_int8_t max_txfer_len[4]; u_int8_t opt_txfer_len[4]; u_int8_t max_prefetch[4]; u_int8_t max_unmap_lba_cnt[4]; u_int8_t max_unmap_blk_cnt[4]; u_int8_t opt_unmap_grain[4]; u_int8_t unmap_grain_align[4]; u_int8_t max_write_same_length[8]; u_int8_t max_atomic_transfer_length[4]; u_int8_t atomic_alignment[4]; u_int8_t atomic_transfer_length_granularity[4]; u_int8_t max_atomic_transfer_length_with_atomic_boundary[4]; u_int8_t max_atomic_boundary_size[4]; }; /* * Zoned Block Device Characacteristics VPD page. * From ZBC-r04, dated August 12, 2015. */ struct scsi_vpd_zoned_bdc { uint8_t device; uint8_t page_code; #define SVPD_ZONED_BDC 0xB6 uint8_t page_length[2]; #define SVPD_ZBDC_PL 0x3C uint8_t flags; #define SVPD_ZBDC_URSWRZ 0x01 uint8_t reserved1[3]; uint8_t optimal_seq_zones[4]; #define SVPD_ZBDC_OPT_SEQ_NR 0xffffffff uint8_t optimal_nonseq_zones[4]; #define SVPD_ZBDC_OPT_NONSEQ_NR 0xffffffff uint8_t max_seq_req_zones[4]; #define SVPD_ZBDC_MAX_SEQ_UNLIMITED 0xffffffff uint8_t reserved2[44]; }; struct scsi_read_capacity { u_int8_t opcode; u_int8_t byte2; #define SRC_RELADR 0x01 u_int8_t addr[4]; u_int8_t unused[2]; u_int8_t pmi; #define SRC_PMI 0x01 u_int8_t control; }; struct scsi_read_capacity_16 { uint8_t opcode; #define SRC16_SERVICE_ACTION 0x10 uint8_t service_action; uint8_t addr[8]; uint8_t alloc_len[4]; #define SRC16_PMI 0x01 #define SRC16_RELADR 0x02 uint8_t reladr; uint8_t control; }; struct scsi_read_capacity_data { u_int8_t addr[4]; u_int8_t length[4]; }; struct scsi_read_capacity_data_long { uint8_t addr[8]; uint8_t length[4]; #define SRC16_PROT_EN 0x01 #define SRC16_P_TYPE 0x0e #define SRC16_P_TYPE_SHIFT 1 #define SRC16_PTYPE_1 0x00 #define SRC16_PTYPE_2 0x02 #define SRC16_PTYPE_3 0x04 uint8_t prot; #define SRC16_LBPPBE 0x0f #define SRC16_PI_EXPONENT 0xf0 #define SRC16_PI_EXPONENT_SHIFT 4 uint8_t prot_lbppbe; #define SRC16_LALBA 0x3f #define SRC16_LBPRZ 0x40 #define SRC16_LBPME 0x80 /* * Alternate versions of these macros that are intended for use on a 16-bit * version of the lalba_lbp field instead of the array of 2 8 bit numbers. */ #define SRC16_LALBA_A 0x3fff #define SRC16_LBPRZ_A 0x4000 #define SRC16_LBPME_A 0x8000 uint8_t lalba_lbp[2]; uint8_t reserved[16]; }; struct scsi_get_lba_status { uint8_t opcode; #define SGLS_SERVICE_ACTION 0x12 uint8_t service_action; uint8_t addr[8]; uint8_t alloc_len[4]; uint8_t reserved; uint8_t control; }; struct scsi_get_lba_status_data_descr { uint8_t addr[8]; uint8_t length[4]; uint8_t status; uint8_t reserved[3]; }; struct scsi_get_lba_status_data { uint8_t length[4]; uint8_t reserved[4]; struct scsi_get_lba_status_data_descr descr[]; }; struct scsi_report_luns { uint8_t opcode; uint8_t reserved1; #define RPL_REPORT_DEFAULT 0x00 #define RPL_REPORT_WELLKNOWN 0x01 #define RPL_REPORT_ALL 0x02 #define RPL_REPORT_ADMIN 0x10 #define RPL_REPORT_NONSUBSID 0x11 #define RPL_REPORT_CONGLOM 0x12 uint8_t select_report; uint8_t reserved2[3]; uint8_t length[4]; uint8_t reserved3; uint8_t control; }; struct scsi_report_luns_lundata { uint8_t lundata[8]; #define RPL_LUNDATA_PERIPH_BUS_MASK 0x3f #define RPL_LUNDATA_FLAT_LUN_MASK 0x3f #define RPL_LUNDATA_FLAT_LUN_BITS 0x06 #define RPL_LUNDATA_LUN_TARG_MASK 0x3f #define RPL_LUNDATA_LUN_BUS_MASK 0xe0 #define RPL_LUNDATA_LUN_LUN_MASK 0x1f #define RPL_LUNDATA_EXT_LEN_MASK 0x30 #define RPL_LUNDATA_EXT_EAM_MASK 0x0f #define RPL_LUNDATA_EXT_EAM_WK 0x01 #define RPL_LUNDATA_EXT_EAM_NOT_SPEC 0x0f #define RPL_LUNDATA_ATYP_MASK 0xc0 /* MBZ for type 0 lun */ #define RPL_LUNDATA_ATYP_PERIPH 0x00 #define RPL_LUNDATA_ATYP_FLAT 0x40 #define RPL_LUNDATA_ATYP_LUN 0x80 #define RPL_LUNDATA_ATYP_EXTLUN 0xc0 }; struct scsi_report_luns_data { u_int8_t length[4]; /* length of LUN inventory, in bytes */ u_int8_t reserved[4]; /* unused */ /* * LUN inventory- we only support the type zero form for now. */ struct scsi_report_luns_lundata luns[0]; }; struct scsi_target_group { uint8_t opcode; uint8_t service_action; #define STG_PDF_MASK 0xe0 #define STG_PDF_LENGTH 0x00 #define STG_PDF_EXTENDED 0x20 uint8_t reserved1[4]; uint8_t length[4]; uint8_t reserved2; uint8_t control; }; struct scsi_timestamp { uint8_t opcode; uint8_t service_action; uint8_t reserved1[4]; uint8_t length[4]; uint8_t reserved2; uint8_t control; }; struct scsi_set_timestamp_parameters { uint8_t reserved1[4]; uint8_t timestamp[6]; uint8_t reserved2[2]; }; struct scsi_report_timestamp_parameter_data { uint8_t length[2]; uint8_t reserved1[2]; uint8_t timestamp[6]; uint8_t reserved2[2]; }; struct scsi_target_port_descriptor { uint8_t reserved[2]; uint8_t relative_target_port_identifier[2]; uint8_t desc_list[]; }; struct scsi_target_port_group_descriptor { uint8_t pref_state; #define TPG_PRIMARY 0x80 #define TPG_ASYMMETRIC_ACCESS_STATE_MASK 0xf #define TPG_ASYMMETRIC_ACCESS_OPTIMIZED 0x0 #define TPG_ASYMMETRIC_ACCESS_NONOPTIMIZED 0x1 #define TPG_ASYMMETRIC_ACCESS_STANDBY 0x2 #define TPG_ASYMMETRIC_ACCESS_UNAVAILABLE 0x3 #define TPG_ASYMMETRIC_ACCESS_LBA_DEPENDENT 0x4 #define TPG_ASYMMETRIC_ACCESS_OFFLINE 0xE #define TPG_ASYMMETRIC_ACCESS_TRANSITIONING 0xF uint8_t support; #define TPG_AO_SUP 0x01 #define TPG_AN_SUP 0x02 #define TPG_S_SUP 0x04 #define TPG_U_SUP 0x08 #define TPG_LBD_SUP 0x10 #define TPG_O_SUP 0x40 #define TPG_T_SUP 0x80 uint8_t target_port_group[2]; uint8_t reserved; uint8_t status; #define TPG_UNAVLBL 0 #define TPG_SET_BY_STPG 0x01 #define TPG_IMPLICIT 0x02 uint8_t vendor_specific; uint8_t target_port_count; struct scsi_target_port_descriptor descriptors[]; }; struct scsi_target_group_data { uint8_t length[4]; /* length of returned data, in bytes */ struct scsi_target_port_group_descriptor groups[]; }; struct scsi_target_group_data_extended { uint8_t length[4]; /* length of returned data, in bytes */ uint8_t format_type; /* STG_PDF_LENGTH or STG_PDF_EXTENDED */ uint8_t implicit_transition_time; uint8_t reserved[2]; struct scsi_target_port_group_descriptor groups[]; }; struct scsi_security_protocol_in { uint8_t opcode; uint8_t security_protocol; #define SPI_PROT_INFORMATION 0x00 #define SPI_PROT_CBCS 0x07 #define SPI_PROT_TAPE_DATA_ENC 0x20 #define SPI_PROT_DATA_ENC_CONFIG 0x21 #define SPI_PROT_SA_CREATE_CAP 0x40 #define SPI_PROT_IKEV2_SCSI 0x41 #define SPI_PROT_JEDEC_UFS 0xEC #define SPI_PROT_SDCARD_TFSSS 0xED #define SPI_PROT_AUTH_HOST_TRANSIENT 0xEE #define SPI_PROT_ATA_DEVICE_PASSWORD 0xEF uint8_t security_protocol_specific[2]; uint8_t byte4; #define SPI_INC_512 0x80 uint8_t reserved1; uint8_t length[4]; uint8_t reserved2; uint8_t control; }; struct scsi_security_protocol_out { uint8_t opcode; uint8_t security_protocol; uint8_t security_protocol_specific[2]; uint8_t byte4; #define SPO_INC_512 0x80 uint8_t reserved1; uint8_t length[4]; uint8_t reserved2; uint8_t control; }; typedef enum { SSD_TYPE_NONE, SSD_TYPE_FIXED, SSD_TYPE_DESC } scsi_sense_data_type; typedef enum { SSD_ELEM_NONE, SSD_ELEM_SKIP, SSD_ELEM_DESC, SSD_ELEM_SKS, SSD_ELEM_COMMAND, SSD_ELEM_INFO, SSD_ELEM_FRU, SSD_ELEM_STREAM, SSD_ELEM_MAX } scsi_sense_elem_type; struct scsi_sense_data { uint8_t error_code; /* * SPC-4 says that the maximum length of sense data is 252 bytes. * So this structure is exactly 252 bytes log. */ #define SSD_FULL_SIZE 252 uint8_t sense_buf[SSD_FULL_SIZE - 1]; /* * XXX KDM is this still a reasonable minimum size? */ #define SSD_MIN_SIZE 18 /* * Maximum value for the extra_len field in the sense data. */ #define SSD_EXTRA_MAX 244 }; /* * Fixed format sense data. */ struct scsi_sense_data_fixed { u_int8_t error_code; #define SSD_ERRCODE 0x7F #define SSD_CURRENT_ERROR 0x70 #define SSD_DEFERRED_ERROR 0x71 #define SSD_ERRCODE_VALID 0x80 u_int8_t segment; u_int8_t flags; #define SSD_KEY 0x0F #define SSD_KEY_NO_SENSE 0x00 #define SSD_KEY_RECOVERED_ERROR 0x01 #define SSD_KEY_NOT_READY 0x02 #define SSD_KEY_MEDIUM_ERROR 0x03 #define SSD_KEY_HARDWARE_ERROR 0x04 #define SSD_KEY_ILLEGAL_REQUEST 0x05 #define SSD_KEY_UNIT_ATTENTION 0x06 #define SSD_KEY_DATA_PROTECT 0x07 #define SSD_KEY_BLANK_CHECK 0x08 #define SSD_KEY_Vendor_Specific 0x09 #define SSD_KEY_COPY_ABORTED 0x0a #define SSD_KEY_ABORTED_COMMAND 0x0b #define SSD_KEY_EQUAL 0x0c #define SSD_KEY_VOLUME_OVERFLOW 0x0d #define SSD_KEY_MISCOMPARE 0x0e #define SSD_KEY_COMPLETED 0x0f #define SSD_SDAT_OVFL 0x10 #define SSD_ILI 0x20 #define SSD_EOM 0x40 #define SSD_FILEMARK 0x80 u_int8_t info[4]; u_int8_t extra_len; u_int8_t cmd_spec_info[4]; u_int8_t add_sense_code; u_int8_t add_sense_code_qual; u_int8_t fru; u_int8_t sense_key_spec[3]; #define SSD_SCS_VALID 0x80 #define SSD_FIELDPTR_CMD 0x40 #define SSD_BITPTR_VALID 0x08 #define SSD_BITPTR_VALUE 0x07 u_int8_t extra_bytes[14]; #define SSD_FIXED_IS_PRESENT(sense, length, field) \ ((length >= (offsetof(struct scsi_sense_data_fixed, field) + \ sizeof(sense->field))) ? 1 :0) #define SSD_FIXED_IS_FILLED(sense, field) \ ((((offsetof(struct scsi_sense_data_fixed, field) + \ sizeof(sense->field)) - \ (offsetof(struct scsi_sense_data_fixed, extra_len) + \ sizeof(sense->extra_len))) <= sense->extra_len) ? 1 : 0) }; /* * Descriptor format sense data definitions. * Introduced in SPC-3. */ struct scsi_sense_data_desc { uint8_t error_code; #define SSD_DESC_CURRENT_ERROR 0x72 #define SSD_DESC_DEFERRED_ERROR 0x73 uint8_t sense_key; uint8_t add_sense_code; uint8_t add_sense_code_qual; uint8_t flags; #define SSDD_SDAT_OVFL 0x80 uint8_t reserved[2]; /* * Note that SPC-4, section 4.5.2.1 says that the extra_len field * must be less than or equal to 244. */ uint8_t extra_len; uint8_t sense_desc[0]; #define SSD_DESC_IS_PRESENT(sense, length, field) \ ((length >= (offsetof(struct scsi_sense_data_desc, field) + \ sizeof(sense->field))) ? 1 :0) }; struct scsi_sense_desc_header { uint8_t desc_type; uint8_t length; }; /* * The information provide in the Information descriptor is device type or * command specific information, and defined in a command standard. * * Note that any changes to the field names or positions in this structure, * even reserved fields, should be accompanied by an examination of the * code in ctl_set_sense() that uses them. * * Maximum descriptors allowed: 1 (as of SPC-4) */ struct scsi_sense_info { uint8_t desc_type; #define SSD_DESC_INFO 0x00 uint8_t length; uint8_t byte2; #define SSD_INFO_VALID 0x80 uint8_t reserved; uint8_t info[8]; }; /* * Command-specific information depends on the command for which the * reported condition occurred. * * Note that any changes to the field names or positions in this structure, * even reserved fields, should be accompanied by an examination of the * code in ctl_set_sense() that uses them. * * Maximum descriptors allowed: 1 (as of SPC-4) */ struct scsi_sense_command { uint8_t desc_type; #define SSD_DESC_COMMAND 0x01 uint8_t length; uint8_t reserved[2]; uint8_t command_info[8]; }; /* * Sense key specific descriptor. The sense key specific data format * depends on the sense key in question. * * Maximum descriptors allowed: 1 (as of SPC-4) */ struct scsi_sense_sks { uint8_t desc_type; #define SSD_DESC_SKS 0x02 uint8_t length; uint8_t reserved1[2]; uint8_t sense_key_spec[3]; #define SSD_SKS_VALID 0x80 uint8_t reserved2; }; /* * This is used for the Illegal Request sense key (0x05) only. */ struct scsi_sense_sks_field { uint8_t byte0; #define SSD_SKS_FIELD_VALID 0x80 #define SSD_SKS_FIELD_CMD 0x40 #define SSD_SKS_BPV 0x08 #define SSD_SKS_BIT_VALUE 0x07 uint8_t field[2]; }; /* * This is used for the Hardware Error (0x04), Medium Error (0x03) and * Recovered Error (0x01) sense keys. */ struct scsi_sense_sks_retry { uint8_t byte0; #define SSD_SKS_RETRY_VALID 0x80 uint8_t actual_retry_count[2]; }; /* * Used with the NO Sense (0x00) or Not Ready (0x02) sense keys. */ struct scsi_sense_sks_progress { uint8_t byte0; #define SSD_SKS_PROGRESS_VALID 0x80 uint8_t progress[2]; #define SSD_SKS_PROGRESS_DENOM 0x10000 }; /* * Used with the Copy Aborted (0x0a) sense key. */ struct scsi_sense_sks_segment { uint8_t byte0; #define SSD_SKS_SEGMENT_VALID 0x80 #define SSD_SKS_SEGMENT_SD 0x20 #define SSD_SKS_SEGMENT_BPV 0x08 #define SSD_SKS_SEGMENT_BITPTR 0x07 uint8_t field[2]; }; /* * Used with the Unit Attention (0x06) sense key. * * This is currently used to indicate that the unit attention condition * queue has overflowed (when the overflow bit is set). */ struct scsi_sense_sks_overflow { uint8_t byte0; #define SSD_SKS_OVERFLOW_VALID 0x80 #define SSD_SKS_OVERFLOW_SET 0x01 uint8_t reserved[2]; }; /* * This specifies which component is associated with the sense data. There * is no standard meaning for the fru value. * * Maximum descriptors allowed: 1 (as of SPC-4) */ struct scsi_sense_fru { uint8_t desc_type; #define SSD_DESC_FRU 0x03 uint8_t length; uint8_t reserved; uint8_t fru; }; /* * Used for Stream commands, defined in SSC-4. * * Maximum descriptors allowed: 1 (as of SPC-4) */ struct scsi_sense_stream { uint8_t desc_type; #define SSD_DESC_STREAM 0x04 uint8_t length; uint8_t reserved; uint8_t byte3; #define SSD_DESC_STREAM_FM 0x80 #define SSD_DESC_STREAM_EOM 0x40 #define SSD_DESC_STREAM_ILI 0x20 }; /* * Used for Block commands, defined in SBC-3. * * This is currently (as of SBC-3) only used for the Incorrect Length * Indication (ILI) bit, which says that the data length requested in the * READ LONG or WRITE LONG command did not match the length of the logical * block. * * Maximum descriptors allowed: 1 (as of SPC-4) */ struct scsi_sense_block { uint8_t desc_type; #define SSD_DESC_BLOCK 0x05 uint8_t length; uint8_t reserved; uint8_t byte3; #define SSD_DESC_BLOCK_ILI 0x20 }; /* * Used for Object-Based Storage Devices (OSD-3). * * Maximum descriptors allowed: 1 (as of SPC-4) */ struct scsi_sense_osd_objid { uint8_t desc_type; #define SSD_DESC_OSD_OBJID 0x06 uint8_t length; uint8_t reserved[6]; /* * XXX KDM provide the bit definitions here? There are a lot of * them, and we don't have an OSD driver yet. */ uint8_t not_init_cmds[4]; uint8_t completed_cmds[4]; uint8_t partition_id[8]; uint8_t object_id[8]; }; /* * Used for Object-Based Storage Devices (OSD-3). * * Maximum descriptors allowed: 1 (as of SPC-4) */ struct scsi_sense_osd_integrity { uint8_t desc_type; #define SSD_DESC_OSD_INTEGRITY 0x07 uint8_t length; uint8_t integ_check_val[32]; }; /* * Used for Object-Based Storage Devices (OSD-3). * * Maximum descriptors allowed: 1 (as of SPC-4) */ struct scsi_sense_osd_attr_id { uint8_t desc_type; #define SSD_DESC_OSD_ATTR_ID 0x08 uint8_t length; uint8_t reserved[2]; uint8_t attr_desc[0]; }; /* * ATA Return descriptor, used for the SCSI ATA PASS-THROUGH(12), (16) and * (32) commands. Described in SAT-4r05. */ struct scsi_sense_ata_ret_desc { uint8_t desc_type; #define SSD_DESC_ATA 0x09 uint8_t length; uint8_t flags; #define SSD_DESC_ATA_FLAG_EXTEND 0x01 uint8_t error; uint8_t count_15_8; uint8_t count_7_0; uint8_t lba_31_24; uint8_t lba_7_0; uint8_t lba_39_32; uint8_t lba_15_8; uint8_t lba_47_40; uint8_t lba_23_16; uint8_t device; uint8_t status; }; /* * Used with Sense keys No Sense (0x00) and Not Ready (0x02). * * Maximum descriptors allowed: 32 (as of SPC-4) */ struct scsi_sense_progress { uint8_t desc_type; #define SSD_DESC_PROGRESS 0x0a uint8_t length; uint8_t sense_key; uint8_t add_sense_code; uint8_t add_sense_code_qual; uint8_t reserved; uint8_t progress[2]; }; /* * This is typically forwarded as the result of an EXTENDED COPY command. * * Maximum descriptors allowed: 2 (as of SPC-4) */ struct scsi_sense_forwarded { uint8_t desc_type; #define SSD_DESC_FORWARDED 0x0c uint8_t length; uint8_t byte2; #define SSD_FORWARDED_FSDT 0x80 #define SSD_FORWARDED_SDS_MASK 0x0f #define SSD_FORWARDED_SDS_UNK 0x00 #define SSD_FORWARDED_SDS_EXSRC 0x01 #define SSD_FORWARDED_SDS_EXDST 0x02 uint8_t status; uint8_t sense_data[]; }; /* * Vendor-specific sense descriptor. The desc_type field will be in the * range between MIN and MAX inclusive. */ struct scsi_sense_vendor { uint8_t desc_type; #define SSD_DESC_VENDOR_MIN 0x80 #define SSD_DESC_VENDOR_MAX 0xff uint8_t length; uint8_t data[0]; }; struct scsi_mode_header_6 { u_int8_t data_length; /* Sense data length */ u_int8_t medium_type; u_int8_t dev_spec; u_int8_t blk_desc_len; }; struct scsi_mode_header_10 { u_int8_t data_length[2];/* Sense data length */ u_int8_t medium_type; u_int8_t dev_spec; u_int8_t unused[2]; u_int8_t blk_desc_len[2]; }; struct scsi_mode_page_header { u_int8_t page_code; #define SMPH_PS 0x80 #define SMPH_SPF 0x40 #define SMPH_PC_MASK 0x3f u_int8_t page_length; }; struct scsi_mode_page_header_sp { uint8_t page_code; uint8_t subpage; uint8_t page_length[2]; }; struct scsi_mode_blk_desc { u_int8_t density; u_int8_t nblocks[3]; u_int8_t reserved; u_int8_t blklen[3]; }; #define SCSI_DEFAULT_DENSITY 0x00 /* use 'default' density */ #define SCSI_SAME_DENSITY 0x7f /* use 'same' density- >= SCSI-2 only */ /* * Status Byte */ #define SCSI_STATUS_OK 0x00 #define SCSI_STATUS_CHECK_COND 0x02 #define SCSI_STATUS_COND_MET 0x04 #define SCSI_STATUS_BUSY 0x08 #define SCSI_STATUS_INTERMED 0x10 #define SCSI_STATUS_INTERMED_COND_MET 0x14 #define SCSI_STATUS_RESERV_CONFLICT 0x18 #define SCSI_STATUS_CMD_TERMINATED 0x22 /* Obsolete in SAM-2 */ #define SCSI_STATUS_QUEUE_FULL 0x28 #define SCSI_STATUS_ACA_ACTIVE 0x30 #define SCSI_STATUS_TASK_ABORTED 0x40 struct scsi_inquiry_pattern { u_int8_t type; u_int8_t media_type; #define SIP_MEDIA_REMOVABLE 0x01 #define SIP_MEDIA_FIXED 0x02 const char *vendor; const char *product; const char *revision; }; struct scsi_static_inquiry_pattern { u_int8_t type; u_int8_t media_type; char vendor[SID_VENDOR_SIZE+1]; char product[SID_PRODUCT_SIZE+1]; char revision[SID_REVISION_SIZE+1]; }; struct scsi_sense_quirk_entry { struct scsi_inquiry_pattern inq_pat; int num_sense_keys; int num_ascs; struct sense_key_table_entry *sense_key_info; struct asc_table_entry *asc_info; }; struct sense_key_table_entry { u_int8_t sense_key; u_int32_t action; const char *desc; }; struct asc_table_entry { u_int8_t asc; u_int8_t ascq; u_int32_t action; const char *desc; }; struct op_table_entry { u_int8_t opcode; u_int32_t opmask; const char *desc; }; struct scsi_op_quirk_entry { struct scsi_inquiry_pattern inq_pat; int num_ops; struct op_table_entry *op_table; }; typedef enum { SSS_FLAG_NONE = 0x00, SSS_FLAG_PRINT_COMMAND = 0x01 } scsi_sense_string_flags; struct scsi_nv { const char *name; uint64_t value; }; typedef enum { SCSI_NV_FOUND, SCSI_NV_AMBIGUOUS, SCSI_NV_NOT_FOUND } scsi_nv_status; typedef enum { SCSI_NV_FLAG_NONE = 0x00, SCSI_NV_FLAG_IG_CASE = 0x01 /* Case insensitive comparison */ } scsi_nv_flags; struct ccb_scsiio; struct cam_periph; union ccb; #ifndef _KERNEL struct cam_device; #endif extern const char *scsi_sense_key_text[]; __BEGIN_DECLS void scsi_sense_desc(int sense_key, int asc, int ascq, struct scsi_inquiry_data *inq_data, const char **sense_key_desc, const char **asc_desc); scsi_sense_action scsi_error_action(struct ccb_scsiio* csio, struct scsi_inquiry_data *inq_data, u_int32_t sense_flags); const char * scsi_status_string(struct ccb_scsiio *csio); void scsi_desc_iterate(struct scsi_sense_data_desc *sense, u_int sense_len, int (*iter_func)(struct scsi_sense_data_desc *sense, u_int, struct scsi_sense_desc_header *, void *), void *arg); uint8_t *scsi_find_desc(struct scsi_sense_data_desc *sense, u_int sense_len, uint8_t desc_type); void scsi_set_sense_data(struct scsi_sense_data *sense_data, scsi_sense_data_type sense_format, int current_error, int sense_key, int asc, int ascq, ...) ; void scsi_set_sense_data_len(struct scsi_sense_data *sense_data, u_int *sense_len, scsi_sense_data_type sense_format, int current_error, int sense_key, int asc, int ascq, ...) ; void scsi_set_sense_data_va(struct scsi_sense_data *sense_data, u_int *sense_len, scsi_sense_data_type sense_format, int current_error, int sense_key, int asc, int ascq, va_list ap); int scsi_get_sense_info(struct scsi_sense_data *sense_data, u_int sense_len, uint8_t info_type, uint64_t *info, int64_t *signed_info); int scsi_get_sks(struct scsi_sense_data *sense_data, u_int sense_len, uint8_t *sks); int scsi_get_block_info(struct scsi_sense_data *sense_data, u_int sense_len, struct scsi_inquiry_data *inq_data, uint8_t *block_bits); int scsi_get_stream_info(struct scsi_sense_data *sense_data, u_int sense_len, struct scsi_inquiry_data *inq_data, uint8_t *stream_bits); void scsi_info_sbuf(struct sbuf *sb, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, uint64_t info); void scsi_command_sbuf(struct sbuf *sb, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, uint64_t csi); void scsi_progress_sbuf(struct sbuf *sb, uint16_t progress); int scsi_sks_sbuf(struct sbuf *sb, int sense_key, uint8_t *sks); void scsi_fru_sbuf(struct sbuf *sb, uint64_t fru); void scsi_stream_sbuf(struct sbuf *sb, uint8_t stream_bits); void scsi_block_sbuf(struct sbuf *sb, uint8_t block_bits); void scsi_sense_info_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); void scsi_sense_command_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); void scsi_sense_sks_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); void scsi_sense_fru_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); void scsi_sense_stream_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); void scsi_sense_block_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); void scsi_sense_progress_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); void scsi_sense_ata_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); void scsi_sense_forwarded_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); void scsi_sense_generic_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); void scsi_sense_desc_sbuf(struct sbuf *sb, struct scsi_sense_data *sense, u_int sense_len, uint8_t *cdb, int cdb_len, struct scsi_inquiry_data *inq_data, struct scsi_sense_desc_header *header); scsi_sense_data_type scsi_sense_type(struct scsi_sense_data *sense_data); void scsi_sense_only_sbuf(struct scsi_sense_data *sense, u_int sense_len, struct sbuf *sb, char *path_str, struct scsi_inquiry_data *inq_data, uint8_t *cdb, int cdb_len); #ifdef _KERNEL int scsi_command_string(struct ccb_scsiio *csio, struct sbuf *sb); int scsi_sense_sbuf(struct ccb_scsiio *csio, struct sbuf *sb, scsi_sense_string_flags flags); char * scsi_sense_string(struct ccb_scsiio *csio, char *str, int str_len); void scsi_sense_print(struct ccb_scsiio *csio); int scsi_vpd_supported_page(struct cam_periph *periph, uint8_t page_id); #else /* _KERNEL */ int scsi_command_string(struct cam_device *device, struct ccb_scsiio *csio, struct sbuf *sb); int scsi_sense_sbuf(struct cam_device *device, struct ccb_scsiio *csio, struct sbuf *sb, scsi_sense_string_flags flags); char * scsi_sense_string(struct cam_device *device, struct ccb_scsiio *csio, char *str, int str_len); void scsi_sense_print(struct cam_device *device, struct ccb_scsiio *csio, FILE *ofile); #endif /* _KERNEL */ const char * scsi_op_desc(u_int16_t opcode, struct scsi_inquiry_data *inq_data); char * scsi_cdb_string(u_int8_t *cdb_ptr, char *cdb_string, size_t len); void scsi_cdb_sbuf(u_int8_t *cdb_ptr, struct sbuf *sb); void scsi_print_inquiry(struct scsi_inquiry_data *inq_data); void scsi_print_inquiry_sbuf(struct sbuf *sb, struct scsi_inquiry_data *inq_data); void scsi_print_inquiry_short(struct scsi_inquiry_data *inq_data); void scsi_print_inquiry_short_sbuf(struct sbuf *sb, struct scsi_inquiry_data *inq_data); u_int scsi_calc_syncsrate(u_int period_factor); u_int scsi_calc_syncparam(u_int period); typedef int (*scsi_devid_checkfn_t)(uint8_t *); int scsi_devid_is_naa_ieee_reg(uint8_t *bufp); int scsi_devid_is_sas_target(uint8_t *bufp); int scsi_devid_is_lun_eui64(uint8_t *bufp); int scsi_devid_is_lun_naa(uint8_t *bufp); int scsi_devid_is_lun_name(uint8_t *bufp); int scsi_devid_is_lun_t10(uint8_t *bufp); int scsi_devid_is_lun_md5(uint8_t *bufp); int scsi_devid_is_lun_uuid(uint8_t *bufp); int scsi_devid_is_port_naa(uint8_t *bufp); struct scsi_vpd_id_descriptor * scsi_get_devid(struct scsi_vpd_device_id *id, uint32_t len, scsi_devid_checkfn_t ck_fn); struct scsi_vpd_id_descriptor * scsi_get_devid_desc(struct scsi_vpd_id_descriptor *desc, uint32_t len, scsi_devid_checkfn_t ck_fn); int scsi_transportid_sbuf(struct sbuf *sb, struct scsi_transportid_header *hdr, uint32_t valid_len); const char * scsi_nv_to_str(struct scsi_nv *table, int num_table_entries, uint64_t value); scsi_nv_status scsi_get_nv(struct scsi_nv *table, int num_table_entries, char *name, int *table_entry, scsi_nv_flags flags); int scsi_parse_transportid_64bit(int proto_id, char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len); int scsi_parse_transportid_spi(char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len); int scsi_parse_transportid_rdma(char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len); int scsi_parse_transportid_iscsi(char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str,int error_str_len); int scsi_parse_transportid_sop(char *id_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str,int error_str_len); int scsi_parse_transportid(char *transportid_str, struct scsi_transportid_header **hdr, unsigned int *alloc_len, #ifdef _KERNEL struct malloc_type *type, int flags, #endif char *error_str, int error_str_len); int scsi_attrib_volcoh_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len); int scsi_attrib_vendser_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len); int scsi_attrib_hexdump_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len); int scsi_attrib_int_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len); int scsi_attrib_ascii_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len); int scsi_attrib_text_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, uint32_t flags, uint32_t output_flags, char *error_str, int error_str_len); struct scsi_attrib_table_entry *scsi_find_attrib_entry( struct scsi_attrib_table_entry *table, size_t num_table_entries, uint32_t id); struct scsi_attrib_table_entry *scsi_get_attrib_entry(uint32_t id); int scsi_attrib_value_sbuf(struct sbuf *sb, uint32_t valid_len, struct scsi_mam_attribute_header *hdr, uint32_t output_flags, char *error_str, size_t error_str_len); void scsi_attrib_prefix_sbuf(struct sbuf *sb, uint32_t output_flags, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, const char *desc); int scsi_attrib_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr, uint32_t valid_len, struct scsi_attrib_table_entry *user_table, size_t num_user_entries, int prefer_user_table, uint32_t output_flags, char *error_str, int error_str_len); void scsi_test_unit_ready(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t sense_len, u_int32_t timeout); void scsi_request_sense(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), void *data_ptr, u_int8_t dxfer_len, u_int8_t tag_action, u_int8_t sense_len, u_int32_t timeout); void scsi_inquiry(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t *inq_buf, u_int32_t inq_len, int evpd, u_int8_t page_code, u_int8_t sense_len, u_int32_t timeout); void scsi_mode_sense(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int dbd, uint8_t pc, uint8_t page, uint8_t *param_buf, uint32_t param_len, uint8_t sense_len, uint32_t timeout); void scsi_mode_sense_len(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int dbd, uint8_t pc, uint8_t page, uint8_t *param_buf, uint32_t param_len, int minimum_cmd_size, uint8_t sense_len, uint32_t timeout); void scsi_mode_sense_subpage(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int dbd, uint8_t pc, uint8_t page, uint8_t subpage, uint8_t *param_buf, uint32_t param_len, int minimum_cmd_size, uint8_t sense_len, uint32_t timeout); void scsi_mode_select(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int scsi_page_fmt, int save_pages, u_int8_t *param_buf, u_int32_t param_len, u_int8_t sense_len, u_int32_t timeout); void scsi_mode_select_len(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int scsi_page_fmt, int save_pages, u_int8_t *param_buf, u_int32_t param_len, int minimum_cmd_size, u_int8_t sense_len, u_int32_t timeout); void scsi_log_sense(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t page_code, u_int8_t page, int save_pages, int ppc, u_int32_t paramptr, u_int8_t *param_buf, u_int32_t param_len, u_int8_t sense_len, u_int32_t timeout); void scsi_log_select(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t page_code, int save_pages, int pc_reset, u_int8_t *param_buf, u_int32_t param_len, u_int8_t sense_len, u_int32_t timeout); void scsi_prevent(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t action, u_int8_t sense_len, u_int32_t timeout); void scsi_read_capacity(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, struct scsi_read_capacity_data *, u_int8_t sense_len, u_int32_t timeout); void scsi_read_capacity_16(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint64_t lba, int reladr, int pmi, uint8_t *rcap_buf, int rcap_buf_len, uint8_t sense_len, uint32_t timeout); void scsi_report_luns(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t select_report, struct scsi_report_luns_data *rpl_buf, u_int32_t alloc_len, u_int8_t sense_len, u_int32_t timeout); void scsi_report_target_group(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t pdf, void *buf, u_int32_t alloc_len, u_int8_t sense_len, u_int32_t timeout); void scsi_report_timestamp(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t pdf, void *buf, u_int32_t alloc_len, u_int8_t sense_len, u_int32_t timeout); void scsi_set_target_group(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, void *buf, u_int32_t alloc_len, u_int8_t sense_len, u_int32_t timeout); void scsi_create_timestamp(uint8_t *timestamp_6b_buf, uint64_t timestamp); void scsi_set_timestamp(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, void *buf, u_int32_t alloc_len, u_int8_t sense_len, u_int32_t timeout); void scsi_synchronize_cache(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int32_t begin_lba, u_int16_t lb_count, u_int8_t sense_len, u_int32_t timeout); void scsi_receive_diagnostic_results(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb*), uint8_t tag_action, int pcv, uint8_t page_code, uint8_t *data_ptr, uint16_t allocation_length, uint8_t sense_len, uint32_t timeout); void scsi_send_diagnostic(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int unit_offline, int device_offline, int self_test, int page_format, int self_test_code, uint8_t *data_ptr, uint16_t param_list_length, uint8_t sense_len, uint32_t timeout); void scsi_read_buffer(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb*), uint8_t tag_action, int mode, uint8_t buffer_id, u_int32_t offset, uint8_t *data_ptr, uint32_t allocation_length, uint8_t sense_len, uint32_t timeout); void scsi_write_buffer(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int mode, uint8_t buffer_id, u_int32_t offset, uint8_t *data_ptr, uint32_t param_list_length, uint8_t sense_len, uint32_t timeout); #define SCSI_RW_READ 0x0001 #define SCSI_RW_WRITE 0x0002 #define SCSI_RW_DIRMASK 0x0003 #define SCSI_RW_BIO 0x1000 void scsi_read_write(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int readop, u_int8_t byte2, int minimum_cmd_size, u_int64_t lba, u_int32_t block_count, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len, u_int32_t timeout); void scsi_write_same(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t byte2, int minimum_cmd_size, u_int64_t lba, u_int32_t block_count, u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len, u_int32_t timeout); void scsi_ata_identify(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t *data_ptr, u_int16_t dxfer_len, u_int8_t sense_len, u_int32_t timeout); void scsi_ata_trim(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int16_t block_count, u_int8_t *data_ptr, u_int16_t dxfer_len, u_int8_t sense_len, u_int32_t timeout); int scsi_ata_read_log(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint32_t log_address, uint32_t page_number, uint16_t block_count, uint8_t protocol, uint8_t *data_ptr, uint32_t dxfer_len, uint8_t sense_len, uint32_t timeout); int scsi_ata_setfeatures(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint8_t feature, uint64_t lba, uint32_t count, uint8_t sense_len, uint32_t timeout); int scsi_ata_pass(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint32_t flags, uint8_t tag_action, uint8_t protocol, uint8_t ata_flags, uint16_t features, uint16_t sector_count, uint64_t lba, uint8_t command, uint8_t device, uint8_t icc, uint32_t auxiliary, uint8_t control, u_int8_t *data_ptr, uint32_t dxfer_len, uint8_t *cdb_storage, size_t cdb_storage_len, int minimum_cmd_size, u_int8_t sense_len, u_int32_t timeout); void scsi_ata_pass_16(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int32_t flags, u_int8_t tag_action, u_int8_t protocol, u_int8_t ata_flags, u_int16_t features, u_int16_t sector_count, uint64_t lba, u_int8_t command, u_int8_t control, u_int8_t *data_ptr, u_int16_t dxfer_len, u_int8_t sense_len, u_int32_t timeout); void scsi_unmap(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t byte2, u_int8_t *data_ptr, u_int16_t dxfer_len, u_int8_t sense_len, u_int32_t timeout); void scsi_start_stop(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, int start, int load_eject, int immediate, u_int8_t sense_len, u_int32_t timeout); void scsi_read_attribute(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, u_int8_t service_action, uint32_t element, u_int8_t elem_type, int logical_volume, int partition, u_int32_t first_attribute, int cache, u_int8_t *data_ptr, u_int32_t length, int sense_len, u_int32_t timeout); void scsi_write_attribute(struct ccb_scsiio *csio, u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), u_int8_t tag_action, uint32_t element, int logical_volume, int partition, int wtc, u_int8_t *data_ptr, u_int32_t length, int sense_len, u_int32_t timeout); void scsi_security_protocol_in(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, uint32_t security_protocol, uint32_t security_protocol_specific, int byte4, uint8_t *data_ptr, uint32_t dxfer_len, int sense_len, int timeout); void scsi_security_protocol_out(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *,union ccb *), uint8_t tag_action, uint32_t security_protocol, uint32_t security_protocol_specific, int byte4, uint8_t *data_ptr, uint32_t dxfer_len, int sense_len, int timeout); void scsi_persistent_reserve_in(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *,union ccb *), uint8_t tag_action, int service_action, uint8_t *data_ptr, uint32_t dxfer_len, int sense_len, int timeout); void scsi_persistent_reserve_out(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int service_action, int scope, int res_type, uint8_t *data_ptr, uint32_t dxfer_len, int sense_len, int timeout); void scsi_report_supported_opcodes(struct ccb_scsiio *csio, uint32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action, int options, int req_opcode, int req_service_action, uint8_t *data_ptr, uint32_t dxfer_len, int sense_len, int timeout); int scsi_inquiry_match(caddr_t inqbuffer, caddr_t table_entry); int scsi_static_inquiry_match(caddr_t inqbuffer, caddr_t table_entry); int scsi_devid_match(uint8_t *rhs, size_t rhs_len, uint8_t *lhs, size_t lhs_len); void scsi_extract_sense(struct scsi_sense_data *sense, int *error_code, int *sense_key, int *asc, int *ascq); int scsi_extract_sense_ccb(union ccb *ccb, int *error_code, int *sense_key, int *asc, int *ascq); void scsi_extract_sense_len(struct scsi_sense_data *sense, u_int sense_len, int *error_code, int *sense_key, int *asc, int *ascq, int show_errors); int scsi_get_sense_key(struct scsi_sense_data *sense, u_int sense_len, int show_errors); int scsi_get_asc(struct scsi_sense_data *sense, u_int sense_len, int show_errors); int scsi_get_ascq(struct scsi_sense_data *sense, u_int sense_len, int show_errors); static __inline void scsi_ulto2b(u_int32_t val, u_int8_t *bytes); static __inline void scsi_ulto3b(u_int32_t val, u_int8_t *bytes); static __inline void scsi_ulto4b(u_int32_t val, u_int8_t *bytes); static __inline void scsi_u64to8b(u_int64_t val, u_int8_t *bytes); static __inline uint32_t scsi_2btoul(const uint8_t *bytes); static __inline uint32_t scsi_3btoul(const uint8_t *bytes); static __inline int32_t scsi_3btol(const uint8_t *bytes); static __inline uint32_t scsi_4btoul(const uint8_t *bytes); static __inline uint64_t scsi_8btou64(const uint8_t *bytes); static __inline void *find_mode_page_6(struct scsi_mode_header_6 *mode_header); static __inline void *find_mode_page_10(struct scsi_mode_header_10 *mode_header); static __inline void scsi_ulto2b(u_int32_t val, u_int8_t *bytes) { bytes[0] = (val >> 8) & 0xff; bytes[1] = val & 0xff; } static __inline void scsi_ulto3b(u_int32_t val, u_int8_t *bytes) { bytes[0] = (val >> 16) & 0xff; bytes[1] = (val >> 8) & 0xff; bytes[2] = val & 0xff; } static __inline void scsi_ulto4b(u_int32_t val, u_int8_t *bytes) { bytes[0] = (val >> 24) & 0xff; bytes[1] = (val >> 16) & 0xff; bytes[2] = (val >> 8) & 0xff; bytes[3] = val & 0xff; } static __inline void scsi_u64to8b(u_int64_t val, u_int8_t *bytes) { bytes[0] = (val >> 56) & 0xff; bytes[1] = (val >> 48) & 0xff; bytes[2] = (val >> 40) & 0xff; bytes[3] = (val >> 32) & 0xff; bytes[4] = (val >> 24) & 0xff; bytes[5] = (val >> 16) & 0xff; bytes[6] = (val >> 8) & 0xff; bytes[7] = val & 0xff; } static __inline uint32_t scsi_2btoul(const uint8_t *bytes) { uint32_t rv; rv = (bytes[0] << 8) | bytes[1]; return (rv); } static __inline uint32_t scsi_3btoul(const uint8_t *bytes) { uint32_t rv; rv = (bytes[0] << 16) | (bytes[1] << 8) | bytes[2]; return (rv); } static __inline int32_t scsi_3btol(const uint8_t *bytes) { uint32_t rc = scsi_3btoul(bytes); if (rc & 0x00800000) rc |= 0xff000000; return (int32_t) rc; } static __inline uint32_t scsi_4btoul(const uint8_t *bytes) { uint32_t rv; rv = (bytes[0] << 24) | (bytes[1] << 16) | (bytes[2] << 8) | bytes[3]; return (rv); } static __inline uint64_t scsi_8btou64(const uint8_t *bytes) { uint64_t rv; rv = (((uint64_t)bytes[0]) << 56) | (((uint64_t)bytes[1]) << 48) | (((uint64_t)bytes[2]) << 40) | (((uint64_t)bytes[3]) << 32) | (((uint64_t)bytes[4]) << 24) | (((uint64_t)bytes[5]) << 16) | (((uint64_t)bytes[6]) << 8) | bytes[7]; return (rv); } /* * Given the pointer to a returned mode sense buffer, return a pointer to * the start of the first mode page. */ static __inline void * find_mode_page_6(struct scsi_mode_header_6 *mode_header) { void *page_start; page_start = (void *)((u_int8_t *)&mode_header[1] + mode_header->blk_desc_len); return(page_start); } static __inline void * find_mode_page_10(struct scsi_mode_header_10 *mode_header) { void *page_start; page_start = (void *)((u_int8_t *)&mode_header[1] + scsi_2btoul(mode_header->blk_desc_len)); return(page_start); } __END_DECLS #endif /*_SCSI_SCSI_ALL_H*/ Index: projects/nfsv42/sys/dev/nvme/nvme_ctrlr.c =================================================================== --- projects/nfsv42/sys/dev/nvme/nvme_ctrlr.c (revision 350367) +++ projects/nfsv42/sys/dev/nvme/nvme_ctrlr.c (revision 350368) @@ -1,1418 +1,1418 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2012-2016 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_cam.h" #include #include #include #include #include #include #include #include #include #include #include #include #include "nvme_private.h" #define B4_CHK_RDY_DELAY_MS 2300 /* work around controller bug */ static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr, struct nvme_async_event_request *aer); static void nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr); static int nvme_ctrlr_allocate_bar(struct nvme_controller *ctrlr) { ctrlr->resource_id = PCIR_BAR(0); ctrlr->resource = bus_alloc_resource_any(ctrlr->dev, SYS_RES_MEMORY, &ctrlr->resource_id, RF_ACTIVE); if(ctrlr->resource == NULL) { nvme_printf(ctrlr, "unable to allocate pci resource\n"); return (ENOMEM); } ctrlr->bus_tag = rman_get_bustag(ctrlr->resource); ctrlr->bus_handle = rman_get_bushandle(ctrlr->resource); ctrlr->regs = (struct nvme_registers *)ctrlr->bus_handle; /* * The NVMe spec allows for the MSI-X table to be placed behind * BAR 4/5, separate from the control/doorbell registers. Always * try to map this bar, because it must be mapped prior to calling * pci_alloc_msix(). If the table isn't behind BAR 4/5, * bus_alloc_resource() will just return NULL which is OK. */ ctrlr->bar4_resource_id = PCIR_BAR(4); ctrlr->bar4_resource = bus_alloc_resource_any(ctrlr->dev, SYS_RES_MEMORY, &ctrlr->bar4_resource_id, RF_ACTIVE); return (0); } static int nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr) { struct nvme_qpair *qpair; uint32_t num_entries; int error; qpair = &ctrlr->adminq; num_entries = NVME_ADMIN_ENTRIES; TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries); /* * If admin_entries was overridden to an invalid value, revert it * back to our default value. */ if (num_entries < NVME_MIN_ADMIN_ENTRIES || num_entries > NVME_MAX_ADMIN_ENTRIES) { nvme_printf(ctrlr, "invalid hw.nvme.admin_entries=%d " "specified\n", num_entries); num_entries = NVME_ADMIN_ENTRIES; } /* * The admin queue's max xfer size is treated differently than the * max I/O xfer size. 16KB is sufficient here - maybe even less? */ error = nvme_qpair_construct(qpair, 0, /* qpair ID */ 0, /* vector */ num_entries, NVME_ADMIN_TRACKERS, ctrlr); return (error); } static int nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr) { struct nvme_qpair *qpair; uint32_t cap_lo; uint16_t mqes; int i, error, num_entries, num_trackers; num_entries = NVME_IO_ENTRIES; TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries); /* * NVMe spec sets a hard limit of 64K max entries, but * devices may specify a smaller limit, so we need to check * the MQES field in the capabilities register. */ cap_lo = nvme_mmio_read_4(ctrlr, cap_lo); mqes = NVME_CAP_LO_MQES(cap_lo); num_entries = min(num_entries, mqes + 1); num_trackers = NVME_IO_TRACKERS; TUNABLE_INT_FETCH("hw.nvme.io_trackers", &num_trackers); num_trackers = max(num_trackers, NVME_MIN_IO_TRACKERS); num_trackers = min(num_trackers, NVME_MAX_IO_TRACKERS); /* * No need to have more trackers than entries in the submit queue. * Note also that for a queue size of N, we can only have (N-1) * commands outstanding, hence the "-1" here. */ num_trackers = min(num_trackers, (num_entries-1)); /* * Our best estimate for the maximum number of I/Os that we should * noramlly have in flight at one time. This should be viewed as a hint, * not a hard limit and will need to be revisitted when the upper layers * of the storage system grows multi-queue support. */ ctrlr->max_hw_pend_io = num_trackers * ctrlr->num_io_queues * 3 / 4; /* * This was calculated previously when setting up interrupts, but * a controller could theoretically support fewer I/O queues than * MSI-X vectors. So calculate again here just to be safe. */ ctrlr->num_cpus_per_ioq = howmany(mp_ncpus, ctrlr->num_io_queues); ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair), M_NVME, M_ZERO | M_WAITOK); for (i = 0; i < ctrlr->num_io_queues; i++) { qpair = &ctrlr->ioq[i]; /* * Admin queue has ID=0. IO queues start at ID=1 - * hence the 'i+1' here. * * For I/O queues, use the controller-wide max_xfer_size * calculated in nvme_attach(). */ error = nvme_qpair_construct(qpair, i+1, /* qpair ID */ ctrlr->msix_enabled ? i+1 : 0, /* vector */ num_entries, num_trackers, ctrlr); if (error) return (error); /* * Do not bother binding interrupts if we only have one I/O * interrupt thread for this controller. */ if (ctrlr->num_io_queues > 1) bus_bind_intr(ctrlr->dev, qpair->res, i * ctrlr->num_cpus_per_ioq); } return (0); } static void nvme_ctrlr_fail(struct nvme_controller *ctrlr) { int i; ctrlr->is_failed = TRUE; nvme_qpair_fail(&ctrlr->adminq); if (ctrlr->ioq != NULL) { for (i = 0; i < ctrlr->num_io_queues; i++) nvme_qpair_fail(&ctrlr->ioq[i]); } nvme_notify_fail_consumers(ctrlr); } void nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr, struct nvme_request *req) { mtx_lock(&ctrlr->lock); STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq); mtx_unlock(&ctrlr->lock); taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task); } static void nvme_ctrlr_fail_req_task(void *arg, int pending) { struct nvme_controller *ctrlr = arg; struct nvme_request *req; mtx_lock(&ctrlr->lock); while ((req = STAILQ_FIRST(&ctrlr->fail_req)) != NULL) { STAILQ_REMOVE_HEAD(&ctrlr->fail_req, stailq); mtx_unlock(&ctrlr->lock); nvme_qpair_manual_complete_request(req->qpair, req, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST); mtx_lock(&ctrlr->lock); } mtx_unlock(&ctrlr->lock); } static int nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr, int desired_val) { int ms_waited; uint32_t csts; csts = nvme_mmio_read_4(ctrlr, csts); ms_waited = 0; while (((csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK) != desired_val) { if (ms_waited++ > ctrlr->ready_timeout_in_ms) { nvme_printf(ctrlr, "controller ready did not become %d " "within %d ms\n", desired_val, ctrlr->ready_timeout_in_ms); return (ENXIO); } DELAY(1000); csts = nvme_mmio_read_4(ctrlr, csts); } return (0); } static int nvme_ctrlr_disable(struct nvme_controller *ctrlr) { uint32_t cc; uint32_t csts; uint8_t en, rdy; int err; cc = nvme_mmio_read_4(ctrlr, cc); csts = nvme_mmio_read_4(ctrlr, csts); en = (cc >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK; rdy = (csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK; /* * Per 3.1.5 in NVME 1.3 spec, transitioning CC.EN from 0 to 1 * when CSTS.RDY is 1 or transitioning CC.EN from 1 to 0 when * CSTS.RDY is 0 "has undefined results" So make sure that CSTS.RDY * isn't the desired value. Short circuit if we're already disabled. */ if (en == 1) { if (rdy == 0) { /* EN == 1, wait for RDY == 1 or fail */ err = nvme_ctrlr_wait_for_ready(ctrlr, 1); if (err != 0) return (err); } } else { /* EN == 0 already wait for RDY == 0 */ if (rdy == 0) return (0); else return (nvme_ctrlr_wait_for_ready(ctrlr, 0)); } cc &= ~NVME_CC_REG_EN_MASK; nvme_mmio_write_4(ctrlr, cc, cc); /* * Some drives have issues with accessing the mmio after we * disable, so delay for a bit after we write the bit to * cope with these issues. */ if (ctrlr->quirks & QUIRK_DELAY_B4_CHK_RDY) pause("nvmeR", B4_CHK_RDY_DELAY_MS * hz / 1000); return (nvme_ctrlr_wait_for_ready(ctrlr, 0)); } static int nvme_ctrlr_enable(struct nvme_controller *ctrlr) { uint32_t cc; uint32_t csts; uint32_t aqa; uint32_t qsize; uint8_t en, rdy; int err; cc = nvme_mmio_read_4(ctrlr, cc); csts = nvme_mmio_read_4(ctrlr, csts); en = (cc >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK; rdy = (csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK; /* * See note in nvme_ctrlr_disable. Short circuit if we're already enabled. */ if (en == 1) { if (rdy == 1) return (0); else return (nvme_ctrlr_wait_for_ready(ctrlr, 1)); } else { /* EN == 0 already wait for RDY == 0 or fail */ err = nvme_ctrlr_wait_for_ready(ctrlr, 0); if (err != 0) return (err); } nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr); DELAY(5000); nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr); DELAY(5000); /* acqs and asqs are 0-based. */ qsize = ctrlr->adminq.num_entries - 1; aqa = 0; aqa = (qsize & NVME_AQA_REG_ACQS_MASK) << NVME_AQA_REG_ACQS_SHIFT; aqa |= (qsize & NVME_AQA_REG_ASQS_MASK) << NVME_AQA_REG_ASQS_SHIFT; nvme_mmio_write_4(ctrlr, aqa, aqa); DELAY(5000); /* Initialization values for CC */ cc = 0; cc |= 1 << NVME_CC_REG_EN_SHIFT; cc |= 0 << NVME_CC_REG_CSS_SHIFT; cc |= 0 << NVME_CC_REG_AMS_SHIFT; cc |= 0 << NVME_CC_REG_SHN_SHIFT; cc |= 6 << NVME_CC_REG_IOSQES_SHIFT; /* SQ entry size == 64 == 2^6 */ cc |= 4 << NVME_CC_REG_IOCQES_SHIFT; /* CQ entry size == 16 == 2^4 */ /* This evaluates to 0, which is according to spec. */ cc |= (PAGE_SIZE >> 13) << NVME_CC_REG_MPS_SHIFT; nvme_mmio_write_4(ctrlr, cc, cc); return (nvme_ctrlr_wait_for_ready(ctrlr, 1)); } int nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr) { int i, err; nvme_admin_qpair_disable(&ctrlr->adminq); /* * I/O queues are not allocated before the initial HW * reset, so do not try to disable them. Use is_initialized * to determine if this is the initial HW reset. */ if (ctrlr->is_initialized) { for (i = 0; i < ctrlr->num_io_queues; i++) nvme_io_qpair_disable(&ctrlr->ioq[i]); } DELAY(100*1000); err = nvme_ctrlr_disable(ctrlr); if (err != 0) return err; return (nvme_ctrlr_enable(ctrlr)); } void nvme_ctrlr_reset(struct nvme_controller *ctrlr) { int cmpset; cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1); if (cmpset == 0 || ctrlr->is_failed) /* * Controller is already resetting or has failed. Return * immediately since there is no need to kick off another * reset in these cases. */ return; taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task); } static int nvme_ctrlr_identify(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; status.done = 0; nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata, nvme_completion_poll_cb, &status); while (!atomic_load_acq_int(&status.done)) pause("nvme", 1); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_identify_controller failed!\n"); return (ENXIO); } /* Convert data to host endian */ nvme_controller_data_swapbytes(&ctrlr->cdata); /* * Use MDTS to ensure our default max_xfer_size doesn't exceed what the * controller supports. */ if (ctrlr->cdata.mdts > 0) ctrlr->max_xfer_size = min(ctrlr->max_xfer_size, ctrlr->min_page_size * (1 << (ctrlr->cdata.mdts))); return (0); } static int nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; int cq_allocated, sq_allocated; status.done = 0; nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues, nvme_completion_poll_cb, &status); while (!atomic_load_acq_int(&status.done)) pause("nvme", 1); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_ctrlr_set_num_qpairs failed!\n"); return (ENXIO); } /* * Data in cdw0 is 0-based. * Lower 16-bits indicate number of submission queues allocated. * Upper 16-bits indicate number of completion queues allocated. */ sq_allocated = (status.cpl.cdw0 & 0xFFFF) + 1; cq_allocated = (status.cpl.cdw0 >> 16) + 1; /* * Controller may allocate more queues than we requested, * so use the minimum of the number requested and what was * actually allocated. */ ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated); ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated); return (0); } static int nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; struct nvme_qpair *qpair; int i; for (i = 0; i < ctrlr->num_io_queues; i++) { qpair = &ctrlr->ioq[i]; status.done = 0; nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, qpair->vector, nvme_completion_poll_cb, &status); while (!atomic_load_acq_int(&status.done)) pause("nvme", 1); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_create_io_cq failed!\n"); return (ENXIO); } status.done = 0; nvme_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status); while (!atomic_load_acq_int(&status.done)) pause("nvme", 1); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_create_io_sq failed!\n"); return (ENXIO); } } return (0); } static int nvme_ctrlr_destroy_qpairs(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; struct nvme_qpair *qpair; for (int i = 0; i < ctrlr->num_io_queues; i++) { qpair = &ctrlr->ioq[i]; status.done = 0; nvme_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, &status); while (!atomic_load_acq_int(&status.done)) pause("nvme", 1); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_destroy_io_sq failed!\n"); return (ENXIO); } status.done = 0; nvme_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status); while (!atomic_load_acq_int(&status.done)) pause("nvme", 1); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_destroy_io_cq failed!\n"); return (ENXIO); } } return (0); } static int nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr) { struct nvme_namespace *ns; uint32_t i; for (i = 0; i < min(ctrlr->cdata.nn, NVME_MAX_NAMESPACES); i++) { ns = &ctrlr->ns[i]; nvme_ns_construct(ns, i+1, ctrlr); } return (0); } static boolean_t is_log_page_id_valid(uint8_t page_id) { switch (page_id) { case NVME_LOG_ERROR: case NVME_LOG_HEALTH_INFORMATION: case NVME_LOG_FIRMWARE_SLOT: case NVME_LOG_CHANGED_NAMESPACE: return (TRUE); } return (FALSE); } static uint32_t nvme_ctrlr_get_log_page_size(struct nvme_controller *ctrlr, uint8_t page_id) { uint32_t log_page_size; switch (page_id) { case NVME_LOG_ERROR: log_page_size = min( sizeof(struct nvme_error_information_entry) * (ctrlr->cdata.elpe + 1), NVME_MAX_AER_LOG_SIZE); break; case NVME_LOG_HEALTH_INFORMATION: log_page_size = sizeof(struct nvme_health_information_page); break; case NVME_LOG_FIRMWARE_SLOT: log_page_size = sizeof(struct nvme_firmware_page); break; case NVME_LOG_CHANGED_NAMESPACE: log_page_size = sizeof(struct nvme_ns_list); break; default: log_page_size = 0; break; } return (log_page_size); } static void nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr, uint8_t state) { if (state & NVME_CRIT_WARN_ST_AVAILABLE_SPARE) nvme_printf(ctrlr, "available spare space below threshold\n"); if (state & NVME_CRIT_WARN_ST_TEMPERATURE) nvme_printf(ctrlr, "temperature above threshold\n"); if (state & NVME_CRIT_WARN_ST_DEVICE_RELIABILITY) nvme_printf(ctrlr, "device reliability degraded\n"); if (state & NVME_CRIT_WARN_ST_READ_ONLY) nvme_printf(ctrlr, "media placed in read only mode\n"); if (state & NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP) nvme_printf(ctrlr, "volatile memory backup device failed\n"); if (state & NVME_CRIT_WARN_ST_RESERVED_MASK) nvme_printf(ctrlr, "unknown critical warning(s): state = 0x%02x\n", state); } static void nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl) { struct nvme_async_event_request *aer = arg; struct nvme_health_information_page *health_info; struct nvme_ns_list *nsl; struct nvme_error_information_entry *err; int i; /* * If the log page fetch for some reason completed with an error, * don't pass log page data to the consumers. In practice, this case * should never happen. */ if (nvme_completion_is_error(cpl)) nvme_notify_async_consumers(aer->ctrlr, &aer->cpl, aer->log_page_id, NULL, 0); else { /* Convert data to host endian */ switch (aer->log_page_id) { case NVME_LOG_ERROR: err = (struct nvme_error_information_entry *)aer->log_page_buffer; for (i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++) nvme_error_information_entry_swapbytes(err++); break; case NVME_LOG_HEALTH_INFORMATION: nvme_health_information_page_swapbytes( (struct nvme_health_information_page *)aer->log_page_buffer); break; case NVME_LOG_FIRMWARE_SLOT: nvme_firmware_page_swapbytes( (struct nvme_firmware_page *)aer->log_page_buffer); break; case NVME_LOG_CHANGED_NAMESPACE: nvme_ns_list_swapbytes( (struct nvme_ns_list *)aer->log_page_buffer); break; case INTEL_LOG_TEMP_STATS: intel_log_temp_stats_swapbytes( (struct intel_log_temp_stats *)aer->log_page_buffer); break; default: break; } if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) { health_info = (struct nvme_health_information_page *) aer->log_page_buffer; nvme_ctrlr_log_critical_warnings(aer->ctrlr, health_info->critical_warning); /* * Critical warnings reported through the * SMART/health log page are persistent, so * clear the associated bits in the async event * config so that we do not receive repeated * notifications for the same event. */ aer->ctrlr->async_event_config &= ~health_info->critical_warning; nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr, aer->ctrlr->async_event_config, NULL, NULL); } else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE && !nvme_use_nvd) { nsl = (struct nvme_ns_list *)aer->log_page_buffer; for (i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) { if (nsl->ns[i] > NVME_MAX_NAMESPACES) break; nvme_notify_ns(aer->ctrlr, nsl->ns[i]); } } /* * Pass the cpl data from the original async event completion, * not the log page fetch. */ nvme_notify_async_consumers(aer->ctrlr, &aer->cpl, aer->log_page_id, aer->log_page_buffer, aer->log_page_size); } /* * Repost another asynchronous event request to replace the one * that just completed. */ nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer); } static void nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl) { struct nvme_async_event_request *aer = arg; if (nvme_completion_is_error(cpl)) { /* * Do not retry failed async event requests. This avoids * infinite loops where a new async event request is submitted * to replace the one just failed, only to fail again and * perpetuate the loop. */ return; } /* Associated log page is in bits 23:16 of completion entry dw0. */ aer->log_page_id = (cpl->cdw0 & 0xFF0000) >> 16; nvme_printf(aer->ctrlr, "async event occurred (type 0x%x, info 0x%02x," " page 0x%02x)\n", (cpl->cdw0 & 0x03), (cpl->cdw0 & 0xFF00) >> 8, aer->log_page_id); if (is_log_page_id_valid(aer->log_page_id)) { aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr, aer->log_page_id); memcpy(&aer->cpl, cpl, sizeof(*cpl)); nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id, NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer, aer->log_page_size, nvme_ctrlr_async_event_log_page_cb, aer); /* Wait to notify consumers until after log page is fetched. */ } else { nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id, NULL, 0); /* * Repost another asynchronous event request to replace the one * that just completed. */ nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer); } } static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr, struct nvme_async_event_request *aer) { struct nvme_request *req; aer->ctrlr = ctrlr; req = nvme_allocate_request_null(nvme_ctrlr_async_event_cb, aer); aer->req = req; /* * Disable timeout here, since asynchronous event requests should by * nature never be timed out. */ req->timeout = FALSE; req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST; nvme_ctrlr_submit_admin_request(ctrlr, req); } static void nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; struct nvme_async_event_request *aer; uint32_t i; ctrlr->async_event_config = NVME_CRIT_WARN_ST_AVAILABLE_SPARE | NVME_CRIT_WARN_ST_DEVICE_RELIABILITY | NVME_CRIT_WARN_ST_READ_ONLY | NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP; if (ctrlr->cdata.ver >= NVME_REV(1, 2)) ctrlr->async_event_config |= 0x300; status.done = 0; nvme_ctrlr_cmd_get_feature(ctrlr, NVME_FEAT_TEMPERATURE_THRESHOLD, 0, NULL, 0, nvme_completion_poll_cb, &status); while (!atomic_load_acq_int(&status.done)) pause("nvme", 1); if (nvme_completion_is_error(&status.cpl) || (status.cpl.cdw0 & 0xFFFF) == 0xFFFF || (status.cpl.cdw0 & 0xFFFF) == 0x0000) { nvme_printf(ctrlr, "temperature threshold not supported\n"); } else ctrlr->async_event_config |= NVME_CRIT_WARN_ST_TEMPERATURE; nvme_ctrlr_cmd_set_async_event_config(ctrlr, ctrlr->async_event_config, NULL, NULL); /* aerl is a zero-based value, so we need to add 1 here. */ ctrlr->num_aers = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1)); for (i = 0; i < ctrlr->num_aers; i++) { aer = &ctrlr->aer[i]; nvme_ctrlr_construct_and_submit_aer(ctrlr, aer); } } static void nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr) { ctrlr->int_coal_time = 0; TUNABLE_INT_FETCH("hw.nvme.int_coal_time", &ctrlr->int_coal_time); ctrlr->int_coal_threshold = 0; TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold", &ctrlr->int_coal_threshold); nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time, ctrlr->int_coal_threshold, NULL, NULL); } static void nvme_ctrlr_start(void *ctrlr_arg) { struct nvme_controller *ctrlr = ctrlr_arg; uint32_t old_num_io_queues; int i; /* * Only reset adminq here when we are restarting the * controller after a reset. During initialization, * we have already submitted admin commands to get * the number of I/O queues supported, so cannot reset * the adminq again here. */ if (ctrlr->is_resetting) { nvme_qpair_reset(&ctrlr->adminq); } for (i = 0; i < ctrlr->num_io_queues; i++) nvme_qpair_reset(&ctrlr->ioq[i]); nvme_admin_qpair_enable(&ctrlr->adminq); if (nvme_ctrlr_identify(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; } /* * The number of qpairs are determined during controller initialization, * including using NVMe SET_FEATURES/NUMBER_OF_QUEUES to determine the * HW limit. We call SET_FEATURES again here so that it gets called * after any reset for controllers that depend on the driver to * explicit specify how many queues it will use. This value should * never change between resets, so panic if somehow that does happen. */ if (ctrlr->is_resetting) { old_num_io_queues = ctrlr->num_io_queues; if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; } if (old_num_io_queues != ctrlr->num_io_queues) { panic("num_io_queues changed from %u to %u", old_num_io_queues, ctrlr->num_io_queues); } } if (nvme_ctrlr_create_qpairs(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; } if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; } nvme_ctrlr_configure_aer(ctrlr); nvme_ctrlr_configure_int_coalescing(ctrlr); for (i = 0; i < ctrlr->num_io_queues; i++) nvme_io_qpair_enable(&ctrlr->ioq[i]); } void nvme_ctrlr_start_config_hook(void *arg) { struct nvme_controller *ctrlr = arg; nvme_qpair_reset(&ctrlr->adminq); nvme_admin_qpair_enable(&ctrlr->adminq); if (nvme_ctrlr_set_num_qpairs(ctrlr) == 0 && nvme_ctrlr_construct_io_qpairs(ctrlr) == 0) nvme_ctrlr_start(ctrlr); else nvme_ctrlr_fail(ctrlr); nvme_sysctl_initialize_ctrlr(ctrlr); config_intrhook_disestablish(&ctrlr->config_hook); ctrlr->is_initialized = 1; nvme_notify_new_controller(ctrlr); } static void nvme_ctrlr_reset_task(void *arg, int pending) { struct nvme_controller *ctrlr = arg; int status; nvme_printf(ctrlr, "resetting controller\n"); status = nvme_ctrlr_hw_reset(ctrlr); /* * Use pause instead of DELAY, so that we yield to any nvme interrupt * handlers on this CPU that were blocked on a qpair lock. We want * all nvme interrupts completed before proceeding with restarting the * controller. * * XXX - any way to guarantee the interrupt handlers have quiesced? */ pause("nvmereset", hz / 10); if (status == 0) nvme_ctrlr_start(ctrlr); else nvme_ctrlr_fail(ctrlr); atomic_cmpset_32(&ctrlr->is_resetting, 1, 0); } /* * Poll all the queues enabled on the device for completion. */ void nvme_ctrlr_poll(struct nvme_controller *ctrlr) { int i; nvme_qpair_process_completions(&ctrlr->adminq); for (i = 0; i < ctrlr->num_io_queues; i++) if (ctrlr->ioq && ctrlr->ioq[i].cpl) nvme_qpair_process_completions(&ctrlr->ioq[i]); } /* * Poll the single-vector intertrupt case: num_io_queues will be 1 and * there's only a single vector. While we're polling, we mask further * interrupts in the controller. */ void nvme_ctrlr_intx_handler(void *arg) { struct nvme_controller *ctrlr = arg; nvme_mmio_write_4(ctrlr, intms, 1); nvme_ctrlr_poll(ctrlr); nvme_mmio_write_4(ctrlr, intmc, 1); } static int nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr) { ctrlr->msix_enabled = 0; ctrlr->num_io_queues = 1; ctrlr->num_cpus_per_ioq = mp_ncpus; ctrlr->rid = 0; ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ, &ctrlr->rid, RF_SHAREABLE | RF_ACTIVE); if (ctrlr->res == NULL) { nvme_printf(ctrlr, "unable to allocate shared IRQ\n"); return (ENOMEM); } bus_setup_intr(ctrlr->dev, ctrlr->res, INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_ctrlr_intx_handler, ctrlr, &ctrlr->tag); if (ctrlr->tag == NULL) { nvme_printf(ctrlr, "unable to setup intx handler\n"); return (ENOMEM); } return (0); } static void nvme_pt_done(void *arg, const struct nvme_completion *cpl) { struct nvme_pt_command *pt = arg; struct mtx *mtx = pt->driver_lock; uint16_t status; bzero(&pt->cpl, sizeof(pt->cpl)); pt->cpl.cdw0 = cpl->cdw0; status = cpl->status; status &= ~NVME_STATUS_P_MASK; pt->cpl.status = status; mtx_lock(mtx); pt->driver_lock = NULL; wakeup(pt); mtx_unlock(mtx); } int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, struct nvme_pt_command *pt, uint32_t nsid, int is_user_buffer, int is_admin_cmd) { struct nvme_request *req; struct mtx *mtx; struct buf *buf = NULL; int ret = 0; vm_offset_t addr, end; if (pt->len > 0) { /* * vmapbuf calls vm_fault_quick_hold_pages which only maps full * pages. Ensure this request has fewer than MAXPHYS bytes when * extended to full pages. */ addr = (vm_offset_t)pt->buf; end = round_page(addr + pt->len); addr = trunc_page(addr); if (end - addr > MAXPHYS) return EIO; if (pt->len > ctrlr->max_xfer_size) { nvme_printf(ctrlr, "pt->len (%d) " "exceeds max_xfer_size (%d)\n", pt->len, ctrlr->max_xfer_size); return EIO; } if (is_user_buffer) { /* * Ensure the user buffer is wired for the duration of * this passthrough command. */ PHOLD(curproc); buf = uma_zalloc(pbuf_zone, M_WAITOK); buf->b_data = pt->buf; buf->b_bufsize = pt->len; buf->b_iocmd = pt->is_read ? BIO_READ : BIO_WRITE; if (vmapbuf(buf, 1) < 0) { ret = EFAULT; goto err; } req = nvme_allocate_request_vaddr(buf->b_data, pt->len, nvme_pt_done, pt); } else req = nvme_allocate_request_vaddr(pt->buf, pt->len, nvme_pt_done, pt); } else req = nvme_allocate_request_null(nvme_pt_done, pt); /* Assume userspace already converted to little-endian */ req->cmd.opc = pt->cmd.opc; req->cmd.fuse = pt->cmd.fuse; req->cmd.rsvd2 = pt->cmd.rsvd2; req->cmd.rsvd3 = pt->cmd.rsvd3; req->cmd.cdw10 = pt->cmd.cdw10; req->cmd.cdw11 = pt->cmd.cdw11; req->cmd.cdw12 = pt->cmd.cdw12; req->cmd.cdw13 = pt->cmd.cdw13; req->cmd.cdw14 = pt->cmd.cdw14; req->cmd.cdw15 = pt->cmd.cdw15; req->cmd.nsid = htole32(nsid); mtx = mtx_pool_find(mtxpool_sleep, pt); pt->driver_lock = mtx; if (is_admin_cmd) nvme_ctrlr_submit_admin_request(ctrlr, req); else nvme_ctrlr_submit_io_request(ctrlr, req); mtx_lock(mtx); while (pt->driver_lock != NULL) mtx_sleep(pt, mtx, PRIBIO, "nvme_pt", 0); mtx_unlock(mtx); err: if (buf != NULL) { uma_zfree(pbuf_zone, buf); PRELE(curproc); } return (ret); } static int nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, struct thread *td) { struct nvme_controller *ctrlr; struct nvme_pt_command *pt; ctrlr = cdev->si_drv1; switch (cmd) { case NVME_RESET_CONTROLLER: nvme_ctrlr_reset(ctrlr); break; case NVME_PASSTHROUGH_CMD: pt = (struct nvme_pt_command *)arg; return (nvme_ctrlr_passthrough_cmd(ctrlr, pt, le32toh(pt->cmd.nsid), 1 /* is_user_buffer */, 1 /* is_admin_cmd */)); default: return (ENOTTY); } return (0); } static struct cdevsw nvme_ctrlr_cdevsw = { .d_version = D_VERSION, .d_flags = 0, .d_ioctl = nvme_ctrlr_ioctl }; static void nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr) { device_t dev; int per_cpu_io_queues; int min_cpus_per_ioq; int num_vectors_requested, num_vectors_allocated; int num_vectors_available; dev = ctrlr->dev; min_cpus_per_ioq = 1; TUNABLE_INT_FETCH("hw.nvme.min_cpus_per_ioq", &min_cpus_per_ioq); if (min_cpus_per_ioq < 1) { min_cpus_per_ioq = 1; } else if (min_cpus_per_ioq > mp_ncpus) { min_cpus_per_ioq = mp_ncpus; } per_cpu_io_queues = 1; TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &per_cpu_io_queues); if (per_cpu_io_queues == 0) { min_cpus_per_ioq = mp_ncpus; } ctrlr->force_intx = 0; TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx); /* * FreeBSD currently cannot allocate more than about 190 vectors at * boot, meaning that systems with high core count and many devices * requesting per-CPU interrupt vectors will not get their full * allotment. So first, try to allocate as many as we may need to * understand what is available, then immediately release them. * Then figure out how many of those we will actually use, based on * assigning an equal number of cores to each I/O queue. */ /* One vector for per core I/O queue, plus one vector for admin queue. */ num_vectors_available = min(pci_msix_count(dev), mp_ncpus + 1); if (pci_alloc_msix(dev, &num_vectors_available) != 0) { num_vectors_available = 0; } pci_release_msi(dev); if (ctrlr->force_intx || num_vectors_available < 2) { nvme_ctrlr_configure_intx(ctrlr); return; } /* * Do not use all vectors for I/O queues - one must be saved for the * admin queue. */ ctrlr->num_cpus_per_ioq = max(min_cpus_per_ioq, howmany(mp_ncpus, num_vectors_available - 1)); ctrlr->num_io_queues = howmany(mp_ncpus, ctrlr->num_cpus_per_ioq); num_vectors_requested = ctrlr->num_io_queues + 1; num_vectors_allocated = num_vectors_requested; /* * Now just allocate the number of vectors we need. This should * succeed, since we previously called pci_alloc_msix() * successfully returning at least this many vectors, but just to * be safe, if something goes wrong just revert to INTx. */ if (pci_alloc_msix(dev, &num_vectors_allocated) != 0) { nvme_ctrlr_configure_intx(ctrlr); return; } if (num_vectors_allocated < num_vectors_requested) { pci_release_msi(dev); nvme_ctrlr_configure_intx(ctrlr); return; } ctrlr->msix_enabled = 1; } int nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) { struct make_dev_args md_args; uint32_t cap_lo; uint32_t cap_hi; - uint8_t to; + uint32_t to; uint8_t dstrd; uint8_t mpsmin; int status, timeout_period; ctrlr->dev = dev; mtx_init(&ctrlr->lock, "nvme ctrlr lock", NULL, MTX_DEF); status = nvme_ctrlr_allocate_bar(ctrlr); if (status != 0) return (status); /* * Software emulators may set the doorbell stride to something * other than zero, but this driver is not set up to handle that. */ cap_hi = nvme_mmio_read_4(ctrlr, cap_hi); dstrd = NVME_CAP_HI_DSTRD(cap_hi); if (dstrd != 0) return (ENXIO); mpsmin = NVME_CAP_HI_MPSMIN(cap_hi); ctrlr->min_page_size = 1 << (12 + mpsmin); /* Get ready timeout value from controller, in units of 500ms. */ cap_lo = nvme_mmio_read_4(ctrlr, cap_lo); to = NVME_CAP_LO_TO(cap_lo) + 1; ctrlr->ready_timeout_in_ms = to * 500; timeout_period = NVME_DEFAULT_TIMEOUT_PERIOD; TUNABLE_INT_FETCH("hw.nvme.timeout_period", &timeout_period); timeout_period = min(timeout_period, NVME_MAX_TIMEOUT_PERIOD); timeout_period = max(timeout_period, NVME_MIN_TIMEOUT_PERIOD); ctrlr->timeout_period = timeout_period; nvme_retry_count = NVME_DEFAULT_RETRY_COUNT; TUNABLE_INT_FETCH("hw.nvme.retry_count", &nvme_retry_count); ctrlr->enable_aborts = 0; TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts); nvme_ctrlr_setup_interrupts(ctrlr); ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE; if (nvme_ctrlr_construct_admin_qpair(ctrlr) != 0) return (ENXIO); ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK, taskqueue_thread_enqueue, &ctrlr->taskqueue); taskqueue_start_threads(&ctrlr->taskqueue, 1, PI_DISK, "nvme taskq"); ctrlr->is_resetting = 0; ctrlr->is_initialized = 0; ctrlr->notification_sent = 0; TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr); TASK_INIT(&ctrlr->fail_req_task, 0, nvme_ctrlr_fail_req_task, ctrlr); STAILQ_INIT(&ctrlr->fail_req); ctrlr->is_failed = FALSE; make_dev_args_init(&md_args); md_args.mda_devsw = &nvme_ctrlr_cdevsw; md_args.mda_uid = UID_ROOT; md_args.mda_gid = GID_WHEEL; md_args.mda_mode = 0600; md_args.mda_unit = device_get_unit(dev); md_args.mda_si_drv1 = (void *)ctrlr; status = make_dev_s(&md_args, &ctrlr->cdev, "nvme%d", device_get_unit(dev)); if (status != 0) return (ENXIO); return (0); } void nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev) { int i; if (ctrlr->resource == NULL) goto nores; nvme_notify_fail_consumers(ctrlr); for (i = 0; i < NVME_MAX_NAMESPACES; i++) nvme_ns_destruct(&ctrlr->ns[i]); if (ctrlr->cdev) destroy_dev(ctrlr->cdev); nvme_ctrlr_destroy_qpairs(ctrlr); for (i = 0; i < ctrlr->num_io_queues; i++) { nvme_io_qpair_destroy(&ctrlr->ioq[i]); } free(ctrlr->ioq, M_NVME); nvme_admin_qpair_destroy(&ctrlr->adminq); /* * Notify the controller of a shutdown, even though this is due to * a driver unload, not a system shutdown (this path is not invoked * during shutdown). This ensures the controller receives a * shutdown notification in case the system is shutdown before * reloading the driver. */ nvme_ctrlr_shutdown(ctrlr); nvme_ctrlr_disable(ctrlr); if (ctrlr->taskqueue) taskqueue_free(ctrlr->taskqueue); if (ctrlr->tag) bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag); if (ctrlr->res) bus_release_resource(ctrlr->dev, SYS_RES_IRQ, rman_get_rid(ctrlr->res), ctrlr->res); if (ctrlr->msix_enabled) pci_release_msi(dev); if (ctrlr->bar4_resource != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, ctrlr->bar4_resource_id, ctrlr->bar4_resource); } bus_release_resource(dev, SYS_RES_MEMORY, ctrlr->resource_id, ctrlr->resource); nores: mtx_destroy(&ctrlr->lock); } void nvme_ctrlr_shutdown(struct nvme_controller *ctrlr) { uint32_t cc; uint32_t csts; int ticks = 0; cc = nvme_mmio_read_4(ctrlr, cc); cc &= ~(NVME_CC_REG_SHN_MASK << NVME_CC_REG_SHN_SHIFT); cc |= NVME_SHN_NORMAL << NVME_CC_REG_SHN_SHIFT; nvme_mmio_write_4(ctrlr, cc, cc); csts = nvme_mmio_read_4(ctrlr, csts); while ((NVME_CSTS_GET_SHST(csts) != NVME_SHST_COMPLETE) && (ticks++ < 5*hz)) { pause("nvme shn", 1); csts = nvme_mmio_read_4(ctrlr, csts); } if (NVME_CSTS_GET_SHST(csts) != NVME_SHST_COMPLETE) nvme_printf(ctrlr, "did not complete shutdown within 5 seconds " "of notification\n"); } void nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr, struct nvme_request *req) { nvme_qpair_submit_request(&ctrlr->adminq, req); } void nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr, struct nvme_request *req) { struct nvme_qpair *qpair; qpair = &ctrlr->ioq[curcpu / ctrlr->num_cpus_per_ioq]; nvme_qpair_submit_request(qpair, req); } device_t nvme_ctrlr_get_device(struct nvme_controller *ctrlr) { return (ctrlr->dev); } const struct nvme_controller_data * nvme_ctrlr_get_data(struct nvme_controller *ctrlr) { return (&ctrlr->cdata); } Index: projects/nfsv42/sys/dev/virtio/mmio/virtio_mmio.c =================================================================== --- projects/nfsv42/sys/dev/virtio/mmio/virtio_mmio.c (revision 350367) +++ projects/nfsv42/sys/dev/virtio/mmio/virtio_mmio.c (revision 350368) @@ -1,788 +1,788 @@ /*- * Copyright (c) 2014 Ruslan Bukin * Copyright (c) 2014 The FreeBSD Foundation * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237) * ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by Andrew Turner * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * VirtIO MMIO interface. * This driver is heavily based on VirtIO PCI interface driver. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include "virtio_mmio_if.h" #include "virtio_bus_if.h" #include "virtio_if.h" #define PAGE_SHIFT 12 struct vtmmio_virtqueue { struct virtqueue *vtv_vq; int vtv_no_intr; }; static int vtmmio_detach(device_t); static int vtmmio_suspend(device_t); static int vtmmio_resume(device_t); static int vtmmio_shutdown(device_t); static void vtmmio_driver_added(device_t, driver_t *); static void vtmmio_child_detached(device_t, device_t); static int vtmmio_read_ivar(device_t, device_t, int, uintptr_t *); static int vtmmio_write_ivar(device_t, device_t, int, uintptr_t); static uint64_t vtmmio_negotiate_features(device_t, uint64_t); static int vtmmio_with_feature(device_t, uint64_t); static int vtmmio_alloc_virtqueues(device_t, int, int, struct vq_alloc_info *); static int vtmmio_setup_intr(device_t, enum intr_type); static void vtmmio_stop(device_t); static void vtmmio_poll(device_t); static int vtmmio_reinit(device_t, uint64_t); static void vtmmio_reinit_complete(device_t); static void vtmmio_notify_virtqueue(device_t, uint16_t); static uint8_t vtmmio_get_status(device_t); static void vtmmio_set_status(device_t, uint8_t); static void vtmmio_read_dev_config(device_t, bus_size_t, void *, int); static void vtmmio_write_dev_config(device_t, bus_size_t, void *, int); static void vtmmio_describe_features(struct vtmmio_softc *, const char *, uint64_t); static void vtmmio_probe_and_attach_child(struct vtmmio_softc *); static int vtmmio_reinit_virtqueue(struct vtmmio_softc *, int); static void vtmmio_free_interrupts(struct vtmmio_softc *); static void vtmmio_free_virtqueues(struct vtmmio_softc *); static void vtmmio_release_child_resources(struct vtmmio_softc *); static void vtmmio_reset(struct vtmmio_softc *); static void vtmmio_select_virtqueue(struct vtmmio_softc *, int); static void vtmmio_vq_intr(void *); /* * I/O port read/write wrappers. */ #define vtmmio_write_config_1(sc, o, v) \ do { \ if (sc->platform != NULL) \ VIRTIO_MMIO_PREWRITE(sc->platform, (o), (v)); \ bus_write_1((sc)->res[0], (o), (v)); \ if (sc->platform != NULL) \ VIRTIO_MMIO_NOTE(sc->platform, (o), (v)); \ } while (0) #define vtmmio_write_config_2(sc, o, v) \ do { \ if (sc->platform != NULL) \ VIRTIO_MMIO_PREWRITE(sc->platform, (o), (v)); \ bus_write_2((sc)->res[0], (o), (v)); \ if (sc->platform != NULL) \ VIRTIO_MMIO_NOTE(sc->platform, (o), (v)); \ } while (0) #define vtmmio_write_config_4(sc, o, v) \ do { \ if (sc->platform != NULL) \ VIRTIO_MMIO_PREWRITE(sc->platform, (o), (v)); \ bus_write_4((sc)->res[0], (o), (v)); \ if (sc->platform != NULL) \ VIRTIO_MMIO_NOTE(sc->platform, (o), (v)); \ } while (0) #define vtmmio_read_config_1(sc, o) \ bus_read_1((sc)->res[0], (o)) #define vtmmio_read_config_2(sc, o) \ bus_read_2((sc)->res[0], (o)) #define vtmmio_read_config_4(sc, o) \ bus_read_4((sc)->res[0], (o)) static device_method_t vtmmio_methods[] = { /* Device interface. */ DEVMETHOD(device_attach, vtmmio_attach), DEVMETHOD(device_detach, vtmmio_detach), DEVMETHOD(device_suspend, vtmmio_suspend), DEVMETHOD(device_resume, vtmmio_resume), DEVMETHOD(device_shutdown, vtmmio_shutdown), /* Bus interface. */ DEVMETHOD(bus_driver_added, vtmmio_driver_added), DEVMETHOD(bus_child_detached, vtmmio_child_detached), DEVMETHOD(bus_child_pnpinfo_str, virtio_child_pnpinfo_str), DEVMETHOD(bus_read_ivar, vtmmio_read_ivar), DEVMETHOD(bus_write_ivar, vtmmio_write_ivar), /* VirtIO bus interface. */ DEVMETHOD(virtio_bus_negotiate_features, vtmmio_negotiate_features), DEVMETHOD(virtio_bus_with_feature, vtmmio_with_feature), DEVMETHOD(virtio_bus_alloc_virtqueues, vtmmio_alloc_virtqueues), DEVMETHOD(virtio_bus_setup_intr, vtmmio_setup_intr), DEVMETHOD(virtio_bus_stop, vtmmio_stop), DEVMETHOD(virtio_bus_poll, vtmmio_poll), DEVMETHOD(virtio_bus_reinit, vtmmio_reinit), DEVMETHOD(virtio_bus_reinit_complete, vtmmio_reinit_complete), DEVMETHOD(virtio_bus_notify_vq, vtmmio_notify_virtqueue), DEVMETHOD(virtio_bus_read_device_config, vtmmio_read_dev_config), DEVMETHOD(virtio_bus_write_device_config, vtmmio_write_dev_config), DEVMETHOD_END }; DEFINE_CLASS_0(virtio_mmio, vtmmio_driver, vtmmio_methods, sizeof(struct vtmmio_softc)); MODULE_VERSION(virtio_mmio, 1); static int vtmmio_setup_intr(device_t dev, enum intr_type type) { struct vtmmio_softc *sc; int rid; int err; sc = device_get_softc(dev); if (sc->platform != NULL) { err = VIRTIO_MMIO_SETUP_INTR(sc->platform, sc->dev, vtmmio_vq_intr, sc); if (err == 0) { /* Okay we have backend-specific interrupts */ return (0); } } rid = 0; sc->res[1] = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (!sc->res[1]) { device_printf(dev, "Can't allocate interrupt\n"); return (ENXIO); } if (bus_setup_intr(dev, sc->res[1], INTR_TYPE_MISC | INTR_MPSAFE, NULL, vtmmio_vq_intr, sc, &sc->ih)) { device_printf(dev, "Can't setup the interrupt\n"); return (ENXIO); } return (0); } int vtmmio_attach(device_t dev) { struct vtmmio_softc *sc; device_t child; int rid; sc = device_get_softc(dev); sc->dev = dev; rid = 0; sc->res[0] = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (!sc->res[0]) { device_printf(dev, "Cannot allocate memory window.\n"); return (ENXIO); } vtmmio_reset(sc); /* Tell the host we've noticed this device. */ vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_ACK); if ((child = device_add_child(dev, NULL, -1)) == NULL) { device_printf(dev, "Cannot create child device.\n"); vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED); vtmmio_detach(dev); return (ENOMEM); } sc->vtmmio_child_dev = child; vtmmio_probe_and_attach_child(sc); return (0); } static int vtmmio_detach(device_t dev) { struct vtmmio_softc *sc; device_t child; int error; sc = device_get_softc(dev); if ((child = sc->vtmmio_child_dev) != NULL) { error = device_delete_child(dev, child); if (error) return (error); sc->vtmmio_child_dev = NULL; } vtmmio_reset(sc); if (sc->res[0] != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, 0, sc->res[0]); sc->res[0] = NULL; } return (0); } static int vtmmio_suspend(device_t dev) { return (bus_generic_suspend(dev)); } static int vtmmio_resume(device_t dev) { return (bus_generic_resume(dev)); } static int vtmmio_shutdown(device_t dev) { (void) bus_generic_shutdown(dev); /* Forcibly stop the host device. */ vtmmio_stop(dev); return (0); } static void vtmmio_driver_added(device_t dev, driver_t *driver) { struct vtmmio_softc *sc; sc = device_get_softc(dev); vtmmio_probe_and_attach_child(sc); } static void vtmmio_child_detached(device_t dev, device_t child) { struct vtmmio_softc *sc; sc = device_get_softc(dev); vtmmio_reset(sc); vtmmio_release_child_resources(sc); } static int vtmmio_read_ivar(device_t dev, device_t child, int index, uintptr_t *result) { struct vtmmio_softc *sc; sc = device_get_softc(dev); if (sc->vtmmio_child_dev != child) return (ENOENT); switch (index) { case VIRTIO_IVAR_DEVTYPE: case VIRTIO_IVAR_SUBDEVICE: *result = vtmmio_read_config_4(sc, VIRTIO_MMIO_DEVICE_ID); break; case VIRTIO_IVAR_VENDOR: *result = vtmmio_read_config_4(sc, VIRTIO_MMIO_VENDOR_ID); break; case VIRTIO_IVAR_SUBVENDOR: case VIRTIO_IVAR_DEVICE: /* * Dummy value for fields not present in this bus. Used by * bus-agnostic virtio_child_pnpinfo_str. */ *result = 0; break; default: return (ENOENT); } return (0); } static int vtmmio_write_ivar(device_t dev, device_t child, int index, uintptr_t value) { struct vtmmio_softc *sc; sc = device_get_softc(dev); if (sc->vtmmio_child_dev != child) return (ENOENT); switch (index) { case VIRTIO_IVAR_FEATURE_DESC: sc->vtmmio_child_feat_desc = (void *) value; break; default: return (ENOENT); } return (0); } static uint64_t vtmmio_negotiate_features(device_t dev, uint64_t child_features) { struct vtmmio_softc *sc; uint64_t host_features, features; sc = device_get_softc(dev); host_features = vtmmio_read_config_4(sc, VIRTIO_MMIO_HOST_FEATURES); vtmmio_describe_features(sc, "host", host_features); /* * Limit negotiated features to what the driver, virtqueue, and * host all support. */ features = host_features & child_features; features = virtqueue_filter_features(features); sc->vtmmio_features = features; vtmmio_describe_features(sc, "negotiated", features); vtmmio_write_config_4(sc, VIRTIO_MMIO_GUEST_FEATURES, features); return (features); } static int vtmmio_with_feature(device_t dev, uint64_t feature) { struct vtmmio_softc *sc; sc = device_get_softc(dev); return ((sc->vtmmio_features & feature) != 0); } static int vtmmio_alloc_virtqueues(device_t dev, int flags, int nvqs, struct vq_alloc_info *vq_info) { struct vtmmio_virtqueue *vqx; struct vq_alloc_info *info; struct vtmmio_softc *sc; struct virtqueue *vq; uint32_t size; int idx, error; sc = device_get_softc(dev); if (sc->vtmmio_nvqs != 0) return (EALREADY); if (nvqs <= 0) return (EINVAL); sc->vtmmio_vqs = malloc(nvqs * sizeof(struct vtmmio_virtqueue), M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vtmmio_vqs == NULL) return (ENOMEM); vtmmio_write_config_4(sc, VIRTIO_MMIO_GUEST_PAGE_SIZE, (1 << PAGE_SHIFT)); for (idx = 0; idx < nvqs; idx++) { vqx = &sc->vtmmio_vqs[idx]; info = &vq_info[idx]; vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_SEL, idx); vtmmio_select_virtqueue(sc, idx); size = vtmmio_read_config_4(sc, VIRTIO_MMIO_QUEUE_NUM_MAX); error = virtqueue_alloc(dev, idx, size, - VIRTIO_MMIO_VRING_ALIGN, 0xFFFFFFFFUL, info, &vq); + VIRTIO_MMIO_VRING_ALIGN, ~(vm_paddr_t)0, info, &vq); if (error) { device_printf(dev, "cannot allocate virtqueue %d: %d\n", idx, error); break; } vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_NUM, size); vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_ALIGN, VIRTIO_MMIO_VRING_ALIGN); #if 0 device_printf(dev, "virtqueue paddr 0x%08lx\n", (uint64_t)virtqueue_paddr(vq)); #endif vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_PFN, virtqueue_paddr(vq) >> PAGE_SHIFT); vqx->vtv_vq = *info->vqai_vq = vq; vqx->vtv_no_intr = info->vqai_intr == NULL; sc->vtmmio_nvqs++; } if (error) vtmmio_free_virtqueues(sc); return (error); } static void vtmmio_stop(device_t dev) { vtmmio_reset(device_get_softc(dev)); } static void vtmmio_poll(device_t dev) { struct vtmmio_softc *sc; sc = device_get_softc(dev); if (sc->platform != NULL) VIRTIO_MMIO_POLL(sc->platform); } static int vtmmio_reinit(device_t dev, uint64_t features) { struct vtmmio_softc *sc; int idx, error; sc = device_get_softc(dev); if (vtmmio_get_status(dev) != VIRTIO_CONFIG_STATUS_RESET) vtmmio_stop(dev); /* * Quickly drive the status through ACK and DRIVER. The device * does not become usable again until vtmmio_reinit_complete(). */ vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_ACK); vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER); vtmmio_negotiate_features(dev, features); vtmmio_write_config_4(sc, VIRTIO_MMIO_GUEST_PAGE_SIZE, (1 << PAGE_SHIFT)); for (idx = 0; idx < sc->vtmmio_nvqs; idx++) { error = vtmmio_reinit_virtqueue(sc, idx); if (error) return (error); } return (0); } static void vtmmio_reinit_complete(device_t dev) { vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER_OK); } static void vtmmio_notify_virtqueue(device_t dev, uint16_t queue) { struct vtmmio_softc *sc; sc = device_get_softc(dev); vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_NOTIFY, queue); } static uint8_t vtmmio_get_status(device_t dev) { struct vtmmio_softc *sc; sc = device_get_softc(dev); return (vtmmio_read_config_4(sc, VIRTIO_MMIO_STATUS)); } static void vtmmio_set_status(device_t dev, uint8_t status) { struct vtmmio_softc *sc; sc = device_get_softc(dev); if (status != VIRTIO_CONFIG_STATUS_RESET) status |= vtmmio_get_status(dev); vtmmio_write_config_4(sc, VIRTIO_MMIO_STATUS, status); } static void vtmmio_read_dev_config(device_t dev, bus_size_t offset, void *dst, int length) { struct vtmmio_softc *sc; bus_size_t off; uint8_t *d; int size; sc = device_get_softc(dev); off = VIRTIO_MMIO_CONFIG + offset; for (d = dst; length > 0; d += size, off += size, length -= size) { #ifdef ALLOW_WORD_ALIGNED_ACCESS if (length >= 4) { size = 4; *(uint32_t *)d = vtmmio_read_config_4(sc, off); } else if (length >= 2) { size = 2; *(uint16_t *)d = vtmmio_read_config_2(sc, off); } else #endif { size = 1; *d = vtmmio_read_config_1(sc, off); } } } static void vtmmio_write_dev_config(device_t dev, bus_size_t offset, void *src, int length) { struct vtmmio_softc *sc; bus_size_t off; uint8_t *s; int size; sc = device_get_softc(dev); off = VIRTIO_MMIO_CONFIG + offset; for (s = src; length > 0; s += size, off += size, length -= size) { #ifdef ALLOW_WORD_ALIGNED_ACCESS if (length >= 4) { size = 4; vtmmio_write_config_4(sc, off, *(uint32_t *)s); } else if (length >= 2) { size = 2; vtmmio_write_config_2(sc, off, *(uint16_t *)s); } else #endif { size = 1; vtmmio_write_config_1(sc, off, *s); } } } static void vtmmio_describe_features(struct vtmmio_softc *sc, const char *msg, uint64_t features) { device_t dev, child; dev = sc->dev; child = sc->vtmmio_child_dev; if (device_is_attached(child) || bootverbose == 0) return; virtio_describe(dev, msg, features, sc->vtmmio_child_feat_desc); } static void vtmmio_probe_and_attach_child(struct vtmmio_softc *sc) { device_t dev, child; dev = sc->dev; child = sc->vtmmio_child_dev; if (child == NULL) return; if (device_get_state(child) != DS_NOTPRESENT) { return; } if (device_probe(child) != 0) { return; } vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER); if (device_attach(child) != 0) { vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED); vtmmio_reset(sc); vtmmio_release_child_resources(sc); /* Reset status for future attempt. */ vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_ACK); } else { vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER_OK); VIRTIO_ATTACH_COMPLETED(child); } } static int vtmmio_reinit_virtqueue(struct vtmmio_softc *sc, int idx) { struct vtmmio_virtqueue *vqx; struct virtqueue *vq; int error; uint16_t size; vqx = &sc->vtmmio_vqs[idx]; vq = vqx->vtv_vq; KASSERT(vq != NULL, ("%s: vq %d not allocated", __func__, idx)); vtmmio_select_virtqueue(sc, idx); size = vtmmio_read_config_4(sc, VIRTIO_MMIO_QUEUE_NUM_MAX); error = virtqueue_reinit(vq, size); if (error) return (error); vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_NUM, size); vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_ALIGN, VIRTIO_MMIO_VRING_ALIGN); #if 0 device_printf(sc->dev, "virtqueue paddr 0x%08lx\n", (uint64_t)virtqueue_paddr(vq)); #endif vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_PFN, virtqueue_paddr(vq) >> PAGE_SHIFT); return (0); } static void vtmmio_free_interrupts(struct vtmmio_softc *sc) { if (sc->ih != NULL) bus_teardown_intr(sc->dev, sc->res[1], sc->ih); if (sc->res[1] != NULL) bus_release_resource(sc->dev, SYS_RES_IRQ, 0, sc->res[1]); } static void vtmmio_free_virtqueues(struct vtmmio_softc *sc) { struct vtmmio_virtqueue *vqx; int idx; for (idx = 0; idx < sc->vtmmio_nvqs; idx++) { vqx = &sc->vtmmio_vqs[idx]; vtmmio_select_virtqueue(sc, idx); vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_PFN, 0); virtqueue_free(vqx->vtv_vq); vqx->vtv_vq = NULL; } free(sc->vtmmio_vqs, M_DEVBUF); sc->vtmmio_vqs = NULL; sc->vtmmio_nvqs = 0; } static void vtmmio_release_child_resources(struct vtmmio_softc *sc) { vtmmio_free_interrupts(sc); vtmmio_free_virtqueues(sc); } static void vtmmio_reset(struct vtmmio_softc *sc) { /* * Setting the status to RESET sets the host device to * the original, uninitialized state. */ vtmmio_set_status(sc->dev, VIRTIO_CONFIG_STATUS_RESET); } static void vtmmio_select_virtqueue(struct vtmmio_softc *sc, int idx) { vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_SEL, idx); } static void vtmmio_vq_intr(void *arg) { struct vtmmio_virtqueue *vqx; struct vtmmio_softc *sc; struct virtqueue *vq; uint32_t status; int idx; sc = arg; status = vtmmio_read_config_4(sc, VIRTIO_MMIO_INTERRUPT_STATUS); vtmmio_write_config_4(sc, VIRTIO_MMIO_INTERRUPT_ACK, status); /* The config changed */ if (status & VIRTIO_MMIO_INT_CONFIG) if (sc->vtmmio_child_dev != NULL) VIRTIO_CONFIG_CHANGE(sc->vtmmio_child_dev); /* Notify all virtqueues. */ if (status & VIRTIO_MMIO_INT_VRING) { for (idx = 0; idx < sc->vtmmio_nvqs; idx++) { vqx = &sc->vtmmio_vqs[idx]; if (vqx->vtv_no_intr == 0) { vq = vqx->vtv_vq; virtqueue_intr(vq); } } } } Index: projects/nfsv42/sys/dev/virtio/pci/virtio_pci.c =================================================================== --- projects/nfsv42/sys/dev/virtio/pci/virtio_pci.c (revision 350367) +++ projects/nfsv42/sys/dev/virtio/pci/virtio_pci.c (revision 350368) @@ -1,1333 +1,1333 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011, Bryan Venteicher * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* Driver for the VirtIO PCI interface. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "virtio_bus_if.h" #include "virtio_if.h" struct vtpci_interrupt { struct resource *vti_irq; int vti_rid; void *vti_handler; }; struct vtpci_virtqueue { struct virtqueue *vtv_vq; int vtv_no_intr; }; struct vtpci_softc { device_t vtpci_dev; struct resource *vtpci_res; struct resource *vtpci_msix_res; uint64_t vtpci_features; uint32_t vtpci_flags; #define VTPCI_FLAG_NO_MSI 0x0001 #define VTPCI_FLAG_NO_MSIX 0x0002 #define VTPCI_FLAG_LEGACY 0x1000 #define VTPCI_FLAG_MSI 0x2000 #define VTPCI_FLAG_MSIX 0x4000 #define VTPCI_FLAG_SHARED_MSIX 0x8000 #define VTPCI_FLAG_ITYPE_MASK 0xF000 /* This "bus" will only ever have one child. */ device_t vtpci_child_dev; struct virtio_feature_desc *vtpci_child_feat_desc; int vtpci_nvqs; struct vtpci_virtqueue *vtpci_vqs; /* * Ideally, each virtqueue that the driver provides a callback for will * receive its own MSIX vector. If there are not sufficient vectors * available, then attempt to have all the VQs share one vector. For * MSIX, the configuration changed notifications must be on their own * vector. * * If MSIX is not available, we will attempt to have the whole device * share one MSI vector, and then, finally, one legacy interrupt. */ struct vtpci_interrupt vtpci_device_interrupt; struct vtpci_interrupt *vtpci_msix_vq_interrupts; int vtpci_nmsix_resources; }; static int vtpci_probe(device_t); static int vtpci_attach(device_t); static int vtpci_detach(device_t); static int vtpci_suspend(device_t); static int vtpci_resume(device_t); static int vtpci_shutdown(device_t); static void vtpci_driver_added(device_t, driver_t *); static void vtpci_child_detached(device_t, device_t); static int vtpci_read_ivar(device_t, device_t, int, uintptr_t *); static int vtpci_write_ivar(device_t, device_t, int, uintptr_t); static uint64_t vtpci_negotiate_features(device_t, uint64_t); static int vtpci_with_feature(device_t, uint64_t); static int vtpci_alloc_virtqueues(device_t, int, int, struct vq_alloc_info *); static int vtpci_setup_intr(device_t, enum intr_type); static void vtpci_stop(device_t); static int vtpci_reinit(device_t, uint64_t); static void vtpci_reinit_complete(device_t); static void vtpci_notify_virtqueue(device_t, uint16_t); static uint8_t vtpci_get_status(device_t); static void vtpci_set_status(device_t, uint8_t); static void vtpci_read_dev_config(device_t, bus_size_t, void *, int); static void vtpci_write_dev_config(device_t, bus_size_t, void *, int); static void vtpci_describe_features(struct vtpci_softc *, const char *, uint64_t); static void vtpci_probe_and_attach_child(struct vtpci_softc *); static int vtpci_alloc_msix(struct vtpci_softc *, int); static int vtpci_alloc_msi(struct vtpci_softc *); static int vtpci_alloc_intr_msix_pervq(struct vtpci_softc *); static int vtpci_alloc_intr_msix_shared(struct vtpci_softc *); static int vtpci_alloc_intr_msi(struct vtpci_softc *); static int vtpci_alloc_intr_legacy(struct vtpci_softc *); static int vtpci_alloc_interrupt(struct vtpci_softc *, int, int, struct vtpci_interrupt *); static int vtpci_alloc_intr_resources(struct vtpci_softc *); static int vtpci_setup_legacy_interrupt(struct vtpci_softc *, enum intr_type); static int vtpci_setup_pervq_msix_interrupts(struct vtpci_softc *, enum intr_type); static int vtpci_setup_msix_interrupts(struct vtpci_softc *, enum intr_type); static int vtpci_setup_interrupts(struct vtpci_softc *, enum intr_type); static int vtpci_register_msix_vector(struct vtpci_softc *, int, struct vtpci_interrupt *); static int vtpci_set_host_msix_vectors(struct vtpci_softc *); static int vtpci_reinit_virtqueue(struct vtpci_softc *, int); static void vtpci_free_interrupt(struct vtpci_softc *, struct vtpci_interrupt *); static void vtpci_free_interrupts(struct vtpci_softc *); static void vtpci_free_virtqueues(struct vtpci_softc *); static void vtpci_release_child_resources(struct vtpci_softc *); static void vtpci_cleanup_setup_intr_attempt(struct vtpci_softc *); static void vtpci_reset(struct vtpci_softc *); static void vtpci_select_virtqueue(struct vtpci_softc *, int); static void vtpci_legacy_intr(void *); static int vtpci_vq_shared_intr_filter(void *); static void vtpci_vq_shared_intr(void *); static int vtpci_vq_intr_filter(void *); static void vtpci_vq_intr(void *); static void vtpci_config_intr(void *); #define vtpci_setup_msi_interrupt vtpci_setup_legacy_interrupt #define VIRTIO_PCI_CONFIG(_sc) \ VIRTIO_PCI_CONFIG_OFF((((_sc)->vtpci_flags & VTPCI_FLAG_MSIX)) != 0) /* * I/O port read/write wrappers. */ #define vtpci_read_config_1(sc, o) bus_read_1((sc)->vtpci_res, (o)) #define vtpci_read_config_2(sc, o) bus_read_2((sc)->vtpci_res, (o)) #define vtpci_read_config_4(sc, o) bus_read_4((sc)->vtpci_res, (o)) #define vtpci_write_config_1(sc, o, v) bus_write_1((sc)->vtpci_res, (o), (v)) #define vtpci_write_config_2(sc, o, v) bus_write_2((sc)->vtpci_res, (o), (v)) #define vtpci_write_config_4(sc, o, v) bus_write_4((sc)->vtpci_res, (o), (v)) /* Tunables. */ static int vtpci_disable_msix = 0; TUNABLE_INT("hw.virtio.pci.disable_msix", &vtpci_disable_msix); static device_method_t vtpci_methods[] = { /* Device interface. */ DEVMETHOD(device_probe, vtpci_probe), DEVMETHOD(device_attach, vtpci_attach), DEVMETHOD(device_detach, vtpci_detach), DEVMETHOD(device_suspend, vtpci_suspend), DEVMETHOD(device_resume, vtpci_resume), DEVMETHOD(device_shutdown, vtpci_shutdown), /* Bus interface. */ DEVMETHOD(bus_driver_added, vtpci_driver_added), DEVMETHOD(bus_child_detached, vtpci_child_detached), DEVMETHOD(bus_child_pnpinfo_str, virtio_child_pnpinfo_str), DEVMETHOD(bus_read_ivar, vtpci_read_ivar), DEVMETHOD(bus_write_ivar, vtpci_write_ivar), /* VirtIO bus interface. */ DEVMETHOD(virtio_bus_negotiate_features, vtpci_negotiate_features), DEVMETHOD(virtio_bus_with_feature, vtpci_with_feature), DEVMETHOD(virtio_bus_alloc_virtqueues, vtpci_alloc_virtqueues), DEVMETHOD(virtio_bus_setup_intr, vtpci_setup_intr), DEVMETHOD(virtio_bus_stop, vtpci_stop), DEVMETHOD(virtio_bus_reinit, vtpci_reinit), DEVMETHOD(virtio_bus_reinit_complete, vtpci_reinit_complete), DEVMETHOD(virtio_bus_notify_vq, vtpci_notify_virtqueue), DEVMETHOD(virtio_bus_read_device_config, vtpci_read_dev_config), DEVMETHOD(virtio_bus_write_device_config, vtpci_write_dev_config), DEVMETHOD_END }; static driver_t vtpci_driver = { "virtio_pci", vtpci_methods, sizeof(struct vtpci_softc) }; devclass_t vtpci_devclass; DRIVER_MODULE(virtio_pci, pci, vtpci_driver, vtpci_devclass, 0, 0); MODULE_VERSION(virtio_pci, 1); MODULE_DEPEND(virtio_pci, pci, 1, 1, 1); MODULE_DEPEND(virtio_pci, virtio, 1, 1, 1); static int vtpci_probe(device_t dev) { char desc[36]; const char *name; if (pci_get_vendor(dev) != VIRTIO_PCI_VENDORID) return (ENXIO); if (pci_get_device(dev) < VIRTIO_PCI_DEVICEID_MIN || pci_get_device(dev) > VIRTIO_PCI_DEVICEID_MAX) return (ENXIO); if (pci_get_revid(dev) != VIRTIO_PCI_ABI_VERSION) return (ENXIO); name = virtio_device_name(pci_get_subdevice(dev)); if (name == NULL) name = "Unknown"; snprintf(desc, sizeof(desc), "VirtIO PCI %s adapter", name); device_set_desc_copy(dev, desc); return (BUS_PROBE_DEFAULT); } static int vtpci_attach(device_t dev) { struct vtpci_softc *sc; device_t child; int rid; sc = device_get_softc(dev); sc->vtpci_dev = dev; pci_enable_busmaster(dev); rid = PCIR_BAR(0); sc->vtpci_res = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid, RF_ACTIVE); if (sc->vtpci_res == NULL) { device_printf(dev, "cannot map I/O space\n"); return (ENXIO); } if (pci_find_cap(dev, PCIY_MSI, NULL) != 0) sc->vtpci_flags |= VTPCI_FLAG_NO_MSI; if (pci_find_cap(dev, PCIY_MSIX, NULL) == 0) { rid = PCIR_BAR(1); sc->vtpci_msix_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); } if (sc->vtpci_msix_res == NULL) sc->vtpci_flags |= VTPCI_FLAG_NO_MSIX; vtpci_reset(sc); /* Tell the host we've noticed this device. */ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK); if ((child = device_add_child(dev, NULL, -1)) == NULL) { device_printf(dev, "cannot create child device\n"); vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED); vtpci_detach(dev); return (ENOMEM); } sc->vtpci_child_dev = child; vtpci_probe_and_attach_child(sc); return (0); } static int vtpci_detach(device_t dev) { struct vtpci_softc *sc; device_t child; int error; sc = device_get_softc(dev); if ((child = sc->vtpci_child_dev) != NULL) { error = device_delete_child(dev, child); if (error) return (error); sc->vtpci_child_dev = NULL; } vtpci_reset(sc); if (sc->vtpci_msix_res != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(1), sc->vtpci_msix_res); sc->vtpci_msix_res = NULL; } if (sc->vtpci_res != NULL) { bus_release_resource(dev, SYS_RES_IOPORT, PCIR_BAR(0), sc->vtpci_res); sc->vtpci_res = NULL; } return (0); } static int vtpci_suspend(device_t dev) { return (bus_generic_suspend(dev)); } static int vtpci_resume(device_t dev) { return (bus_generic_resume(dev)); } static int vtpci_shutdown(device_t dev) { (void) bus_generic_shutdown(dev); /* Forcibly stop the host device. */ vtpci_stop(dev); return (0); } static void vtpci_driver_added(device_t dev, driver_t *driver) { struct vtpci_softc *sc; sc = device_get_softc(dev); vtpci_probe_and_attach_child(sc); } static void vtpci_child_detached(device_t dev, device_t child) { struct vtpci_softc *sc; sc = device_get_softc(dev); vtpci_reset(sc); vtpci_release_child_resources(sc); } static int vtpci_read_ivar(device_t dev, device_t child, int index, uintptr_t *result) { struct vtpci_softc *sc; sc = device_get_softc(dev); if (sc->vtpci_child_dev != child) return (ENOENT); switch (index) { case VIRTIO_IVAR_DEVTYPE: case VIRTIO_IVAR_SUBDEVICE: *result = pci_get_subdevice(dev); break; case VIRTIO_IVAR_VENDOR: *result = pci_get_vendor(dev); break; case VIRTIO_IVAR_DEVICE: *result = pci_get_device(dev); break; case VIRTIO_IVAR_SUBVENDOR: *result = pci_get_subvendor(dev); break; default: return (ENOENT); } return (0); } static int vtpci_write_ivar(device_t dev, device_t child, int index, uintptr_t value) { struct vtpci_softc *sc; sc = device_get_softc(dev); if (sc->vtpci_child_dev != child) return (ENOENT); switch (index) { case VIRTIO_IVAR_FEATURE_DESC: sc->vtpci_child_feat_desc = (void *) value; break; default: return (ENOENT); } return (0); } static uint64_t vtpci_negotiate_features(device_t dev, uint64_t child_features) { struct vtpci_softc *sc; uint64_t host_features, features; sc = device_get_softc(dev); host_features = vtpci_read_config_4(sc, VIRTIO_PCI_HOST_FEATURES); vtpci_describe_features(sc, "host", host_features); /* * Limit negotiated features to what the driver, virtqueue, and * host all support. */ features = host_features & child_features; features = virtqueue_filter_features(features); sc->vtpci_features = features; vtpci_describe_features(sc, "negotiated", features); vtpci_write_config_4(sc, VIRTIO_PCI_GUEST_FEATURES, features); return (features); } static int vtpci_with_feature(device_t dev, uint64_t feature) { struct vtpci_softc *sc; sc = device_get_softc(dev); return ((sc->vtpci_features & feature) != 0); } static int vtpci_alloc_virtqueues(device_t dev, int flags, int nvqs, struct vq_alloc_info *vq_info) { struct vtpci_softc *sc; struct virtqueue *vq; struct vtpci_virtqueue *vqx; struct vq_alloc_info *info; int idx, error; uint16_t size; sc = device_get_softc(dev); if (sc->vtpci_nvqs != 0) return (EALREADY); if (nvqs <= 0) return (EINVAL); sc->vtpci_vqs = malloc(nvqs * sizeof(struct vtpci_virtqueue), M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vtpci_vqs == NULL) return (ENOMEM); for (idx = 0; idx < nvqs; idx++) { vqx = &sc->vtpci_vqs[idx]; info = &vq_info[idx]; vtpci_select_virtqueue(sc, idx); size = vtpci_read_config_2(sc, VIRTIO_PCI_QUEUE_NUM); error = virtqueue_alloc(dev, idx, size, VIRTIO_PCI_VRING_ALIGN, - 0xFFFFFFFFUL, info, &vq); + ~(vm_paddr_t)0, info, &vq); if (error) { device_printf(dev, "cannot allocate virtqueue %d: %d\n", idx, error); break; } vtpci_write_config_4(sc, VIRTIO_PCI_QUEUE_PFN, virtqueue_paddr(vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT); vqx->vtv_vq = *info->vqai_vq = vq; vqx->vtv_no_intr = info->vqai_intr == NULL; sc->vtpci_nvqs++; } if (error) vtpci_free_virtqueues(sc); return (error); } static int vtpci_setup_intr(device_t dev, enum intr_type type) { struct vtpci_softc *sc; int attempt, error; sc = device_get_softc(dev); for (attempt = 0; attempt < 5; attempt++) { /* * Start with the most desirable interrupt configuration and * fallback towards less desirable ones. */ switch (attempt) { case 0: error = vtpci_alloc_intr_msix_pervq(sc); break; case 1: error = vtpci_alloc_intr_msix_shared(sc); break; case 2: error = vtpci_alloc_intr_msi(sc); break; case 3: error = vtpci_alloc_intr_legacy(sc); break; default: device_printf(dev, "exhausted all interrupt allocation attempts\n"); return (ENXIO); } if (error == 0 && vtpci_setup_interrupts(sc, type) == 0) break; vtpci_cleanup_setup_intr_attempt(sc); } if (bootverbose) { if (sc->vtpci_flags & VTPCI_FLAG_LEGACY) device_printf(dev, "using legacy interrupt\n"); else if (sc->vtpci_flags & VTPCI_FLAG_MSI) device_printf(dev, "using MSI interrupt\n"); else if (sc->vtpci_flags & VTPCI_FLAG_SHARED_MSIX) device_printf(dev, "using shared MSIX interrupts\n"); else device_printf(dev, "using per VQ MSIX interrupts\n"); } return (0); } static void vtpci_stop(device_t dev) { vtpci_reset(device_get_softc(dev)); } static int vtpci_reinit(device_t dev, uint64_t features) { struct vtpci_softc *sc; int idx, error; sc = device_get_softc(dev); /* * Redrive the device initialization. This is a bit of an abuse of * the specification, but VirtualBox, QEMU/KVM, and BHyVe seem to * play nice. * * We do not allow the host device to change from what was originally * negotiated beyond what the guest driver changed. MSIX state should * not change, number of virtqueues and their size remain the same, etc. * This will need to be rethought when we want to support migration. */ if (vtpci_get_status(dev) != VIRTIO_CONFIG_STATUS_RESET) vtpci_stop(dev); /* * Quickly drive the status through ACK and DRIVER. The device * does not become usable again until vtpci_reinit_complete(). */ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK); vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER); vtpci_negotiate_features(dev, features); for (idx = 0; idx < sc->vtpci_nvqs; idx++) { error = vtpci_reinit_virtqueue(sc, idx); if (error) return (error); } if (sc->vtpci_flags & VTPCI_FLAG_MSIX) { error = vtpci_set_host_msix_vectors(sc); if (error) return (error); } return (0); } static void vtpci_reinit_complete(device_t dev) { vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER_OK); } static void vtpci_notify_virtqueue(device_t dev, uint16_t queue) { struct vtpci_softc *sc; sc = device_get_softc(dev); vtpci_write_config_2(sc, VIRTIO_PCI_QUEUE_NOTIFY, queue); } static uint8_t vtpci_get_status(device_t dev) { struct vtpci_softc *sc; sc = device_get_softc(dev); return (vtpci_read_config_1(sc, VIRTIO_PCI_STATUS)); } static void vtpci_set_status(device_t dev, uint8_t status) { struct vtpci_softc *sc; sc = device_get_softc(dev); if (status != VIRTIO_CONFIG_STATUS_RESET) status |= vtpci_get_status(dev); vtpci_write_config_1(sc, VIRTIO_PCI_STATUS, status); } static void vtpci_read_dev_config(device_t dev, bus_size_t offset, void *dst, int length) { struct vtpci_softc *sc; bus_size_t off; uint8_t *d; int size; sc = device_get_softc(dev); off = VIRTIO_PCI_CONFIG(sc) + offset; for (d = dst; length > 0; d += size, off += size, length -= size) { if (length >= 4) { size = 4; *(uint32_t *)d = vtpci_read_config_4(sc, off); } else if (length >= 2) { size = 2; *(uint16_t *)d = vtpci_read_config_2(sc, off); } else { size = 1; *d = vtpci_read_config_1(sc, off); } } } static void vtpci_write_dev_config(device_t dev, bus_size_t offset, void *src, int length) { struct vtpci_softc *sc; bus_size_t off; uint8_t *s; int size; sc = device_get_softc(dev); off = VIRTIO_PCI_CONFIG(sc) + offset; for (s = src; length > 0; s += size, off += size, length -= size) { if (length >= 4) { size = 4; vtpci_write_config_4(sc, off, *(uint32_t *)s); } else if (length >= 2) { size = 2; vtpci_write_config_2(sc, off, *(uint16_t *)s); } else { size = 1; vtpci_write_config_1(sc, off, *s); } } } static void vtpci_describe_features(struct vtpci_softc *sc, const char *msg, uint64_t features) { device_t dev, child; dev = sc->vtpci_dev; child = sc->vtpci_child_dev; if (device_is_attached(child) || bootverbose == 0) return; virtio_describe(dev, msg, features, sc->vtpci_child_feat_desc); } static void vtpci_probe_and_attach_child(struct vtpci_softc *sc) { device_t dev, child; dev = sc->vtpci_dev; child = sc->vtpci_child_dev; if (child == NULL) return; if (device_get_state(child) != DS_NOTPRESENT) return; if (device_probe(child) != 0) return; vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER); if (device_attach(child) != 0) { vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED); vtpci_reset(sc); vtpci_release_child_resources(sc); /* Reset status for future attempt. */ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK); } else { vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER_OK); VIRTIO_ATTACH_COMPLETED(child); } } static int vtpci_alloc_msix(struct vtpci_softc *sc, int nvectors) { device_t dev; int nmsix, cnt, required; dev = sc->vtpci_dev; /* Allocate an additional vector for the config changes. */ required = nvectors + 1; nmsix = pci_msix_count(dev); if (nmsix < required) return (1); cnt = required; if (pci_alloc_msix(dev, &cnt) == 0 && cnt >= required) { sc->vtpci_nmsix_resources = required; return (0); } pci_release_msi(dev); return (1); } static int vtpci_alloc_msi(struct vtpci_softc *sc) { device_t dev; int nmsi, cnt, required; dev = sc->vtpci_dev; required = 1; nmsi = pci_msi_count(dev); if (nmsi < required) return (1); cnt = required; if (pci_alloc_msi(dev, &cnt) == 0 && cnt >= required) return (0); pci_release_msi(dev); return (1); } static int vtpci_alloc_intr_msix_pervq(struct vtpci_softc *sc) { int i, nvectors, error; if (vtpci_disable_msix != 0 || sc->vtpci_flags & VTPCI_FLAG_NO_MSIX) return (ENOTSUP); for (nvectors = 0, i = 0; i < sc->vtpci_nvqs; i++) { if (sc->vtpci_vqs[i].vtv_no_intr == 0) nvectors++; } error = vtpci_alloc_msix(sc, nvectors); if (error) return (error); sc->vtpci_flags |= VTPCI_FLAG_MSIX; return (0); } static int vtpci_alloc_intr_msix_shared(struct vtpci_softc *sc) { int error; if (vtpci_disable_msix != 0 || sc->vtpci_flags & VTPCI_FLAG_NO_MSIX) return (ENOTSUP); error = vtpci_alloc_msix(sc, 1); if (error) return (error); sc->vtpci_flags |= VTPCI_FLAG_MSIX | VTPCI_FLAG_SHARED_MSIX; return (0); } static int vtpci_alloc_intr_msi(struct vtpci_softc *sc) { int error; /* Only BHyVe supports MSI. */ if (sc->vtpci_flags & VTPCI_FLAG_NO_MSI) return (ENOTSUP); error = vtpci_alloc_msi(sc); if (error) return (error); sc->vtpci_flags |= VTPCI_FLAG_MSI; return (0); } static int vtpci_alloc_intr_legacy(struct vtpci_softc *sc) { sc->vtpci_flags |= VTPCI_FLAG_LEGACY; return (0); } static int vtpci_alloc_interrupt(struct vtpci_softc *sc, int rid, int flags, struct vtpci_interrupt *intr) { struct resource *irq; irq = bus_alloc_resource_any(sc->vtpci_dev, SYS_RES_IRQ, &rid, flags); if (irq == NULL) return (ENXIO); intr->vti_irq = irq; intr->vti_rid = rid; return (0); } static int vtpci_alloc_intr_resources(struct vtpci_softc *sc) { struct vtpci_interrupt *intr; int i, rid, flags, nvq_intrs, error; rid = 0; flags = RF_ACTIVE; if (sc->vtpci_flags & VTPCI_FLAG_LEGACY) flags |= RF_SHAREABLE; else rid = 1; /* * For legacy and MSI interrupts, this single resource handles all * interrupts. For MSIX, this resource is used for the configuration * changed interrupt. */ intr = &sc->vtpci_device_interrupt; error = vtpci_alloc_interrupt(sc, rid, flags, intr); if (error || sc->vtpci_flags & (VTPCI_FLAG_LEGACY | VTPCI_FLAG_MSI)) return (error); /* Subtract one for the configuration changed interrupt. */ nvq_intrs = sc->vtpci_nmsix_resources - 1; intr = sc->vtpci_msix_vq_interrupts = malloc(nvq_intrs * sizeof(struct vtpci_interrupt), M_DEVBUF, M_NOWAIT | M_ZERO); if (sc->vtpci_msix_vq_interrupts == NULL) return (ENOMEM); for (i = 0, rid++; i < nvq_intrs; i++, rid++, intr++) { error = vtpci_alloc_interrupt(sc, rid, flags, intr); if (error) return (error); } return (0); } static int vtpci_setup_legacy_interrupt(struct vtpci_softc *sc, enum intr_type type) { struct vtpci_interrupt *intr; int error; intr = &sc->vtpci_device_interrupt; error = bus_setup_intr(sc->vtpci_dev, intr->vti_irq, type, NULL, vtpci_legacy_intr, sc, &intr->vti_handler); return (error); } static int vtpci_setup_pervq_msix_interrupts(struct vtpci_softc *sc, enum intr_type type) { struct vtpci_virtqueue *vqx; struct vtpci_interrupt *intr; int i, error; intr = sc->vtpci_msix_vq_interrupts; for (i = 0; i < sc->vtpci_nvqs; i++) { vqx = &sc->vtpci_vqs[i]; if (vqx->vtv_no_intr) continue; error = bus_setup_intr(sc->vtpci_dev, intr->vti_irq, type, vtpci_vq_intr_filter, vtpci_vq_intr, vqx->vtv_vq, &intr->vti_handler); if (error) return (error); intr++; } return (0); } static int vtpci_setup_msix_interrupts(struct vtpci_softc *sc, enum intr_type type) { device_t dev; struct vtpci_interrupt *intr; int error; dev = sc->vtpci_dev; intr = &sc->vtpci_device_interrupt; error = bus_setup_intr(dev, intr->vti_irq, type, NULL, vtpci_config_intr, sc, &intr->vti_handler); if (error) return (error); if (sc->vtpci_flags & VTPCI_FLAG_SHARED_MSIX) { intr = sc->vtpci_msix_vq_interrupts; error = bus_setup_intr(dev, intr->vti_irq, type, vtpci_vq_shared_intr_filter, vtpci_vq_shared_intr, sc, &intr->vti_handler); } else error = vtpci_setup_pervq_msix_interrupts(sc, type); return (error ? error : vtpci_set_host_msix_vectors(sc)); } static int vtpci_setup_interrupts(struct vtpci_softc *sc, enum intr_type type) { int error; type |= INTR_MPSAFE; KASSERT(sc->vtpci_flags & VTPCI_FLAG_ITYPE_MASK, ("%s: no interrupt type selected %#x", __func__, sc->vtpci_flags)); error = vtpci_alloc_intr_resources(sc); if (error) return (error); if (sc->vtpci_flags & VTPCI_FLAG_LEGACY) error = vtpci_setup_legacy_interrupt(sc, type); else if (sc->vtpci_flags & VTPCI_FLAG_MSI) error = vtpci_setup_msi_interrupt(sc, type); else error = vtpci_setup_msix_interrupts(sc, type); return (error); } static int vtpci_register_msix_vector(struct vtpci_softc *sc, int offset, struct vtpci_interrupt *intr) { device_t dev; uint16_t vector; dev = sc->vtpci_dev; if (intr != NULL) { /* Map from guest rid to host vector. */ vector = intr->vti_rid - 1; } else vector = VIRTIO_MSI_NO_VECTOR; vtpci_write_config_2(sc, offset, vector); /* Read vector to determine if the host had sufficient resources. */ if (vtpci_read_config_2(sc, offset) != vector) { device_printf(dev, "insufficient host resources for MSIX interrupts\n"); return (ENODEV); } return (0); } static int vtpci_set_host_msix_vectors(struct vtpci_softc *sc) { struct vtpci_interrupt *intr, *tintr; int idx, offset, error; intr = &sc->vtpci_device_interrupt; offset = VIRTIO_MSI_CONFIG_VECTOR; error = vtpci_register_msix_vector(sc, offset, intr); if (error) return (error); intr = sc->vtpci_msix_vq_interrupts; offset = VIRTIO_MSI_QUEUE_VECTOR; for (idx = 0; idx < sc->vtpci_nvqs; idx++) { vtpci_select_virtqueue(sc, idx); if (sc->vtpci_vqs[idx].vtv_no_intr) tintr = NULL; else tintr = intr; error = vtpci_register_msix_vector(sc, offset, tintr); if (error) break; /* * For shared MSIX, all the virtqueues share the first * interrupt. */ if (!sc->vtpci_vqs[idx].vtv_no_intr && (sc->vtpci_flags & VTPCI_FLAG_SHARED_MSIX) == 0) intr++; } return (error); } static int vtpci_reinit_virtqueue(struct vtpci_softc *sc, int idx) { struct vtpci_virtqueue *vqx; struct virtqueue *vq; int error; uint16_t size; vqx = &sc->vtpci_vqs[idx]; vq = vqx->vtv_vq; KASSERT(vq != NULL, ("%s: vq %d not allocated", __func__, idx)); vtpci_select_virtqueue(sc, idx); size = vtpci_read_config_2(sc, VIRTIO_PCI_QUEUE_NUM); error = virtqueue_reinit(vq, size); if (error) return (error); vtpci_write_config_4(sc, VIRTIO_PCI_QUEUE_PFN, virtqueue_paddr(vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT); return (0); } static void vtpci_free_interrupt(struct vtpci_softc *sc, struct vtpci_interrupt *intr) { device_t dev; dev = sc->vtpci_dev; if (intr->vti_handler != NULL) { bus_teardown_intr(dev, intr->vti_irq, intr->vti_handler); intr->vti_handler = NULL; } if (intr->vti_irq != NULL) { bus_release_resource(dev, SYS_RES_IRQ, intr->vti_rid, intr->vti_irq); intr->vti_irq = NULL; intr->vti_rid = -1; } } static void vtpci_free_interrupts(struct vtpci_softc *sc) { struct vtpci_interrupt *intr; int i, nvq_intrs; vtpci_free_interrupt(sc, &sc->vtpci_device_interrupt); if (sc->vtpci_nmsix_resources != 0) { nvq_intrs = sc->vtpci_nmsix_resources - 1; sc->vtpci_nmsix_resources = 0; intr = sc->vtpci_msix_vq_interrupts; if (intr != NULL) { for (i = 0; i < nvq_intrs; i++, intr++) vtpci_free_interrupt(sc, intr); free(sc->vtpci_msix_vq_interrupts, M_DEVBUF); sc->vtpci_msix_vq_interrupts = NULL; } } if (sc->vtpci_flags & (VTPCI_FLAG_MSI | VTPCI_FLAG_MSIX)) pci_release_msi(sc->vtpci_dev); sc->vtpci_flags &= ~VTPCI_FLAG_ITYPE_MASK; } static void vtpci_free_virtqueues(struct vtpci_softc *sc) { struct vtpci_virtqueue *vqx; int idx; for (idx = 0; idx < sc->vtpci_nvqs; idx++) { vqx = &sc->vtpci_vqs[idx]; vtpci_select_virtqueue(sc, idx); vtpci_write_config_4(sc, VIRTIO_PCI_QUEUE_PFN, 0); virtqueue_free(vqx->vtv_vq); vqx->vtv_vq = NULL; } free(sc->vtpci_vqs, M_DEVBUF); sc->vtpci_vqs = NULL; sc->vtpci_nvqs = 0; } static void vtpci_release_child_resources(struct vtpci_softc *sc) { vtpci_free_interrupts(sc); vtpci_free_virtqueues(sc); } static void vtpci_cleanup_setup_intr_attempt(struct vtpci_softc *sc) { int idx; if (sc->vtpci_flags & VTPCI_FLAG_MSIX) { vtpci_write_config_2(sc, VIRTIO_MSI_CONFIG_VECTOR, VIRTIO_MSI_NO_VECTOR); for (idx = 0; idx < sc->vtpci_nvqs; idx++) { vtpci_select_virtqueue(sc, idx); vtpci_write_config_2(sc, VIRTIO_MSI_QUEUE_VECTOR, VIRTIO_MSI_NO_VECTOR); } } vtpci_free_interrupts(sc); } static void vtpci_reset(struct vtpci_softc *sc) { /* * Setting the status to RESET sets the host device to * the original, uninitialized state. */ vtpci_set_status(sc->vtpci_dev, VIRTIO_CONFIG_STATUS_RESET); } static void vtpci_select_virtqueue(struct vtpci_softc *sc, int idx) { vtpci_write_config_2(sc, VIRTIO_PCI_QUEUE_SEL, idx); } static void vtpci_legacy_intr(void *xsc) { struct vtpci_softc *sc; struct vtpci_virtqueue *vqx; int i; uint8_t isr; sc = xsc; vqx = &sc->vtpci_vqs[0]; /* Reading the ISR also clears it. */ isr = vtpci_read_config_1(sc, VIRTIO_PCI_ISR); if (isr & VIRTIO_PCI_ISR_CONFIG) vtpci_config_intr(sc); if (isr & VIRTIO_PCI_ISR_INTR) { for (i = 0; i < sc->vtpci_nvqs; i++, vqx++) { if (vqx->vtv_no_intr == 0) virtqueue_intr(vqx->vtv_vq); } } } static int vtpci_vq_shared_intr_filter(void *xsc) { struct vtpci_softc *sc; struct vtpci_virtqueue *vqx; int i, rc; rc = 0; sc = xsc; vqx = &sc->vtpci_vqs[0]; for (i = 0; i < sc->vtpci_nvqs; i++, vqx++) { if (vqx->vtv_no_intr == 0) rc |= virtqueue_intr_filter(vqx->vtv_vq); } return (rc ? FILTER_SCHEDULE_THREAD : FILTER_STRAY); } static void vtpci_vq_shared_intr(void *xsc) { struct vtpci_softc *sc; struct vtpci_virtqueue *vqx; int i; sc = xsc; vqx = &sc->vtpci_vqs[0]; for (i = 0; i < sc->vtpci_nvqs; i++, vqx++) { if (vqx->vtv_no_intr == 0) virtqueue_intr(vqx->vtv_vq); } } static int vtpci_vq_intr_filter(void *xvq) { struct virtqueue *vq; int rc; vq = xvq; rc = virtqueue_intr_filter(vq); return (rc ? FILTER_SCHEDULE_THREAD : FILTER_STRAY); } static void vtpci_vq_intr(void *xvq) { struct virtqueue *vq; vq = xvq; virtqueue_intr(vq); } static void vtpci_config_intr(void *xsc) { struct vtpci_softc *sc; device_t child; sc = xsc; child = sc->vtpci_child_dev; if (child != NULL) VIRTIO_CONFIG_CHANGE(child); } Index: projects/nfsv42/sys/i386/i386/pmap.c =================================================================== --- projects/nfsv42/sys/i386/i386/pmap.c (revision 350367) +++ projects/nfsv42/sys/i386/i386/pmap.c (revision 350368) @@ -1,6164 +1,6154 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * Copyright (c) 2018 The FreeBSD Foundation * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Portions of this software were developed by * Konstantin Belousov under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_apic.h" #include "opt_cpu.h" #include "opt_pmap.h" #include "opt_smp.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include #include #include #endif #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) /* * PTmap is recursive pagemap at top of virtual address space. * Within PTmap, the page directory can be found (third indirection). */ #define PTmap ((pt_entry_t *)(PTDPTDI << PDRSHIFT)) #define PTD ((pd_entry_t *)((PTDPTDI << PDRSHIFT) + (PTDPTDI * PAGE_SIZE))) #define PTDpde ((pd_entry_t *)((PTDPTDI << PDRSHIFT) + (PTDPTDI * PAGE_SIZE) + \ (PTDPTDI * PDESIZE))) /* * Translate a virtual address to the kernel virtual address of its page table * entry (PTE). This can be used recursively. If the address of a PTE as * previously returned by this macro is itself given as the argument, then the * address of the page directory entry (PDE) that maps the PTE will be * returned. * * This macro may be used before pmap_bootstrap() is called. */ #define vtopte(va) (PTmap + i386_btop(va)) /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ atomic_clear_int((u_int *)(pte), PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) _Static_assert(sizeof(struct pmap) <= sizeof(struct pmap_KBI), "pmap_KBI"); static int pgeflag = 0; /* PG_G or-in */ static int pseflag = 0; /* PG_PS or-in */ static int nkpt = NKPT; #ifdef PMAP_PAE_COMP pt_entry_t pg_nx; static uma_zone_t pdptzone; #endif _Static_assert(VM_MAXUSER_ADDRESS == VADDR(TRPTDI, 0), "VM_MAXUSER_ADDRESS"); _Static_assert(VM_MAX_KERNEL_ADDRESS <= VADDR(PTDPTDI, 0), "VM_MAX_KERNEL_ADDRESS"); _Static_assert(PMAP_MAP_LOW == VADDR(LOWPTDI, 0), "PMAP_MAP_LOW"); _Static_assert(KERNLOAD == (KERNPTDI << PDRSHIFT), "KERNLOAD"); extern int pat_works; extern int pg_ps_enabled; extern int elf32_nxstack; #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; static struct rwlock_padalign pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); extern int pv_entry_max, pv_entry_count; static int pv_entry_high_water = 0; static struct md_page *pv_table; extern int shpgperproc; static struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ static int pv_maxchunks; /* How many chunks we have KVA for */ static vm_offset_t pv_vafree; /* freelist stored in the PTE */ /* * All those kernel PT submaps that BSD is so fond of */ static pt_entry_t *CMAP3; static pd_entry_t *KPTD; static caddr_t CADDR3; /* * Crashdump maps. */ static caddr_t crashdumpmap; static pt_entry_t *PMAP1 = NULL, *PMAP2, *PMAP3; static pt_entry_t *PADDR1 = NULL, *PADDR2, *PADDR3; #ifdef SMP static int PMAP1cpu, PMAP3cpu; extern int PMAP1changedcpu; #endif extern int PMAP1changed; extern int PMAP1unchanged; static struct mtx PMAP2mutex; /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags); #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); #endif static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_pvh_wired_mappings(struct md_page *pvh, int count); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static bool pmap_enter_4mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot); static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static boolean_t pmap_is_modified_pvh(struct md_page *pvh); static boolean_t pmap_is_referenced_pvh(struct md_page *pvh); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); #if VM_NRESERVLEVEL > 0 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); #endif static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, struct spglist *free); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, struct spglist *free); static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, struct spglist *free); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags); static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free); static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); static void pmap_pte_release(pt_entry_t *pte); static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); #ifdef PMAP_PAE_COMP static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait); #endif static void pmap_init_trm(void); static void pmap_invalidate_all_int(pmap_t pmap); static __inline void pagezero(void *page); CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); extern char _end[]; extern u_long physfree; /* phys addr of next free page */ extern u_long vm86phystk;/* PA of vm86/bios stack */ extern u_long vm86paddr;/* address of vm86 region */ extern int vm86pa; /* phys addr of vm86 region */ extern u_long KERNend; /* phys addr end of kernel (just after bss) */ #ifdef PMAP_PAE_COMP pd_entry_t *IdlePTD_pae; /* phys addr of kernel PTD */ pdpt_entry_t *IdlePDPT; /* phys addr of kernel PDPT */ pt_entry_t *KPTmap_pae; /* address of kernel page tables */ #define IdlePTD IdlePTD_pae #define KPTmap KPTmap_pae #else pd_entry_t *IdlePTD_nopae; pt_entry_t *KPTmap_nopae; #define IdlePTD IdlePTD_nopae #define KPTmap KPTmap_nopae #endif extern u_long KPTphys; /* phys addr of kernel page tables */ extern u_long tramp_idleptd; static u_long allocpages(u_int cnt, u_long *physfree) { u_long res; res = *physfree; *physfree += PAGE_SIZE * cnt; bzero((void *)res, PAGE_SIZE * cnt); return (res); } static void pmap_cold_map(u_long pa, u_long va, u_long cnt) { pt_entry_t *pt; for (pt = (pt_entry_t *)KPTphys + atop(va); cnt > 0; cnt--, pt++, va += PAGE_SIZE, pa += PAGE_SIZE) *pt = pa | PG_V | PG_RW | PG_A | PG_M; } static void pmap_cold_mapident(u_long pa, u_long cnt) { pmap_cold_map(pa, pa, cnt); } _Static_assert(LOWPTDI * 2 * NBPDR == KERNBASE, "Broken double-map of zero PTD"); static void __CONCAT(PMTYPE, remap_lower)(bool enable) { int i; for (i = 0; i < LOWPTDI; i++) IdlePTD[i] = enable ? IdlePTD[LOWPTDI + i] : 0; load_cr3(rcr3()); /* invalidate TLB */ } /* * Called from locore.s before paging is enabled. Sets up the first * kernel page table. Since kernel is mapped with PA == VA, this code * does not require relocations. */ void __CONCAT(PMTYPE, cold)(void) { pt_entry_t *pt; u_long a; u_int cr3, ncr4; physfree = (u_long)&_end; if (bootinfo.bi_esymtab != 0) physfree = bootinfo.bi_esymtab; if (bootinfo.bi_kernend != 0) physfree = bootinfo.bi_kernend; physfree = roundup2(physfree, NBPDR); KERNend = physfree; /* Allocate Kernel Page Tables */ KPTphys = allocpages(NKPT, &physfree); KPTmap = (pt_entry_t *)KPTphys; /* Allocate Page Table Directory */ #ifdef PMAP_PAE_COMP /* XXX only need 32 bytes (easier for now) */ IdlePDPT = (pdpt_entry_t *)allocpages(1, &physfree); #endif IdlePTD = (pd_entry_t *)allocpages(NPGPTD, &physfree); /* * Allocate KSTACK. Leave a guard page between IdlePTD and * proc0kstack, to control stack overflow for thread0 and * prevent corruption of the page table. We leak the guard * physical memory due to 1:1 mappings. */ allocpages(1, &physfree); proc0kstack = allocpages(TD0_KSTACK_PAGES, &physfree); /* vm86/bios stack */ vm86phystk = allocpages(1, &physfree); /* pgtable + ext + IOPAGES */ vm86paddr = vm86pa = allocpages(3, &physfree); /* Install page tables into PTD. Page table page 1 is wasted. */ for (a = 0; a < NKPT; a++) IdlePTD[a] = (KPTphys + ptoa(a)) | PG_V | PG_RW | PG_A | PG_M; #ifdef PMAP_PAE_COMP /* PAE install PTD pointers into PDPT */ for (a = 0; a < NPGPTD; a++) IdlePDPT[a] = ((u_int)IdlePTD + ptoa(a)) | PG_V; #endif /* * Install recursive mapping for kernel page tables into * itself. */ for (a = 0; a < NPGPTD; a++) IdlePTD[PTDPTDI + a] = ((u_int)IdlePTD + ptoa(a)) | PG_V | PG_RW; /* * Initialize page table pages mapping physical address zero * through the (physical) end of the kernel. Many of these * pages must be reserved, and we reserve them all and map * them linearly for convenience. We do this even if we've * enabled PSE above; we'll just switch the corresponding * kernel PDEs before we turn on paging. * * This and all other page table entries allow read and write * access for various reasons. Kernel mappings never have any * access restrictions. */ pmap_cold_mapident(0, atop(NBPDR) * LOWPTDI); pmap_cold_map(0, NBPDR * LOWPTDI, atop(NBPDR) * LOWPTDI); pmap_cold_mapident(KERNBASE, atop(KERNend - KERNBASE)); /* Map page table directory */ #ifdef PMAP_PAE_COMP pmap_cold_mapident((u_long)IdlePDPT, 1); #endif pmap_cold_mapident((u_long)IdlePTD, NPGPTD); /* Map early KPTmap. It is really pmap_cold_mapident. */ pmap_cold_map(KPTphys, (u_long)KPTmap, NKPT); /* Map proc0kstack */ pmap_cold_mapident(proc0kstack, TD0_KSTACK_PAGES); /* ISA hole already mapped */ pmap_cold_mapident(vm86phystk, 1); pmap_cold_mapident(vm86pa, 3); /* Map page 0 into the vm86 page table */ *(pt_entry_t *)vm86pa = 0 | PG_RW | PG_U | PG_A | PG_M | PG_V; /* ...likewise for the ISA hole for vm86 */ for (pt = (pt_entry_t *)vm86pa + atop(ISA_HOLE_START), a = 0; a < atop(ISA_HOLE_LENGTH); a++, pt++) *pt = (ISA_HOLE_START + ptoa(a)) | PG_RW | PG_U | PG_A | PG_M | PG_V; /* Enable PSE, PGE, VME, and PAE if configured. */ ncr4 = 0; if ((cpu_feature & CPUID_PSE) != 0) { ncr4 |= CR4_PSE; pseflag = PG_PS; /* * Superpage mapping of the kernel text. Existing 4k * page table pages are wasted. */ for (a = KERNBASE; a < KERNend; a += NBPDR) IdlePTD[a >> PDRSHIFT] = a | PG_PS | PG_A | PG_M | PG_RW | PG_V; } if ((cpu_feature & CPUID_PGE) != 0) { ncr4 |= CR4_PGE; pgeflag = PG_G; } ncr4 |= (cpu_feature & CPUID_VME) != 0 ? CR4_VME : 0; #ifdef PMAP_PAE_COMP ncr4 |= CR4_PAE; #endif if (ncr4 != 0) load_cr4(rcr4() | ncr4); /* Now enable paging */ #ifdef PMAP_PAE_COMP cr3 = (u_int)IdlePDPT; if ((cpu_feature & CPUID_PAT) == 0) wbinvd(); #else cr3 = (u_int)IdlePTD; #endif tramp_idleptd = cr3; load_cr3(cr3); load_cr0(rcr0() | CR0_PG); /* * Now running relocated at KERNBASE where the system is * linked to run. */ /* * Remove the lowest part of the double mapping of low memory * to get some null pointer checks. */ __CONCAT(PMTYPE, remap_lower)(false); kernel_vm_end = /* 0 + */ NKPT * NBPDR; #ifdef PMAP_PAE_COMP i386_pmap_VM_NFREEORDER = VM_NFREEORDER_PAE; i386_pmap_VM_LEVEL_0_ORDER = VM_LEVEL_0_ORDER_PAE; i386_pmap_PDRSHIFT = PDRSHIFT_PAE; #else i386_pmap_VM_NFREEORDER = VM_NFREEORDER_NOPAE; i386_pmap_VM_LEVEL_0_ORDER = VM_LEVEL_0_ORDER_NOPAE; i386_pmap_PDRSHIFT = PDRSHIFT_NOPAE; #endif } static void __CONCAT(PMTYPE, set_nx)(void) { #ifdef PMAP_PAE_COMP if ((amd_feature & AMDID_NX) == 0) return; pg_nx = PG_NX; elf32_nxstack = 1; /* EFER.EFER_NXE is set in initializecpu(). */ #endif } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after pmap_cold() created initial * kernel page table and enabled paging, and just syncs the pmap * module with what has already been done. */ static void __CONCAT(PMTYPE, bootstrap)(vm_paddr_t firstaddr) { vm_offset_t va; pt_entry_t *pte, *unused; struct pcpu *pc; u_long res; int i; res = atop(firstaddr - (vm_paddr_t)KERNLOAD); /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); /* * Initialize the first available kernel virtual address. * However, using "firstaddr" may waste a few pages of the * kernel virtual address space, because pmap_cold() may not * have mapped every physical page that it allocated. * Preferably, pmap_cold() would provide a first unused * virtual address in addition to "firstaddr". */ virtual_avail = (vm_offset_t)firstaddr; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). * Count bootstrap data as being resident in case any of this data is * later unmapped (using pmap_remove()) and freed. */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pdir = IdlePTD; #ifdef PMAP_PAE_COMP kernel_pmap->pm_pdpt = IdlePDPT; #endif CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ kernel_pmap->pm_stats.resident_count = res; TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * Initialize temporary map objects on the current CPU for use * during early boot. * CMAP1/CMAP2 are used for zeroing and copying pages. * CMAP3 is used for the boot-time memory test. */ pc = get_pcpu(); mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1) SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1) SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1) SYSMAP(caddr_t, CMAP3, CADDR3, 1); /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. */ SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) /* * KPTmap is used by pmap_kextract(). * * KPTmap is first initialized by pmap_cold(). However, that initial * KPTmap can only support NKPT page table pages. Here, a larger * KPTmap is created that can support KVA_PAGES page table pages. */ SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES) for (i = 0; i < NKPT; i++) KPTD[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V; /* * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), * respectively. */ SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) SYSMAP(pt_entry_t *, PMAP3, PADDR3, 1) mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); virtual_avail = va; /* * Initialize the PAT MSR if present. * pmap_init_pat() clears and sets CR4_PGE, which, as a * side-effect, invalidates stale PG_G TLB entries that might * have been created in our pre-boot environment. We assume * that PAT support implies PGE and in reverse, PGE presence * comes with PAT. Both features were added for Pentium Pro. */ pmap_init_pat(); } static void pmap_init_reserved_pages(void) { struct pcpu *pc; vm_offset_t pages; int i; #ifdef PMAP_PAE_COMP if (!pae_mode) return; #else if (pae_mode) return; #endif CPU_FOREACH(i) { pc = pcpu_find(i); mtx_init(&pc->pc_copyout_mlock, "cpmlk", NULL, MTX_DEF | MTX_NEW); pc->pc_copyout_maddr = kva_alloc(ptoa(2)); if (pc->pc_copyout_maddr == 0) panic("unable to allocate non-sleepable copyout KVA"); sx_init(&pc->pc_copyout_slock, "cpslk"); pc->pc_copyout_saddr = kva_alloc(ptoa(2)); if (pc->pc_copyout_saddr == 0) panic("unable to allocate sleepable copyout KVA"); pc->pc_pmap_eh_va = kva_alloc(ptoa(1)); if (pc->pc_pmap_eh_va == 0) panic("unable to allocate pmap_extract_and_hold KVA"); pc->pc_pmap_eh_ptep = (char *)vtopte(pc->pc_pmap_eh_va); /* * Skip if the mappings have already been initialized, * i.e. this is the BSP. */ if (pc->pc_cmap_addr1 != 0) continue; mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); pages = kva_alloc(PAGE_SIZE * 3); if (pages == 0) panic("unable to allocate CMAP KVA"); pc->pc_cmap_pte1 = vtopte(pages); pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE); pc->pc_cmap_addr1 = (caddr_t)pages; pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE); pc->pc_qmap_addr = pages + ptoa(2); } } SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); /* * Setup the PAT MSR. */ static void __CONCAT(PMTYPE, init_pat)(void) { int pat_table[PAT_INDEX_SIZE]; uint64_t pat_msr; u_long cr0, cr4; int i; /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_table[i] = -1; pat_table[PAT_WRITE_BACK] = 0; pat_table[PAT_WRITE_THROUGH] = 1; pat_table[PAT_UNCACHEABLE] = 3; pat_table[PAT_WRITE_COMBINING] = 3; pat_table[PAT_WRITE_PROTECTED] = 3; pat_table[PAT_UNCACHED] = 3; /* * Bail if this CPU doesn't implement PAT. * We assume that PAT support implies PGE. */ if ((cpu_feature & CPUID_PAT) == 0) { for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; pat_works = 0; return; } /* * Due to some Intel errata, we can only safely use the lower 4 * PAT entries. * * Intel Pentium III Processor Specification Update * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B * or Mode C Paging) * * Intel Pentium IV Processor Specification Update * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) */ if (cpu_vendor_id == CPU_VENDOR_INTEL && !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) pat_works = 0; /* Initialize default PAT entries. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); if (pat_works) { /* * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * Leave 4 and 7 as WB and UC. */ pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING); pat_table[PAT_UNCACHED] = 2; pat_table[PAT_WRITE_PROTECTED] = 5; pat_table[PAT_WRITE_COMBINING] = 6; } else { /* * Just replace PAT Index 2 with WC instead of UC-. */ pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); pat_table[PAT_WRITE_COMBINING] = 2; } /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. */ load_cr0(cr0); load_cr4(cr4); } #ifdef PMAP_PAE_COMP static void * pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags, int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain), bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); } #endif /* * Abuse the pte nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PG_* bits * are ever set, PG_V in particular. * - Assumes we can write to ptes without pte_store() atomic ops, even * on PAE systems. This should be ok. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PG_V. * - Assumes a vm_offset_t will fit in a pte (true for i386). * Because PG_V is never set, there can be no mappings to invalidate. */ static vm_offset_t pmap_ptelist_alloc(vm_offset_t *head) { pt_entry_t *pte; vm_offset_t va; va = *head; if (va == 0) panic("pmap_ptelist_alloc: exhausted ptelist KVA"); pte = vtopte(va); *head = *pte; if (*head & PG_V) panic("pmap_ptelist_alloc: va with PG_V set!"); *pte = 0; return (va); } static void pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) { pt_entry_t *pte; if (va & PG_V) panic("pmap_ptelist_free: freeing va with PG_V set!"); pte = vtopte(va); *pte = *head; /* virtual! PG_V is 0 though */ *head = va; } static void pmap_ptelist_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_ptelist_free(head, va); } } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ static void __CONCAT(PMTYPE, init)(void) { struct pmap_preinit_mapping *ppim; vm_page_t mpte; vm_size_t s; int i, pv_npg; /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ PMAP_LOCK(kernel_pmap); for (i = 0; i < NKPT; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + ptoa(i)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = i + KPTDI; mpte->phys_addr = KPTphys + ptoa(i); mpte->wire_count = 1; /* * Collect the page table pages that were replaced by a 2/4MB * page. They are filled with equivalent 4KB page mappings. */ if (pseflag != 0 && KERNBASE <= i << PDRSHIFT && i << PDRSHIFT < KERNend && pmap_insert_pt_page(kernel_pmap, mpte, true)) panic("pmap_init: pmap_insert_pt_page failed"); } PMAP_UNLOCK(kernel_pmap); vm_wire_add(NKPT); /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings supported and enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pseflag == 0) pg_ps_enabled = 0; else if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; } /* * Calculate the size of the pv head table for superpages. * Handle the possibility that "vm_phys_segs[...].end" is zero. */ pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) / NBPDR + 1; /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("pmap_init: not enough kvm for pv chunks"); pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); #ifdef PMAP_PAE_COMP pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); #endif pmap_initialized = 1; pmap_init_trm(); if (!bootverbose) return; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i, (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode); } } extern u_long pmap_pde_demotions; extern u_long pmap_pde_mappings; extern u_long pmap_pde_p_failures; extern u_long pmap_pde_promotions; /*************************************************** * Low level helper routines..... ***************************************************/ static boolean_t __CONCAT(PMTYPE, is_valid_memattr)(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= 0 && mode < PAT_INDEX_SIZE && pat_index[(int)mode] >= 0); } /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ static int __CONCAT(PMTYPE, cache_bits)(pmap_t pmap, int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (!pmap_is_valid_memattr(pmap, mode)) panic("Unknown caching mode %d\n", mode); /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; return (cache_bits); } static bool __CONCAT(PMTYPE, ps_enabled)(pmap_t pmap __unused) { return (pg_ps_enabled); } /* * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde) { pd_entry_t *pde; pde = pmap_pde(kernel_pmap, va); pde_store(pde, newpde); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) { if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else /* if ((newpde & PG_G) == 0) */ /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. */ static void pmap_invalidate_page_int(pmap_t pmap, vm_offset_t va) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap) { invlpg(va); mask = &all_cpus; } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg(*mask, va, pmap); sched_unpin(); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) static void pmap_invalidate_range_int(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t *mask, other_cpus; vm_offset_t addr; u_int cpuid; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all_int(pmap); return; } sched_pin(); if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); mask = &all_cpus; } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg_range(*mask, sva, eva, pmap); sched_unpin(); } static void pmap_invalidate_all_int(pmap_t pmap) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap) { invltlb(); mask = &all_cpus; } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invltlb(*mask, pmap); sched_unpin(); } static void __CONCAT(PMTYPE, invalidate_cache)(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_kernel(void *arg) { struct pde_action *act = arg; pd_entry_t *pde; if (act->store == PCPU_GET(cpuid)) { pde = pmap_pde(kernel_pmap, act->va); pde_store(pde, act->newpde); } } static void pmap_update_pde_user(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pde_store(act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. */ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap) active = all_cpus; else active = pmap->pm_active; if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, smp_no_rendezvous_barrier, pmap == kernel_pmap ? pmap_update_pde_kernel : pmap_update_pde_user, pmap_update_pde_teardown, &act); } else { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, 486+ invalidation functions. * We inline these within pmap.c for speed. */ static void pmap_invalidate_page_int(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap) invlpg(va); } static void pmap_invalidate_range_int(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } static void pmap_invalidate_all_int(pmap_t pmap) { if (pmap == kernel_pmap) invltlb(); } static void __CONCAT(PMTYPE, invalidate_cache)(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) pmap_update_pde_invalidate(va, newpde); } #endif /* !SMP */ static void __CONCAT(PMTYPE, invalidate_page)(pmap_t pmap, vm_offset_t va) { pmap_invalidate_page_int(pmap, va); } static void __CONCAT(PMTYPE, invalidate_range)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pmap_invalidate_range_int(pmap, sva, eva); } static void __CONCAT(PMTYPE, invalidate_all)(pmap_t pmap) { pmap_invalidate_all_int(pmap); } static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { /* * When the PDE has PG_PROMOTED set, the 2- or 4MB page mapping was * created by a promotion that did not invalidate the 512 or 1024 4KB * page mappings that might exist in the TLB. Consequently, at this * point, the TLB may hold both 4KB and 2- or 4MB page mappings for * the address range [va, va + NBPDR). Therefore, the entire range * must be invalidated here. In contrast, when PG_PROMOTED is clear, * the TLB will not hold any 4KB page mappings for the address range * [va, va + NBPDR), and so a single INVLPG suffices to invalidate the * 2- or 4MB page mapping from the TLB. */ if ((pde & PG_PROMOTED) != 0) pmap_invalidate_range_int(pmap, va, va + NBPDR - 1); else pmap_invalidate_page_int(pmap, va); } /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap); } /* * If the given pmap is not the current or kernel pmap, the returned pte must * be released by passing it to pmap_pte_release(). */ static pt_entry_t * __CONCAT(PMTYPE, pte)(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); mtx_lock(&PMAP2mutex); newpf = *pde & PG_FRAME; if ((*PMAP2 & PG_FRAME) != newpf) { *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page_int(kernel_pmap, (vm_offset_t)PADDR2); } return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); } return (NULL); } /* * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte * being NULL. */ static __inline void pmap_pte_release(pt_entry_t *pte) { if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) mtx_unlock(&PMAP2mutex); } /* * NB: The sequence of updating a page table followed by accesses to the * corresponding pages is subject to the situation described in the "AMD64 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23, * "7.3.1 Special Coherency Considerations". Therefore, issuing the INVLPG * right after modifying the PTE bits is crucial. */ static __inline void invlcaddr(void *caddr) { invlpg((u_int)caddr); } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. * * If the given pmap is not the current pmap, pvh_global_lock * must be held and curthread pinned to a CPU. */ static pt_entry_t * pmap_pte_quick(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); newpf = *pde & PG_FRAME; if ((*PMAP1 & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } static pt_entry_t * pmap_pte_quick3(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); newpf = *pde & PG_FRAME; if ((*PMAP3 & PG_FRAME) != newpf) { *PMAP3 = newpf | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP3cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR3); PMAP1changed++; } else #ifdef SMP if (PMAP3cpu != PCPU_GET(cpuid)) { PMAP3cpu = PCPU_GET(cpuid); invlcaddr(PADDR3); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR3 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } static pt_entry_t pmap_pte_ufast(pmap_t pmap, vm_offset_t va, pd_entry_t pde) { pt_entry_t *eh_ptep, pte, *ptep; PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde &= PG_FRAME; critical_enter(); eh_ptep = (pt_entry_t *)PCPU_GET(pmap_eh_ptep); if ((*eh_ptep & PG_FRAME) != pde) { *eh_ptep = pde | PG_RW | PG_V | PG_A | PG_M; invlcaddr((void *)PCPU_GET(pmap_eh_va)); } ptep = (pt_entry_t *)PCPU_GET(pmap_eh_va) + (i386_btop(va) & (NPTEPG - 1)); pte = *ptep; critical_exit(); return (pte); } /* * Extract from the kernel page table the physical address that is mapped by * the given virtual address "va". * * This function may be used before pmap_bootstrap() is called. */ static vm_paddr_t __CONCAT(PMTYPE, kextract)(vm_offset_t va) { vm_paddr_t pa; if ((pa = pte_load(&PTD[va >> PDRSHIFT])) & PG_PS) { pa = (pa & PG_PS_FRAME) | (va & PDRMASK); } else { /* * Beware of a concurrent promotion that changes the PDE at * this point! For example, vtopte() must not be used to * access the PTE because it would use the new PDE. It is, * however, safe to use the old PDE because the page table * page is preserved by the promotion. */ pa = KPTmap[i386_btop(va)]; pa = (pa & PG_FRAME) | (va & PAGE_MASK); } return (pa); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ static vm_paddr_t __CONCAT(PMTYPE, extract)(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; pt_entry_t pte; pd_entry_t pde; rtval = 0; PMAP_LOCK(pmap); pde = pmap->pm_pdir[va >> PDRSHIFT]; if (pde != 0) { if ((pde & PG_PS) != 0) rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); else { pte = pmap_pte_ufast(pmap, va, pde); rtval = (pte & PG_FRAME) | (va & PAGE_MASK); } } PMAP_UNLOCK(pmap); return (rtval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ static vm_page_t __CONCAT(PMTYPE, extract_and_hold)(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde; pt_entry_t pte; vm_page_t m; vm_paddr_t pa; pa = 0; m = NULL; PMAP_LOCK(pmap); retry: pde = *pmap_pde(pmap, va); if (pde != 0) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { if (vm_page_pa_tryrelock(pmap, (pde & PG_PS_FRAME) | (va & PDRMASK), &pa)) goto retry; m = PHYS_TO_VM_PAGE(pa); } } else { pte = pmap_pte_ufast(pmap, va, pde); if (pte != 0 && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, &pa)) goto retry; m = PHYS_TO_VM_PAGE(pa); } } if (m != NULL) vm_page_wire(m); } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ static void __CONCAT(PMTYPE, kenter)(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pmap_cache_bits(kernel_pmap, mode, 0)); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ static void __CONCAT(PMTYPE, kremove)(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ static vm_offset_t __CONCAT(PMTYPE, map)(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; vm_paddr_t superpage_offset; pd_entry_t newpde; va = *virt; /* * Does the physical address range's size and alignment permit at * least one superpage mapping to be created? */ superpage_offset = start & PDRMASK; if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) { /* * Increase the starting virtual address so that its alignment * does not preclude the use of superpage mappings. */ if ((va & PDRMASK) < superpage_offset) va = (va & ~PDRMASK) + superpage_offset; else if ((va & PDRMASK) > superpage_offset) va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset; } sva = va; while (start < end) { if ((start & PDRMASK) == 0 && end - start >= NBPDR && pseflag != 0) { KASSERT((va & PDRMASK) == 0, ("pmap_map: misaligned va %#x", va)); newpde = start | PG_PS | PG_RW | PG_V; pmap_kenter_pde(va, newpde); va += NBPDR; start += NBPDR; } else { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } } pmap_invalidate_range_int(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ static void __CONCAT(PMTYPE, qenter)(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { oldpte |= *pte; #ifdef PMAP_PAE_COMP pte_store(pte, pa | pg_nx | PG_RW | PG_V); #else pte_store(pte, pa | PG_RW | PG_V); #endif } pte++; } if (__predict_false((oldpte & PG_V) != 0)) pmap_invalidate_range_int(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ static void __CONCAT(PMTYPE, qremove)(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range_int(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "mpte" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT)); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; /* * There is not need to invalidate the recursive mapping since * we never instantiate such mapping for the usermode pmaps, * and never remove page table pages from the kernel pmap. * Put page on a list so that it is released since all TLB * shootdown is done. */ MPASS(pmap != kernel_pmap); pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free) { pd_entry_t ptepde; vm_page_t mpte; if (pmap == kernel_pmap) return (0); ptepde = *pmap_pde(pmap, va); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, mpte, free)); } /* * Initialize the pmap for the swapper process. */ static void __CONCAT(PMTYPE, pinit0)(pmap_t pmap) { PMAP_LOCK_INIT(pmap); pmap->pm_pdir = IdlePTD; #ifdef PMAP_PAE_COMP pmap->pm_pdpt = IdlePDPT; #endif pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap_activate_boot(pmap); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ static int __CONCAT(PMTYPE, pinit)(pmap_t pmap) { vm_page_t m; int i; /* * No need to allocate page table space yet but we do need a valid * page directory table. */ if (pmap->pm_pdir == NULL) { pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); if (pmap->pm_pdir == NULL) return (0); #ifdef PMAP_PAE_COMP pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); KASSERT(((vm_offset_t)pmap->pm_pdpt & ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, ("pmap_pinit: pdpt misaligned")); KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif pmap->pm_root.rt_root = 0; } KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_pinit: pmap has reserved page table page(s)")); /* * allocate the page directory page(s) */ for (i = 0; i < NPGPTD; i++) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); pmap->pm_ptdpg[i] = m; #ifdef PMAP_PAE_COMP pmap->pm_pdpt[i] = VM_PAGE_TO_PHYS(m) | PG_V; #endif } pmap_qenter((vm_offset_t)pmap->pm_pdir, pmap->pm_ptdpg, NPGPTD); #ifdef PMAP_PAE_COMP if ((cpu_feature & CPUID_PAT) == 0) { pmap_invalidate_cache_range( trunc_page((vm_offset_t)pmap->pm_pdpt), round_page((vm_offset_t)pmap->pm_pdpt + NPGPTD * sizeof(pdpt_entry_t))); } #endif for (i = 0; i < NPGPTD; i++) if ((pmap->pm_ptdpg[i]->flags & PG_ZERO) == 0) pagezero(pmap->pm_pdir + (i * NPDEPG)); /* Install the trampoline mapping. */ pmap->pm_pdir[TRPTDI] = PTD[TRPTDI]; CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) { vm_paddr_t ptepa; vm_page_t m; /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); vm_wait(NULL); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); return (m); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) { u_int ptepindex; pd_entry_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; retry: /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); ptepa = pmap->pm_pdir[ptepindex]; } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ static void __CONCAT(PMTYPE, release)(pmap_t pmap) { vm_page_t m; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); for (i = 0; i < NPGPTD; i++) { m = pmap->pm_ptdpg[i]; #ifdef PMAP_PAE_COMP KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), ("pmap_release: got wrong ptd page")); #endif vm_page_unwire_noq(m); vm_page_free(m); } } /* * grow the number of kernel page table entries, if needed */ static void __CONCAT(PMTYPE, growkernel)(vm_offset_t addr) { vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, NBPDR); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(KPTD, kernel_vm_end) = newpdir; pmap_kenter_pde(kernel_vm_end, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static const uint32_t pc_freemask[_NPCM] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; #ifdef PV_STATS extern int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; extern long pv_entry_frees, pv_entry_allocs; extern int pv_entry_spare; #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; struct md_page *pvh; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint32_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || SLIST_EMPTY(&free))) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all_int(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfl(inuse); pv = &pc->pc_pventry[field * 32 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = __CONCAT(PMTYPE, pte)(pmap, va); tpte = *pte; if ((tpte & PG_W) == 0) tpte = pte_load_clear(pte); pmap_pte_release(pte); if ((tpte & PG_W) != 0) continue; KASSERT(tpte != 0, ("pmap_pv_reclaim: pmap %p va %x zero pte", pmap, va)); if ((tpte & PG_G) != 0) pmap_invalidate_page_int(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, &free); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { pmap_invalidate_all_int(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; } vm_page_free_pages_toq(&free, true); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire_noq(m); vm_page_free(m); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entries tunable.\n"); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfl(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* * Access to the ptelist "pv_vafree" is synchronized by the pvh * global lock. If "pv_vafree" is currently non-empty, it will * remain non-empty until pmap_ptelist_alloc() completes. */ if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } return (pv); } static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 4mpage aligned")); /* * Transfer the 4mpage's pv entry for this mapping to the first * page's pv list. */ pvh = pa_to_pvh(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); /* Instantiate the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); va += PAGE_SIZE; pmap_insert_entry(pmap, va, m); } while (va < va_last); } #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 4mpage aligned")); /* * Transfer the first page's pv entry for this mapping to the * 4mpage's pv list. Aside from avoiding the cost of a call * to get_pv_entry(), a transfer avoids the possibility that * get_pv_entry() calls pmap_collect() and that pmap_collect() * removes one of the mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { struct md_page *pvh; rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Create the pv entries for each of the pages within a superpage. */ static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags) { struct md_page *pvh; pv_entry_t pv; bool noreclaim; rw_assert(&pvh_global_lock, RA_WLOCKED); noreclaim = (flags & PMAP_ENTER_NORECLAIM) != 0; if ((noreclaim && pv_entry_count >= pv_entry_high_water) || (pv = get_pv_entry(pmap, noreclaim)) == NULL) return (false); pv->pv_va = va; pvh = pa_to_pvh(pde & PG_PS_FRAME); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); return (true); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2- or 4MB page mapping. If demotion fails, the * 2- or 4MB page mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; vm_paddr_t mptepa; vm_page_t mpte; struct spglist free; vm_offset_t sva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * Invalidate the 2- or 4MB page mapping and return * "failure" if the mapping was never accessed or the * allocation of the new page table page fails. */ if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); sva = trunc_4mpage(va); pmap_remove_pde(pmap, pde, sva, &free); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, sva, oldpde); vm_page_free_pages_toq(&free, true); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" " in pmap %p", va, pmap); return (FALSE); } if (pmap != kernel_pmap) { mpte->wire_count = NPTEPG; pmap->pm_stats.resident_count++; } } mptepa = VM_PAGE_TO_PHYS(mpte); /* * If the page mapping is in the kernel's address space, then the * KPTmap can provide access to the page table page. Otherwise, * temporarily map the page table page (mpte) into the kernel's * address space at either PADDR1 or PADDR2. */ if (pmap == kernel_pmap) firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { if ((*PMAP1 & PG_FRAME) != mptepa) { *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; firstpte = PADDR1; } else { mtx_lock(&PMAP2mutex); if ((*PMAP2 & PG_FRAME) != mptepa) { *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page_int(kernel_pmap, (vm_offset_t)PADDR2); } firstpte = PADDR2; } newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & PG_A) != 0, ("pmap_demote_pde: oldpde is missing PG_A")); KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; if ((newpte & PG_PDE_PAT) != 0) newpte ^= PG_PDE_PAT | PG_PTE_PAT; /* * If the page table page is not leftover from an earlier promotion, * initialize it. */ if (mpte->valid == 0) pmap_fill_ptp(firstpte, newpte); KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (firstpte == PADDR2) mtx_unlock(&PMAP2mutex); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page_int(pmap, (vm_offset_t)vtopte(va)); /* * Demote the pv entry. This depends on the earlier demotion * of the mapping. Specifically, the (re)creation of a per- * page pv entry might trigger the execution of pmap_collect(), * which might reclaim a newly (re)created per-page pv entry * and destroy the associated mapping. In order to destroy * the mapping, the PDE must have already changed from mapping * the 2mpage to referencing the page table page. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); pmap_pde_demotions++; CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" " in pmap %p", va, pmap); return (TRUE); } /* * Removes a 2- or 4MB page mapping from the kernel pmap. */ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (mpte->valid != 0) pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); /* * Remove the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pmap_kenter_pde(va, newpde); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page_int(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 4mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; /* * Machines that don't support invlpg, also don't support * PG_G. */ if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; if (oldpde & PG_MANAGED) { pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pde: pte page not promoted")); pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, struct spglist *free) { pt_entry_t oldpte; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); KASSERT(oldpte != 0, ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page_int(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt_entry_t *pte; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) return; pmap_remove_pte(pmap, pte, va, free); pmap_invalidate_page_int(pmap, va); } /* * Removes the specified range of addresses from the page table page. */ static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, struct spglist *free) { pt_entry_t *pte; bool anyvalid; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); anyvalid = false; for (pte = pmap_pte_quick(pmap, sva); sva != eva; pte++, sva += PAGE_SIZE) { if (*pte == 0) continue; /* * The TLB entry for a PG_G mapping is invalidated by * pmap_remove_pte(). */ if ((*pte & PG_G) == 0) anyvalid = true; if (pmap_remove_pte(pmap, pte, sva, free)) break; } return (anyvalid); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ static void __CONCAT(PMTYPE, remove)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr; struct spglist free; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE == eva) && ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva, &free); goto out; } for (; sva < eva; sva = pdnxt) { u_int pdirindex; /* * Calculate index for next page table. */ pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; if (pmap->pm_stats.resident_count == 0) break; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, &pmap->pm_pdir[pdirindex], sva, &free); continue; } else if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* The large page mapping was destroyed. */ continue; } } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (pdnxt > eva) pdnxt = eva; if (pmap_remove_ptes(pmap, sva, pdnxt, &free)) anyvalid = 1; } out: sched_unpin(); if (anyvalid) pmap_invalidate_all_int(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ static void __CONCAT(PMTYPE, remove_all)(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; pd_entry_t *pde; vm_offset_t va; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); tpte = pte_load_clear(pte); KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", pmap, pv->pv_va)); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, &free); pmap_invalidate_page_int(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); vm_page_free_pages_toq(&free, true); } /* * pmap_protect_pde: do the things to protect a 4mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_page_t m, mt; boolean_t anychanged; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 4mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if ((prot & VM_PROT_WRITE) == 0) { if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } newpde &= ~(PG_RW | PG_M); } #ifdef PMAP_PAE_COMP if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) newpde |= pg_nx; #endif if (newpde != oldpde) { /* * As an optimization to future operations on this PDE, clear * PG_PROMOTED. The impending invalidation will remove any * lingering 4KB page mappings from the TLB. */ if (!pde_cmpset(pde, oldpde, newpde & ~PG_PROMOTED)) goto retry; if ((oldpde & PG_G) != 0) pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ static void __CONCAT(PMTYPE, protect)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; boolean_t anychanged, pv_lists_locked; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } #ifdef PMAP_PAE_COMP if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) return; #else if (prot & VM_PROT_WRITE) return; #endif if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pt_entry_t obits, pbits; u_int pdirindex; pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, &pmap->pm_pdir[pdirindex], sva, prot)) anychanged = TRUE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all_int( pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* * The large page mapping was * destroyed. */ continue; } } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { vm_page_t m; retry: /* * Regardless of whether a pte is 32 or 64 bits in * size, PG_RW, PG_A, and PG_M are among the least * significant 32 bits. */ obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } #ifdef PMAP_PAE_COMP if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) pbits |= pg_nx; #endif if (pbits != obits) { #ifdef PMAP_PAE_COMP if (!atomic_cmpset_64(pte, obits, pbits)) goto retry; #else if (!atomic_cmpset_int((u_int *)pte, obits, pbits)) goto retry; #endif if (obits & PG_G) pmap_invalidate_page_int(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all_int(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } #if VM_NRESERVLEVEL > 0 /* * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are * within a single page table page (PTP) to a single 2- or 4MB page mapping. * For promotion to occur, two conditions must be met: (1) the 4KB page * mappings must map aligned, contiguous physical memory and (2) the 4KB page * mappings must have identical characteristics. * * Managed (PG_MANAGED) mappings within the kernel address space are not * promoted. The reason is that kernel PDEs are replicated in each pmap but * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel * pmap. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; vm_offset_t oldpteva; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2- or 4MB page. */ firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; oldpteva = (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK); CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" " in pmap %p", oldpteva, pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == va >> PDRSHIFT, ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte, true)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); /* * Propagate the PAT index to its proper position. */ if ((newpde & PG_PTE_PAT) != 0) newpde ^= PG_PDE_PAT | PG_PTE_PAT; /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, PG_PROMOTED | PG_PS | newpde); else pde_store(pde, PG_PROMOTED | PG_PS | newpde); pmap_pde_promotions++; CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" " in pmap %p", va, pmap); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ static int __CONCAT(PMTYPE, enter)(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { pd_entry_t *pde; pt_entry_t *pte; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; int rv; va = trunc_page(va); KASSERT((pmap == kernel_pmap && va < VM_MAX_KERNEL_ADDRESS) || (pmap != kernel_pmap && va < VM_MAXUSER_ADDRESS), ("pmap_enter: toobig k%d %#x", pmap == kernel_pmap, va)); KASSERT(va < PMAP_TRM_MIN_ADDRESS, ("pmap_enter: invalid to pmap_enter into trampoline (va: 0x%x)", va)); KASSERT(pmap != kernel_pmap || (m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); KASSERT((flags & PMAP_ENTER_RESERVED) == 0, ("pmap_enter: flags %u has reserved bits set", flags)); pa = VM_PAGE_TO_PHYS(m); newpte = (pt_entry_t)(pa | PG_A | PG_V); if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if ((prot & VM_PROT_WRITE) != 0) newpte |= PG_RW; KASSERT((newpte & (PG_M | PG_RW)) != PG_M, ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't")); #ifdef PMAP_PAE_COMP if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) newpte |= pg_nx; #endif if ((flags & PMAP_ENTER_WIRED) != 0) newpte |= PG_W; if (pmap != kernel_pmap) newpte |= PG_U; newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0); if ((m->oflags & VPO_UNMANAGED) == 0) newpte |= PG_MANAGED; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m); goto out; } pde = pmap_pde(pmap, va); if (pmap != kernel_pmap) { /* * va is for UVA. * In the case that a page table page is not resident, * we are creating it here. pmap_allocpte() handles * demotion. */ mpte = pmap_allocpte(pmap, va, flags); if (mpte == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte failed with sleep allowed")); rv = KERN_RESOURCE_SHORTAGE; goto out; } } else { /* * va is for KVA, so pmap_demote_pde() will never fail * to install a page table page. PG_V is also * asserted by pmap_demote_pde(). */ mpte = NULL; KASSERT(pde != NULL && (*pde & PG_V) != 0, ("KVA %#x invalid pde pdir %#jx", va, (uintmax_t)pmap->pm_pdir[PTDPTDI])); if ((*pde & PG_PS) != 0) pmap_demote_pde(pmap, pde, va); } pte = pmap_pte_quick(pmap, va); /* * Page Directory table entry is not valid, which should not * happen. We should have either allocated the page table * page or demoted the existing mapping above. */ if (pte == NULL) { panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", (uintmax_t)pmap->pm_pdir[PTDPTDI], va); } origpte = *pte; pv = NULL; /* * Is the specified virtual address already mapped? */ if ((origpte & PG_V) != 0) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0) pmap->pm_stats.wired_count++; else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%x", va)); } /* * Has the physical page changed? */ opa = origpte & PG_FRAME; if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((origpte & PG_MANAGED) != 0 && (newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0) goto unchanged; goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. This ensures that all threads sharing the * pmap keep a consistent view of the mapping, which is * necessary for the correct handling of COW faults. It * also permits reuse of the old mapping's PV entry, * avoiding an allocation. * * For consistency, handle unmanaged mappings the same way. */ origpte = pte_load_clear(pte); KASSERT((origpte & PG_FRAME) == opa, ("pmap_enter: unexpected pa update for %#x", va)); if ((origpte & PG_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(om); if ((origpte & PG_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); pv = pmap_pvh_remove(&om->md, pmap, va); KASSERT(pv != NULL, ("pmap_enter: no PV entry for %#x", va)); if ((newpte & PG_MANAGED) == 0) free_pv_entry(pmap, pv); if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); } if ((origpte & PG_A) != 0) pmap_invalidate_page_int(pmap, va); origpte = 0; } else { /* * Increment the counters. */ if ((newpte & PG_W) != 0) pmap->pm_stats.wired_count++; pmap->pm_stats.resident_count++; } /* * Enter on the PV list if part of our managed memory. */ if ((newpte & PG_MANAGED) != 0) { if (pv == NULL) { pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; } TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); if ((newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } /* * Update the PTE. */ if ((origpte & PG_V) != 0) { validate: origpte = pte_load_store(pte, newpte); KASSERT((origpte & PG_FRAME) == pa, ("pmap_enter: unexpected pa update for %#x", va)); if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(m); /* * Although the PTE may still have PG_RW set, TLB * invalidation may nonetheless be required because * the PTE no longer has PG_M set. */ } #ifdef PMAP_PAE_COMP else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) { /* * This PTE change does not require TLB invalidation. */ goto unchanged; } #endif if ((origpte & PG_A) != 0) pmap_invalidate_page_int(pmap, va); } else pte_store_zero(pte, newpte); unchanged: #if VM_NRESERVLEVEL > 0 /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->wire_count == NPTEPG) && pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va); #endif rv = KERN_SUCCESS; out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2 or 4 MB page mapping. Returns * true if successful. Returns false if (1) a mapping already exists at the * specified virtual address or (2) a PV entry cannot be allocated without * reclaiming another PV entry. */ static bool pmap_enter_4mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pd_entry_t newpde; PMAP_LOCK_ASSERT(pmap, MA_OWNED); newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(pmap, m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) newpde |= PG_MANAGED; #ifdef PMAP_PAE_COMP if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) newpde |= pg_nx; #endif if (pmap != kernel_pmap) newpde |= PG_U; return (pmap_enter_pde(pmap, va, newpde, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL) == KERN_SUCCESS); } /* * Tries to create the specified 2 or 4 MB page mapping. Returns KERN_SUCCESS * if the mapping was created, and either KERN_FAILURE or * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if * PMAP_ENTER_NOREPLACE was specified and a mapping already exists at the * specified virtual address. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, vm_page_t m) { struct spglist free; pd_entry_t oldpde, *pde; vm_page_t mt; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((newpde & (PG_M | PG_RW)) != PG_RW, ("pmap_enter_pde: newpde is missing PG_M")); KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0, ("pmap_enter_pde: cannot create wired user mapping")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde = pmap_pde(pmap, va); oldpde = *pde; if ((oldpde & PG_V) != 0) { if ((flags & PMAP_ENTER_NOREPLACE) != 0) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_FAILURE); } /* Break the existing mapping(s). */ SLIST_INIT(&free); if ((oldpde & PG_PS) != 0) { /* * If the PDE resulted from a promotion, then a * reserved PT page could be freed. */ (void)pmap_remove_pde(pmap, pde, va, &free); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, va, oldpde); } else { if (pmap_remove_ptes(pmap, va, va + NBPDR, &free)) pmap_invalidate_all_int(pmap); } vm_page_free_pages_toq(&free, true); if (pmap == kernel_pmap) { /* * Both pmap_remove_pde() and pmap_remove_ptes() will * leave the kernel page table page zero filled. */ mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_pde: trie insert failed"); } else KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p", pde)); } if ((newpde & PG_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, newpde, flags)) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((newpde & PG_RW) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } } /* * Increment counters. */ if ((newpde & PG_W) != 0) pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE; pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; /* * Map the superpage. (This is not a promoted mapping; there will not * be any lingering 4KB page mappings in the TLB.) */ pde_store(pde, newpde); pmap_pde_mappings++; CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ static void __CONCAT(PMTYPE, enter_object)(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pg_ps_enabled && pmap_enter_4mpage(pmap, va, m, prot)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ static void __CONCAT(PMTYPE, enter_quick)(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { pt_entry_t newpte, *pte; struct spglist free; KASSERT(pmap != kernel_pmap || va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (pmap != kernel_pmap) { u_int ptepindex; pd_entry_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); mpte->wire_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, PMAP_ENTER_NOSLEEP); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } sched_pin(); pte = pmap_pte_quick(pmap, va); if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } sched_unpin(); return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, mpte, &free)) { pmap_invalidate_page_int(pmap, va); vm_page_free_pages_toq(&free, true); } mpte = NULL; } sched_unpin(); return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; newpte = VM_PAGE_TO_PHYS(m) | PG_V | pmap_cache_bits(pmap, m->md.pat_mode, 0); if ((m->oflags & VPO_UNMANAGED) == 0) newpte |= PG_MANAGED; #ifdef PMAP_PAE_COMP if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec) newpte |= pg_nx; #endif if (pmap != kernel_pmap) newpte |= PG_U; pte_store_zero(pte, newpte); sched_unpin(); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ static void * __CONCAT(PMTYPE, kenter_temporary)(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ static void __CONCAT(PMTYPE, object_init_pt)(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; vm_paddr_t pa, ptepa; vm_page_t p; int pat_mode; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if (pg_ps_enabled && (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2/4MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and * "size" is a multiple of 2/4M, adding the PAT setting to * "pa" will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pde = pmap_pde(pmap, addr); if (*pde == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } /* Else continue on if the PDE is already valid. */ addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ static void __CONCAT(PMTYPE, unwire)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t *pde; pt_entry_t *pte; boolean_t pv_lists_locked; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * Regardless of whether a pde (or pte) is 32 * or 64 bits in size, PG_W is among the least * significant 32 bits. */ atomic_clear_int((u_int *)pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); /* Repeat sva. */ goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. * * PG_W is among the least significant 32 bits. */ atomic_clear_int((u_int *)pte, PG_W); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. Since * current pmap is always the kernel pmap when executing in * kernel, and we do not copy from the kernel pmap to a user * pmap, this optimization is not usable in 4/4G full split i386 * world. */ static void __CONCAT(PMTYPE, copy)(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct spglist free; pt_entry_t *src_pte, *dst_pte, ptetemp; pd_entry_t srcptepaddr; vm_page_t dstmpte, srcmpte; vm_offset_t addr, end_addr, pdnxt; u_int ptepindex; if (dst_addr != src_addr) return; end_addr = src_addr + len; rw_wlock(&pvh_global_lock); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } sched_pin(); for (addr = src_addr; addr < end_addr; addr = pdnxt) { KASSERT(addr < PMAP_TRM_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy the trampoline")); pdnxt = (addr + NBPDR) & ~PDRMASK; if (pdnxt < addr) pdnxt = end_addr; ptepindex = addr >> PDRSHIFT; srcptepaddr = src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; if (dst_pmap->pm_pdir[ptepindex] == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, PMAP_ENTER_NORECLAIM))) { dst_pmap->pm_pdir[ptepindex] = srcptepaddr & ~PG_W; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } continue; } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (pdnxt > end_addr) pdnxt = end_addr; src_pte = pmap_pte_quick3(src_pmap, addr); while (addr < pdnxt) { ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { dstmpte = pmap_allocpte(dst_pmap, addr, PMAP_ENTER_NOSLEEP); if (dstmpte == NULL) goto out; dst_pte = pmap_pte_quick(dst_pmap, addr); if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); dst_pmap->pm_stats.resident_count++; } else { SLIST_INIT(&free); if (pmap_unwire_ptp(dst_pmap, dstmpte, &free)) { pmap_invalidate_page_int( dst_pmap, addr); vm_page_free_pages_toq(&free, true); } goto out; } if (dstmpte->wire_count >= srcmpte->wire_count) break; } addr += PAGE_SIZE; src_pte++; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * Zero 1 page of virtual memory mapped from a hardware page by the caller. */ static __inline void pagezero(void *page) { #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) { if (cpu_feature & CPUID_SSE2) sse2_pagezero(page); else i686_pagezero(page); } else #endif bzero(page, PAGE_SIZE); } /* * Zero the specified hardware page. */ static void __CONCAT(PMTYPE, zero_page)(vm_page_t m) { pt_entry_t *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_zero_page: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); pagezero(pc->pc_cmap_addr2); *cmap_pte2 = 0; /* * Unpin the thread before releasing the lock. Otherwise the thread * could be rescheduled while still bound to the current CPU, only * to unpin itself immediately upon resuming execution. */ sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Zero an an area within a single hardware page. off and size must not * cover an area beyond a single hardware page. */ static void __CONCAT(PMTYPE, zero_page_area)(vm_page_t m, int off, int size) { pt_entry_t *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_zero_page_area: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); if (off == 0 && size == PAGE_SIZE) pagezero(pc->pc_cmap_addr2); else bzero(pc->pc_cmap_addr2 + off, size); *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Copy 1 specified hardware page to another. */ static void __CONCAT(PMTYPE, copy_page)(vm_page_t src, vm_page_t dst) { pt_entry_t *cmap_pte1, *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte1 = pc->pc_cmap_pte1; cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte1) panic("pmap_copy_page: CMAP1 busy"); if (*cmap_pte2) panic("pmap_copy_page: CMAP2 busy"); *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | pmap_cache_bits(kernel_pmap, src->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr1); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | pmap_cache_bits(kernel_pmap, dst->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE); *cmap_pte1 = 0; *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } static void __CONCAT(PMTYPE, copy_pages)(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { vm_page_t a_pg, b_pg; char *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; pt_entry_t *cmap_pte1, *cmap_pte2; struct pcpu *pc; int cnt; sched_pin(); pc = get_pcpu(); cmap_pte1 = pc->pc_cmap_pte1; cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte1 != 0) panic("pmap_copy_pages: CMAP1 busy"); if (*cmap_pte2 != 0) panic("pmap_copy_pages: CMAP2 busy"); while (xfersize > 0) { a_pg = ma[a_offset >> PAGE_SHIFT]; a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); b_pg = mb[b_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | pmap_cache_bits(kernel_pmap, a_pg->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr1); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | PG_M | pmap_cache_bits(kernel_pmap, b_pg->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); a_cp = pc->pc_cmap_addr1 + a_pg_offset; b_cp = pc->pc_cmap_addr2 + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } *cmap_pte1 = 0; *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ static boolean_t __CONCAT(PMTYPE, page_exists_quick)(pmap_t pmap, vm_page_t m) { struct md_page *pvh; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ static int __CONCAT(PMTYPE, page_wired_mappings)(vm_page_t m) { int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); count = pmap_pvh_wired_mappings(&m->md, count); if ((m->flags & PG_FICTITIOUS) == 0) { count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count); } rw_wunlock(&pvh_global_lock); return (count); } /* * pmap_pvh_wired_mappings: * * Return the updated number "count" of managed mappings that are wired. */ static int pmap_pvh_wired_mappings(struct md_page *pvh, int count) { pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } sched_unpin(); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 4mpage. Otherwise, returns FALSE. */ static boolean_t __CONCAT(PMTYPE, page_is_mapped)(vm_page_t m) { boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_wunlock(&pvh_global_lock); return (rv); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ static void __CONCAT(PMTYPE, remove_pages)(pmap_t pmap) { pt_entry_t *pte, tpte; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct spglist free; int field, idx; int32_t bit; uint32_t inuse, bitmask; int allfree; if (pmap != PCPU_GET(curpmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } SLIST_INIT(&free); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, pc->pc_pmap)); allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfl(inuse); bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pde(pmap, pv->pv_va); tpte = *pte; if ((tpte & PG_PS) == 0) { pte = pmap_pte_quick(pmap, pv->pv_va); tpte = *pte & ~PG_PTE_PAT; } if (tpte == 0) { printf( "TPTE at %p IS ZERO @ VA %08x\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((tpte & PG_PS) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; if ((tpte & PG_PS) != 0) { pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: pte page not promoted")); pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); } } else { pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_unuse_pt(pmap, pv->pv_va, &free); } } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } sched_unpin(); pmap_invalidate_all_int(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ static boolean_t __CONCAT(PMTYPE, is_modified)(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_is_modified_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were used to modify * physical memory. Otherwise, returns FALSE. Both page and 2mpage * mappings are supported. */ static boolean_t pmap_is_modified_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ static boolean_t __CONCAT(PMTYPE, is_prefaultable)(pmap_t pmap, vm_offset_t addr) { pd_entry_t pde; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = *pmap_pde(pmap, addr); if (pde != 0 && (pde & PG_PS) == 0) rv = pmap_pte_ufast(pmap, addr, pde) == 0; PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ static boolean_t __CONCAT(PMTYPE, is_referenced)(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); rw_wlock(&pvh_global_lock); rv = pmap_is_referenced_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were referenced and FALSE * otherwise. Both page and 4mpage mappings are supported. */ static boolean_t pmap_is_referenced_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * Clear the write and modified bits in each of the given page's mappings. */ static void __CONCAT(PMTYPE, remove_write)(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); retry: oldpte = *pte; if ((oldpte & PG_RW) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page_int(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ static int __CONCAT(PMTYPE, ts_referenced)(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; pd_entry_t *pde; pt_entry_t *pte; vm_paddr_t pa; int rtval = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); pa = VM_PAGE_TO_PHYS(m); pvh = pa_to_pvh(pa); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0 || (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "*pde" is mapping a 2/4MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((*pde & PG_A) != 0) { /* * Since this reference bit is shared by either 1024 * or 512 4KB pages, it should not be cleared every * time it is tested. Apply a simple "hash" function * on the physical page number, the virtual superpage * number, and the pmap address to select one 4KB page * out of the 1024 or 512 on which testing the * reference bit will result in clearing that bit. * This function is designed to avoid the selection of * the same 4KB page for every 2- or 4MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (*pde & PG_W) == 0) { atomic_clear_int((u_int *)pde, PG_A); pmap_invalidate_page_int(pmap, pv->pv_va); } rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); } if (rtval >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { atomic_clear_int((u_int *)pte, PG_A); pmap_invalidate_page_int(pmap, pv->pv_va); rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < PMAP_TS_REFERENCED_MAX); out: sched_unpin(); rw_wunlock(&pvh_global_lock); return (rtval); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ static void __CONCAT(PMTYPE, advise)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pd_entry_t oldpde, *pde; pt_entry_t *pte; vm_offset_t va, pdnxt; vm_page_t m; boolean_t anychanged, pv_lists_locked; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all_int(pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Since the underlying page * table page is fully populated, this removal never * frees a page table page. */ if ((oldpde & PG_W) == 0) { pte = pmap_pte_quick(pmap, sva); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, sva, NULL); anychanged = TRUE; } } if (pdnxt > eva) pdnxt = eva; va = pdnxt; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_int((u_int *)pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_int((u_int *)pte, PG_A); else goto maybe_invlrng; if ((*pte & PG_G) != 0) { if (va == pdnxt) va = sva; } else anychanged = TRUE; continue; maybe_invlrng: if (va != pdnxt) { pmap_invalidate_range_int(pmap, va, sva); va = pdnxt; } } if (va != pdnxt) pmap_invalidate_range_int(pmap, va, sva); } if (anychanged) pmap_invalidate_all_int(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ static void __CONCAT(PMTYPE, clear_modify)(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t oldpde, *pde; - pt_entry_t oldpte, *pte; + pt_entry_t *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); oldpde = *pde; - if ((oldpde & PG_RW) != 0) { - if (pmap_demote_pde(pmap, pde, va)) { - if ((oldpde & PG_W) == 0) { - /* - * Write protect the mapping to a - * single page so that a subsequent - * write access may repromote. - */ - va += VM_PAGE_TO_PHYS(m) - (oldpde & - PG_PS_FRAME); - pte = pmap_pte_quick(pmap, va); - oldpte = *pte; - if ((oldpte & PG_V) != 0) { - /* - * Regardless of whether a pte is 32 or 64 bits - * in size, PG_RW and PG_M are among the least - * significant 32 bits. - */ - while (!atomic_cmpset_int((u_int *)pte, - oldpte, - oldpte & ~(PG_M | PG_RW))) - oldpte = *pte; - vm_page_dirty(m); - pmap_invalidate_page_int(pmap, - va); - } - } - } + /* If oldpde has PG_RW set, then it also has PG_M set. */ + if ((oldpde & PG_RW) != 0 && + pmap_demote_pde(pmap, pde, va) && + (oldpde & PG_W) == 0) { + /* + * Write protect the mapping to a single page so that + * a subsequent write access may repromote. + */ + va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); + pte = pmap_pte_quick(pmap, va); + /* + * Regardless of whether a pte is 32 or 64 bits + * in size, PG_RW and PG_M are among the least + * significant 32 bits. + */ + atomic_clear_int((u_int *)pte, PG_M | PG_RW); + vm_page_dirty(m); + pmap_invalidate_page_int(pmap, va); } PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_M is among the least significant * 32 bits. */ atomic_clear_int((u_int *)pte, PG_M); pmap_invalidate_page_int(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(pt_entry_t *pte, int cache_bits) { u_int opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~PG_PTE_CACHE; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ static __inline void pmap_pde_attr(pd_entry_t *pde, int cache_bits) { u_int opde, npde; /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~PG_PDE_CACHE; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ static void * __CONCAT(PMTYPE, mapdev_attr)(vm_paddr_t pa, vm_size_t size, int mode) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = pa & PG_FRAME; if (pa < PMAP_MAP_LOW && pa + size <= PMAP_MAP_LOW) va = pa + PMAP_MAP_LOW; else if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && ppim->mode == mode) return ((void *)(ppim->va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range_int(kernel_pmap, va, va + tmpsize); pmap_invalidate_cache_range(va, va + size); return ((void *)(va + offset)); } static void __CONCAT(PMTYPE, unmapdev)(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; if (va >= PMAP_MAP_LOW && va <= KERNBASE && va + size <= KERNBASE) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) kva_free(va, size); } /* * Sets the memory attribute for the specified page. */ static void __CONCAT(PMTYPE, page_set_memattr)(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; if ((m->flags & PG_FICTITIOUS) != 0) return; /* * If "m" is a normal page, flush it from the cache. * See pmap_invalidate_cache_range(). * * First, try to find an existing mapping of the page by sf * buffer. sf_buf_invalidate_cache() modifies mapping and * flushes the cache. */ if (sf_buf_invalidate_cache(m)) return; /* * If page is not mapped by sf buffer, but CPU does not * support self snoop, map the page transient and do * invalidation. In the worst case, whole cache is flushed by * pmap_invalidate_cache_range(). */ if ((cpu_feature & CPUID_SS) == 0) pmap_flush_page(m); } static void __CONCAT(PMTYPE, flush_page)(vm_page_t m) { pt_entry_t *cmap_pte2; struct pcpu *pc; vm_offset_t sva, eva; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) { sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_flush_page: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); sva = (vm_offset_t)pc->pc_cmap_addr2; eva = sva + PAGE_SIZE; /* * Use mfence or sfence despite the ordering implied by * mtx_{un,}lock() because clflush on non-Intel CPUs * and clflushopt are not guaranteed to be ordered by * any other instruction. */ if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) { if (useclflushopt) clflushopt(sva); else clflush(sva); } if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } else pmap_invalidate_cache(); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the kernel map. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. */ static int __CONCAT(PMTYPE, change_attr)(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pd_entry_t *pde; pt_entry_t *pte; int cache_bits_pte, cache_bits_pde; boolean_t changed; base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses above the recursive map. */ if (base < VM_MIN_KERNEL_ADDRESS) return (EINVAL); cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1); cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down * 2/4MB pages into 4KB pages if required. */ PMAP_LOCK(kernel_pmap); for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } if (*pde & PG_PS) { /* * If the current 2/4MB page already has * the required memory type, then we need not * demote this page. Just increment tmpva to * the next 2/4MB page frame. */ if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_4mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2/4MB * page frame and there is at least 2/4MB left * within the range, then we need not break * down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { PMAP_UNLOCK(kernel_pmap); return (ENOMEM); } } pte = vtopte(tmpva); if (*pte == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } tmpva += PAGE_SIZE; } PMAP_UNLOCK(kernel_pmap); /* * Ok, all the pages exist, so run through them updating their * cache mode if required. */ for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde & PG_PS) { if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde); changed = TRUE; } tmpva = trunc_4mpage(tmpva) + NBPDR; } else { pte = vtopte(tmpva); if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte); changed = TRUE; } tmpva += PAGE_SIZE; } } /* * Flush CPU caches to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range_int(kernel_pmap, base, tmpva); pmap_invalidate_cache_range(base, tmpva); } return (0); } /* * perform the pmap work for mincore */ static int __CONCAT(PMTYPE, mincore)(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t pde; pt_entry_t pte; vm_paddr_t pa; int val; PMAP_LOCK(pmap); retry: pde = *pmap_pde(pmap, addr); if (pde != 0) { if ((pde & PG_PS) != 0) { pte = pde; /* Compute the physical address of the 4KB page. */ pa = ((pde & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_SUPER; } else { pte = pmap_pte_ufast(pmap, addr, pde); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } static void __CONCAT(PMTYPE, activate)(struct thread *td) { pmap_t pmap, oldpmap; u_int cpuid; u_int32_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); CPU_SET(cpuid, &pmap->pm_active); #endif #ifdef PMAP_PAE_COMP cr3 = vtophys(pmap->pm_pdpt); #else cr3 = vtophys(pmap->pm_pdir); #endif /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_cr3 = cr3; PCPU_SET(curpmap, pmap); critical_exit(); } static void __CONCAT(PMTYPE, activate_boot)(pmap_t pmap) { u_int cpuid; cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_SET(cpuid, &pmap->pm_active); #endif PCPU_SET(curpmap, pmap); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ static void __CONCAT(PMTYPE, align_superpage)(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } static vm_offset_t __CONCAT(PMTYPE, quick_enter_page)(vm_page_t m) { vm_offset_t qaddr; pt_entry_t *pte; critical_enter(); qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy %#jx", (uintmax_t)*pte)); *pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(kernel_pmap, pmap_page_get_memattr(m), 0); invlpg(qaddr); return (qaddr); } static void __CONCAT(PMTYPE, quick_remove_page)(vm_offset_t addr) { vm_offset_t qaddr; pt_entry_t *pte; qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use")); KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address")); *pte = 0; critical_exit(); } static vmem_t *pmap_trm_arena; static vmem_addr_t pmap_trm_arena_last = PMAP_TRM_MIN_ADDRESS; static int trm_guard = PAGE_SIZE; static int pmap_trm_import(void *unused __unused, vmem_size_t size, int flags, vmem_addr_t *addrp) { vm_page_t m; vmem_addr_t af, addr, prev_addr; pt_entry_t *trm_pte; prev_addr = atomic_load_long(&pmap_trm_arena_last); size = round_page(size) + trm_guard; for (;;) { if (prev_addr + size < prev_addr || prev_addr + size < size || prev_addr + size > PMAP_TRM_MAX_ADDRESS) return (ENOMEM); addr = prev_addr + size; if (atomic_fcmpset_int(&pmap_trm_arena_last, &prev_addr, addr)) break; } prev_addr += trm_guard; trm_pte = PTmap + atop(prev_addr); for (af = prev_addr; af < addr; af += PAGE_SIZE) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); pte_store(&trm_pte[atop(af - prev_addr)], VM_PAGE_TO_PHYS(m) | PG_M | PG_A | PG_RW | PG_V | pgeflag | pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE)); } *addrp = prev_addr; return (0); } void pmap_init_trm(void) { vm_page_t pd_m; TUNABLE_INT_FETCH("machdep.trm_guard", &trm_guard); if ((trm_guard & PAGE_MASK) != 0) trm_guard = 0; pmap_trm_arena = vmem_create("i386trampoline", 0, 0, 1, 0, M_WAITOK); vmem_set_import(pmap_trm_arena, pmap_trm_import, NULL, NULL, PAGE_SIZE); pd_m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK | VM_ALLOC_ZERO); if ((pd_m->flags & PG_ZERO) == 0) pmap_zero_page(pd_m); PTD[TRPTDI] = VM_PAGE_TO_PHYS(pd_m) | PG_M | PG_A | PG_RW | PG_V | pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, TRUE); } static void * __CONCAT(PMTYPE, trm_alloc)(size_t size, int flags) { vmem_addr_t res; int error; MPASS((flags & ~(M_WAITOK | M_NOWAIT | M_ZERO)) == 0); error = vmem_xalloc(pmap_trm_arena, roundup2(size, 4), sizeof(int), 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags | M_FIRSTFIT, &res); if (error != 0) return (NULL); if ((flags & M_ZERO) != 0) bzero((void *)res, size); return ((void *)res); } static void __CONCAT(PMTYPE, trm_free)(void *addr, size_t size) { vmem_free(pmap_trm_arena, (uintptr_t)addr, roundup2(size, 4)); } static void __CONCAT(PMTYPE, ksetrw)(vm_offset_t va) { *vtopte(va) |= PG_RW; } static void __CONCAT(PMTYPE, remap_lowptdi)(bool enable) { PTD[KPTDI] = enable ? PTD[LOWPTDI] : 0; invltlb_glob(); } static vm_offset_t __CONCAT(PMTYPE, get_map_low)(void) { return (PMAP_MAP_LOW); } static vm_offset_t __CONCAT(PMTYPE, get_vm_maxuser_address)(void) { return (VM_MAXUSER_ADDRESS); } static vm_paddr_t __CONCAT(PMTYPE, pg_frame)(vm_paddr_t pa) { return (pa & PG_FRAME); } static void __CONCAT(PMTYPE, sf_buf_map)(struct sf_buf *sf) { pt_entry_t opte, *ptep; /* * Update the sf_buf's virtual-to-physical mapping, flushing the * virtual address from the TLB. Since the reference count for * the sf_buf's old mapping was zero, that mapping is not * currently in use. Consequently, there is no need to exchange * the old and new PTEs atomically, even under PAE. */ ptep = vtopte(sf->kva); opte = *ptep; *ptep = VM_PAGE_TO_PHYS(sf->m) | PG_RW | PG_V | pmap_cache_bits(kernel_pmap, sf->m->md.pat_mode, 0); /* * Avoid unnecessary TLB invalidations: If the sf_buf's old * virtual-to-physical mapping was not used, then any processor * that has invalidated the sf_buf's virtual address from its TLB * since the last used mapping need not invalidate again. */ #ifdef SMP if ((opte & (PG_V | PG_A)) == (PG_V | PG_A)) CPU_ZERO(&sf->cpumask); #else if ((opte & (PG_V | PG_A)) == (PG_V | PG_A)) pmap_invalidate_page_int(kernel_pmap, sf->kva); #endif } static void __CONCAT(PMTYPE, cp_slow0_map)(vm_offset_t kaddr, int plen, vm_page_t *ma) { pt_entry_t *pte; int i; for (i = 0, pte = vtopte(kaddr); i < plen; i++, pte++) { *pte = PG_V | PG_RW | PG_A | PG_M | VM_PAGE_TO_PHYS(ma[i]) | pmap_cache_bits(kernel_pmap, pmap_page_get_memattr(ma[i]), FALSE); invlpg(kaddr + ptoa(i)); } } static u_int __CONCAT(PMTYPE, get_kcr3)(void) { #ifdef PMAP_PAE_COMP return ((u_int)IdlePDPT); #else return ((u_int)IdlePTD); #endif } static u_int __CONCAT(PMTYPE, get_cr3)(pmap_t pmap) { #ifdef PMAP_PAE_COMP return ((u_int)vtophys(pmap->pm_pdpt)); #else return ((u_int)vtophys(pmap->pm_pdir)); #endif } static caddr_t __CONCAT(PMTYPE, cmap3)(vm_paddr_t pa, u_int pte_bits) { pt_entry_t *pte; pte = CMAP3; *pte = pa | pte_bits; invltlb(); return (CADDR3); } static void __CONCAT(PMTYPE, basemem_setup)(u_int basemem) { pt_entry_t *pte; int i; /* * Map pages between basemem and ISA_HOLE_START, if any, r/w into * the vm86 page table so that vm86 can scribble on them using * the vm86 map too. XXX: why 2 ways for this and only 1 way for * page 0, at least as initialized here? */ pte = (pt_entry_t *)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; } struct bios16_pmap_handle { pt_entry_t *pte; pd_entry_t *ptd; pt_entry_t orig_ptd; }; static void * __CONCAT(PMTYPE, bios16_enter)(void) { struct bios16_pmap_handle *h; /* * no page table, so create one and install it. */ h = malloc(sizeof(struct bios16_pmap_handle), M_TEMP, M_WAITOK); h->pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK); h->ptd = IdlePTD; *h->pte = vm86phystk | PG_RW | PG_V; h->orig_ptd = *h->ptd; *h->ptd = vtophys(h->pte) | PG_RW | PG_V; pmap_invalidate_all_int(kernel_pmap); /* XXX insurance for now */ return (h); } static void __CONCAT(PMTYPE, bios16_leave)(void *arg) { struct bios16_pmap_handle *h; h = arg; *h->ptd = h->orig_ptd; /* remove page table */ /* * XXX only needs to be invlpg(0) but that doesn't work on the 386 */ pmap_invalidate_all_int(kernel_pmap); free(h->pte, M_TEMP); /* ... and free it */ } #define PMM(a) \ .pm_##a = __CONCAT(PMTYPE, a), struct pmap_methods __CONCAT(PMTYPE, methods) = { PMM(ksetrw) PMM(remap_lower) PMM(remap_lowptdi) PMM(align_superpage) PMM(quick_enter_page) PMM(quick_remove_page) PMM(trm_alloc) PMM(trm_free) PMM(get_map_low) PMM(get_vm_maxuser_address) PMM(kextract) PMM(pg_frame) PMM(sf_buf_map) PMM(cp_slow0_map) PMM(get_kcr3) PMM(get_cr3) PMM(cmap3) PMM(basemem_setup) PMM(set_nx) PMM(bios16_enter) PMM(bios16_leave) PMM(bootstrap) PMM(is_valid_memattr) PMM(cache_bits) PMM(ps_enabled) PMM(pinit0) PMM(pinit) PMM(activate) PMM(activate_boot) PMM(advise) PMM(clear_modify) PMM(change_attr) PMM(mincore) PMM(copy) PMM(copy_page) PMM(copy_pages) PMM(zero_page) PMM(zero_page_area) PMM(enter) PMM(enter_object) PMM(enter_quick) PMM(kenter_temporary) PMM(object_init_pt) PMM(unwire) PMM(page_exists_quick) PMM(page_wired_mappings) PMM(page_is_mapped) PMM(remove_pages) PMM(is_modified) PMM(is_prefaultable) PMM(is_referenced) PMM(remove_write) PMM(ts_referenced) PMM(mapdev_attr) PMM(unmapdev) PMM(page_set_memattr) PMM(extract) PMM(extract_and_hold) PMM(map) PMM(qenter) PMM(qremove) PMM(release) PMM(remove) PMM(protect) PMM(remove_all) PMM(init) PMM(init_pat) PMM(growkernel) PMM(invalidate_page) PMM(invalidate_range) PMM(invalidate_all) PMM(invalidate_cache) PMM(flush_page) PMM(kenter) PMM(kremove) }; Index: projects/nfsv42/sys/net/if_tap.h =================================================================== --- projects/nfsv42/sys/net/if_tap.h (revision 350367) +++ projects/nfsv42/sys/net/if_tap.h (revision 350368) @@ -1,74 +1,74 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 1999-2000 by Maksim Yevmenkin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * BASED ON: * ------------------------------------------------------------------------- * * Copyright (c) 1988, Julian Onions * Nottingham University 1987. */ /* * $FreeBSD$ * $Id: if_tap.h,v 0.7 2000/07/12 04:12:51 max Exp $ */ #ifndef _NET_IF_TAP_H_ #define _NET_IF_TAP_H_ #include /* maximum receive packet size (hard limit) */ #define TAPMRU 16384 #define tapinfo tuninfo /* * ioctl's for get/set debug; these are aliases of TUN* ioctls, see net/if_tun.h * for details. */ #define TAPSDEBUG TUNSDEBUG #define TAPGDEBUG TUNGDEBUG #define TAPSIFINFO TUNSIFINFO #define TAPGIFINFO TUNGIFINFO -#define TAPGIFNAME _IOR('t', 93, struct ifreq) +#define TAPGIFNAME TUNGIFNAME /* VMware ioctl's */ #define VMIO_SIOCSIFFLAGS _IOWINT('V', 0) #define VMIO_SIOCSKEEP _IO('V', 1) #define VMIO_SIOCSIFBR _IO('V', 2) #define VMIO_SIOCSLADRF _IO('V', 3) /* XXX -- unimplemented */ #define VMIO_SIOCSETMACADDR _IO('V', 4) /* XXX -- not used? */ #define VMIO_SIOCPORT _IO('V', 5) #define VMIO_SIOCBRIDGE _IO('V', 6) #define VMIO_SIOCNETIF _IO('V', 7) #endif /* !_NET_IF_TAP_H_ */ Index: projects/nfsv42/sys/net/if_tun.h =================================================================== --- projects/nfsv42/sys/net/if_tun.h (revision 350367) +++ projects/nfsv42/sys/net/if_tun.h (revision 350368) @@ -1,48 +1,49 @@ /* $NetBSD: if_tun.h,v 1.5 1994/06/29 06:36:27 cgd Exp $ */ /*- * Copyright (c) 1988, Julian Onions * Nottingham University 1987. * * This source may be freely distributed, however I would be interested * in any changes that are made. * * This driver takes packets off the IP i/f and hands them up to a * user process to have its wicked way with. This driver has it's * roots in a similar driver written by Phil Cockcroft (formerly) at * UCL. This driver is based much more on read/write/select mode of * operation though. * * $FreeBSD$ */ #ifndef _NET_IF_TUN_H_ #define _NET_IF_TUN_H_ /* Refer to if_tunvar.h for the softc stuff */ /* Maximum transmit packet size (default) */ #define TUNMTU 1500 /* Maximum receive packet size (hard limit) */ #define TUNMRU 65535 struct tuninfo { int baudrate; /* linespeed */ unsigned short mtu; /* maximum transmission unit */ u_char type; /* ethernet, tokenring, etc. */ u_char dummy; /* place holder */ }; /* ioctl's for get/set debug */ #define TUNSDEBUG _IOW('t', 90, int) #define TUNGDEBUG _IOR('t', 89, int) #define TUNSIFINFO _IOW('t', 91, struct tuninfo) #define TUNGIFINFO _IOR('t', 92, struct tuninfo) #define TUNSLMODE _IOW('t', 93, int) +#define TUNGIFNAME _IOR('t', 93, struct ifreq) #define TUNSIFMODE _IOW('t', 94, int) #define TUNSIFPID _IO('t', 95) #define TUNSIFHEAD _IOW('t', 96, int) #define TUNGIFHEAD _IOR('t', 97, int) #endif /* !_NET_IF_TUN_H_ */ Index: projects/nfsv42/sys/net/if_tuntap.c =================================================================== --- projects/nfsv42/sys/net/if_tuntap.c (revision 350367) +++ projects/nfsv42/sys/net/if_tuntap.c (revision 350368) @@ -1,1719 +1,1718 @@ /* $NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $ */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 1999-2000 by Maksim Yevmenkin * All rights reserved. * Copyright (c) 2019 Kyle Evans * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * BASED ON: * ------------------------------------------------------------------------- * * Copyright (c) 1988, Julian Onions * Nottingham University 1987. * * This source may be freely distributed, however I would be interested * in any changes that are made. * * This driver takes packets off the IP i/f and hands them up to a * user process to have its wicked way with. This driver has it's * roots in a similar driver written by Phil Cockcroft (formerly) at * UCL. This driver is based much more on read/write/poll mode of * operation though. * * $FreeBSD$ */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #endif #include #include #include #include #include #include struct tuntap_driver; /* * tun_list is protected by global tunmtx. Other mutable fields are * protected by tun->tun_mtx, or by their owning subsystem. tun_dev is * static for the duration of a tunnel interface. */ struct tuntap_softc { TAILQ_ENTRY(tuntap_softc) tun_list; struct cdev *tun_dev; u_short tun_flags; /* misc flags */ #define TUN_OPEN 0x0001 #define TUN_INITED 0x0002 #define TUN_RCOLL 0x0004 #define TUN_IASET 0x0008 #define TUN_DSTADDR 0x0010 #define TUN_LMODE 0x0020 #define TUN_RWAIT 0x0040 #define TUN_ASYNC 0x0080 #define TUN_IFHEAD 0x0100 #define TUN_DYING 0x0200 #define TUN_L2 0x0400 #define TUN_VMNET 0x0800 #define TUN_DRIVER_IDENT_MASK (TUN_L2 | TUN_VMNET) #define TUN_READY (TUN_OPEN | TUN_INITED) pid_t tun_pid; /* owning pid */ struct ifnet *tun_ifp; /* the interface */ struct sigio *tun_sigio; /* async I/O info */ struct tuntap_driver *tun_drv; /* appropriate driver */ struct selinfo tun_rsel; /* read select */ struct mtx tun_mtx; /* softc field mutex */ struct cv tun_cv; /* for ref'd dev destroy */ struct ether_addr tun_ether; /* remote address */ }; #define TUN2IFP(sc) ((sc)->tun_ifp) #define TUNDEBUG if (tundebug) if_printf #define TUN_LOCK(tp) mtx_lock(&(tp)->tun_mtx) #define TUN_UNLOCK(tp) mtx_unlock(&(tp)->tun_mtx) #define TUN_VMIO_FLAG_MASK 0x0fff /* * All mutable global variables in if_tun are locked using tunmtx, with * the exception of tundebug, which is used unlocked, and the drivers' *clones, * which are static after setup. */ static struct mtx tunmtx; static eventhandler_tag tag; static const char tunname[] = "tun"; static const char tapname[] = "tap"; static const char vmnetname[] = "vmnet"; static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface"); static int tundebug = 0; static int tundclone = 1; static int tap_allow_uopen = 0; /* allow user open() */ static int tapuponopen = 0; /* IFF_UP on open() */ static int tapdclone = 1; /* enable devfs cloning */ static TAILQ_HEAD(,tuntap_softc) tunhead = TAILQ_HEAD_INITIALIZER(tunhead); SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, ""); static struct sx tun_ioctl_sx; SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl"); SYSCTL_DECL(_net_link); /* tun */ static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0, "IP tunnel software network interface."); SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0, "Enable legacy devfs interface creation."); /* tap */ static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW, 0, "Ethernet tunnel software network interface"); SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tap_allow_uopen, 0, "Allow user to open /dev/tap (based on node permissions)"); SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0, "Bring interface up when /dev/tap is opened"); SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0, "Enable legacy devfs interface creation"); SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tundebug, 0, ""); static int tuntap_name2info(const char *name, int *unit, int *flags); static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev); static void tuncreate(struct cdev *dev, struct tuntap_driver *); static int tunifioctl(struct ifnet *, u_long, caddr_t); static void tuninit(struct ifnet *); static void tunifinit(void *xtp); static int tuntapmodevent(module_t, int, void *); static int tunoutput(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *ro); static void tunstart(struct ifnet *); static void tunstart_l2(struct ifnet *); static int tun_clone_match(struct if_clone *ifc, const char *name); static int tap_clone_match(struct if_clone *ifc, const char *name); static int vmnet_clone_match(struct if_clone *ifc, const char *name); static int tun_clone_create(struct if_clone *, char *, size_t, caddr_t); static int tun_clone_destroy(struct if_clone *, struct ifnet *); static d_open_t tunopen; static d_close_t tunclose; static d_read_t tunread; static d_write_t tunwrite; static d_ioctl_t tunioctl; static d_poll_t tunpoll; static d_kqfilter_t tunkqfilter; static int tunkqread(struct knote *, long); static int tunkqwrite(struct knote *, long); static void tunkqdetach(struct knote *); static struct filterops tun_read_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqread, }; static struct filterops tun_write_filterops = { .f_isfd = 1, .f_attach = NULL, .f_detach = tunkqdetach, .f_event = tunkqwrite, }; static struct tuntap_driver { struct cdevsw cdevsw; int ident_flags; struct unrhdr *unrhdr; struct clonedevs *clones; ifc_match_t *clone_match_fn; ifc_create_t *clone_create_fn; ifc_destroy_t *clone_destroy_fn; } tuntap_drivers[] = { { .ident_flags = 0, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_close = tunclose, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = tunname, }, .clone_match_fn = tun_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, { .ident_flags = TUN_L2, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_close = tunclose, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = tapname, }, .clone_match_fn = tap_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, { .ident_flags = TUN_L2 | TUN_VMNET, .cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDMINOR, .d_open = tunopen, .d_close = tunclose, .d_read = tunread, .d_write = tunwrite, .d_ioctl = tunioctl, .d_poll = tunpoll, .d_kqfilter = tunkqfilter, .d_name = vmnetname, }, .clone_match_fn = vmnet_clone_match, .clone_create_fn = tun_clone_create, .clone_destroy_fn = tun_clone_destroy, }, }; struct tuntap_driver_cloner { SLIST_ENTRY(tuntap_driver_cloner) link; struct tuntap_driver *drv; struct if_clone *cloner; }; VNET_DEFINE_STATIC(SLIST_HEAD(, tuntap_driver_cloner), tuntap_driver_cloners) = SLIST_HEAD_INITIALIZER(tuntap_driver_cloners); #define V_tuntap_driver_cloners VNET(tuntap_driver_cloners) /* * Sets unit and/or flags given the device name. Must be called with correct * vnet context. */ static int tuntap_name2info(const char *name, int *outunit, int *outflags) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; char *dname; int flags, unit; bool found; if (name == NULL) return (EINVAL); /* * Needed for dev_stdclone, but dev_stdclone will not modify, it just * wants to be able to pass back a char * through the second param. We * will always set that as NULL here, so we'll fake it. */ dname = __DECONST(char *, name); found = false; KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), ("tuntap_driver_cloners failed to initialize")); SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { KASSERT(drvc->drv != NULL, ("tuntap_driver_cloners entry not properly initialized")); drv = drvc->drv; if (strcmp(name, drv->cdevsw.d_name) == 0) { found = true; unit = -1; flags = drv->ident_flags; break; } if (dev_stdclone(dname, NULL, drv->cdevsw.d_name, &unit) == 1) { found = true; flags = drv->ident_flags; break; } } if (!found) return (ENXIO); if (outunit != NULL) *outunit = unit; if (outflags != NULL) *outflags = flags; return (0); } /* * Get driver information from a set of flags specified. Masks the identifying * part of the flags and compares it against all of the available * tuntap_drivers. Must be called with correct vnet context. */ static struct tuntap_driver * tuntap_driver_from_flags(int tun_flags) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners), ("tuntap_driver_cloners failed to initialize")); SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) { KASSERT(drvc->drv != NULL, ("tuntap_driver_cloners entry not properly initialized")); drv = drvc->drv; if ((tun_flags & TUN_DRIVER_IDENT_MASK) == drv->ident_flags) return (drv); } return (NULL); } static int tun_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & TUN_L2) == 0) return (1); } return (0); } static int tap_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & (TUN_L2 | TUN_VMNET)) == TUN_L2) return (1); } return (0); } static int vmnet_clone_match(struct if_clone *ifc, const char *name) { int tunflags; if (tuntap_name2info(name, NULL, &tunflags) == 0) { if ((tunflags & TUN_VMNET) != 0) return (1); } return (0); } static int tun_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { struct tuntap_driver *drv; struct cdev *dev; int err, i, tunflags, unit; tunflags = 0; /* The name here tells us exactly what we're creating */ err = tuntap_name2info(name, &unit, &tunflags); if (err != 0) return (err); drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) return (ENXIO); if (unit != -1) { /* If this unit number is still available that/s okay. */ if (alloc_unr_specific(drv->unrhdr, unit) == -1) return (EEXIST); } else { unit = alloc_unr(drv->unrhdr); } snprintf(name, IFNAMSIZ, "%s%d", drv->cdevsw.d_name, unit); /* find any existing device, or allocate new unit number */ i = clone_create(&drv->clones, &drv->cdevsw, &unit, &dev, 0); if (i) { /* No preexisting struct cdev *, create one */ dev = make_dev(&drv->cdevsw, unit, UID_UUCP, GID_DIALER, 0600, "%s%d", drv->cdevsw.d_name, unit); } tuncreate(dev, drv); return (0); } static void tunclone(void *arg, struct ucred *cred, char *name, int namelen, struct cdev **dev) { char devname[SPECNAMELEN + 1]; struct tuntap_driver *drv; int append_unit, i, u, tunflags; bool mayclone; if (*dev != NULL) return; tunflags = 0; CURVNET_SET(CRED_TO_VNET(cred)); if (tuntap_name2info(name, &u, &tunflags) != 0) goto out; /* Not recognized */ if (u != -1 && u > IF_MAXUNIT) goto out; /* Unit number too high */ mayclone = priv_check_cred(cred, PRIV_NET_IFCREATE) == 0; if ((tunflags & TUN_L2) != 0) { /* tap/vmnet allow user open with a sysctl */ mayclone = (mayclone || tap_allow_uopen) && tapdclone; } else { mayclone = mayclone && tundclone; } /* * If tun cloning is enabled, only the superuser can create an * interface. */ if (!mayclone) goto out; if (u == -1) append_unit = 1; else append_unit = 0; drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) goto out; /* find any existing device, or allocate new unit number */ i = clone_create(&drv->clones, &drv->cdevsw, &u, dev, 0); if (i) { if (append_unit) { namelen = snprintf(devname, sizeof(devname), "%s%d", name, u); name = devname; } /* No preexisting struct cdev *, create one */ *dev = make_dev_credf(MAKEDEV_REF, &drv->cdevsw, u, cred, UID_UUCP, GID_DIALER, 0600, "%s", name); } if_clone_create(name, namelen, NULL); out: CURVNET_RESTORE(); } static void tun_destroy(struct tuntap_softc *tp) { TUN_LOCK(tp); tp->tun_flags |= TUN_DYING; if ((tp->tun_flags & TUN_OPEN) != 0) cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx); else TUN_UNLOCK(tp); CURVNET_SET(TUN2IFP(tp)->if_vnet); destroy_dev(tp->tun_dev); seldrain(&tp->tun_rsel); knlist_clear(&tp->tun_rsel.si_note, 0); knlist_destroy(&tp->tun_rsel.si_note); if ((tp->tun_flags & TUN_L2) != 0) { ether_ifdetach(TUN2IFP(tp)); } else { bpfdetach(TUN2IFP(tp)); if_detach(TUN2IFP(tp)); } sx_xlock(&tun_ioctl_sx); TUN2IFP(tp)->if_softc = NULL; sx_xunlock(&tun_ioctl_sx); free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit); if_free(TUN2IFP(tp)); mtx_destroy(&tp->tun_mtx); cv_destroy(&tp->tun_cv); free(tp, M_TUN); CURVNET_RESTORE(); } static int tun_clone_destroy(struct if_clone *ifc __unused, struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; mtx_lock(&tunmtx); TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); return (0); } static void vnet_tun_init(const void *unused __unused) { struct tuntap_driver *drv; struct tuntap_driver_cloner *drvc; int i; for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; drvc = malloc(sizeof(*drvc), M_TUN, M_WAITOK | M_ZERO); drvc->drv = drv; drvc->cloner = if_clone_advanced(drv->cdevsw.d_name, 0, drv->clone_match_fn, drv->clone_create_fn, drv->clone_destroy_fn); SLIST_INSERT_HEAD(&V_tuntap_driver_cloners, drvc, link); }; } VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_init, NULL); static void vnet_tun_uninit(const void *unused __unused) { struct tuntap_driver_cloner *drvc; while (!SLIST_EMPTY(&V_tuntap_driver_cloners)) { drvc = SLIST_FIRST(&V_tuntap_driver_cloners); SLIST_REMOVE_HEAD(&V_tuntap_driver_cloners, link); if_clone_detach(drvc->cloner); free(drvc, M_TUN); } } VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, vnet_tun_uninit, NULL); static void tun_uninit(const void *unused __unused) { struct tuntap_driver *drv; struct tuntap_softc *tp; int i; EVENTHANDLER_DEREGISTER(dev_clone, tag); drain_dev_clone_events(); mtx_lock(&tunmtx); while ((tp = TAILQ_FIRST(&tunhead)) != NULL) { TAILQ_REMOVE(&tunhead, tp, tun_list); mtx_unlock(&tunmtx); tun_destroy(tp); mtx_lock(&tunmtx); } mtx_unlock(&tunmtx); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; delete_unrhdr(drv->unrhdr); clone_cleanup(&drv->clones); } mtx_destroy(&tunmtx); } SYSUNINIT(tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, tun_uninit, NULL); static int tuntapmodevent(module_t mod, int type, void *data) { struct tuntap_driver *drv; int i; switch (type) { case MOD_LOAD: mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF); for (i = 0; i < nitems(tuntap_drivers); ++i) { drv = &tuntap_drivers[i]; clone_setup(&drv->clones); drv->unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx); } tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000); if (tag == NULL) return (ENOMEM); break; case MOD_UNLOAD: /* See tun_uninit, so it's done after the vnet_sysuninit() */ break; default: return EOPNOTSUPP; } return 0; } static moduledata_t tuntap_mod = { "if_tuntap", tuntapmodevent, 0 }; DECLARE_MODULE(if_tuntap, tuntap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_tuntap, 1); static void tunstart(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; struct mbuf *m; TUNDEBUG(ifp, "starting\n"); if (ALTQ_IS_ENABLED(&ifp->if_snd)) { IFQ_LOCK(&ifp->if_snd); IFQ_POLL_NOLOCK(&ifp->if_snd, m); if (m == NULL) { IFQ_UNLOCK(&ifp->if_snd); return; } IFQ_UNLOCK(&ifp->if_snd); } TUN_LOCK(tp); if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) { TUN_UNLOCK(tp); pgsigio(&tp->tun_sigio, SIGIO, 0); } else TUN_UNLOCK(tp); } /* * tunstart_l2 * * queue packets from higher level ready to put out */ static void tunstart_l2(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; TUNDEBUG(ifp, "starting\n"); /* * do not junk pending output if we are in VMnet mode. * XXX: can this do any harm because of queue overflow? */ TUN_LOCK(tp); if (((tp->tun_flags & TUN_VMNET) == 0) && ((tp->tun_flags & TUN_READY) != TUN_READY)) { struct mbuf *m; /* Unlocked read. */ TUNDEBUG(ifp, "not ready, tun_flags = 0x%x\n", tp->tun_flags); for (;;) { IF_DEQUEUE(&ifp->if_snd, m); if (m != NULL) { m_freem(m); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } else break; } TUN_UNLOCK(tp); return; } ifp->if_drv_flags |= IFF_DRV_OACTIVE; if (!IFQ_IS_EMPTY(&ifp->if_snd)) { if (tp->tun_flags & TUN_RWAIT) { tp->tun_flags &= ~TUN_RWAIT; wakeup(tp); } if ((tp->tun_flags & TUN_ASYNC) && (tp->tun_sigio != NULL)) { TUN_UNLOCK(tp); pgsigio(&tp->tun_sigio, SIGIO, 0); TUN_LOCK(tp); } selwakeuppri(&tp->tun_rsel, PZERO+1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */ } ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; TUN_UNLOCK(tp); } /* tunstart_l2 */ /* XXX: should return an error code so it can fail. */ static void tuncreate(struct cdev *dev, struct tuntap_driver *drv) { struct tuntap_softc *sc; struct ifnet *ifp; struct ether_addr eaddr; int iflags; u_char type; sc = malloc(sizeof(*sc), M_TUN, M_WAITOK | M_ZERO); mtx_init(&sc->tun_mtx, "tun_mtx", NULL, MTX_DEF); cv_init(&sc->tun_cv, "tun_condvar"); sc->tun_flags = drv->ident_flags; sc->tun_dev = dev; sc->tun_drv = drv; mtx_lock(&tunmtx); TAILQ_INSERT_TAIL(&tunhead, sc, tun_list); mtx_unlock(&tunmtx); iflags = IFF_MULTICAST; if ((sc->tun_flags & TUN_L2) != 0) { type = IFT_ETHER; iflags |= IFF_BROADCAST | IFF_SIMPLEX; } else { type = IFT_PPP; iflags |= IFF_POINTOPOINT; } ifp = sc->tun_ifp = if_alloc(type); if (ifp == NULL) panic("%s%d: failed to if_alloc() interface.\n", drv->cdevsw.d_name, dev2unit(dev)); ifp->if_softc = sc; if_initname(ifp, drv->cdevsw.d_name, dev2unit(dev)); ifp->if_ioctl = tunifioctl; ifp->if_flags = iflags; IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen); knlist_init_mtx(&sc->tun_rsel.si_note, &sc->tun_mtx); ifp->if_capabilities |= IFCAP_LINKSTATE; ifp->if_capenable |= IFCAP_LINKSTATE; if ((sc->tun_flags & TUN_L2) != 0) { ifp->if_mtu = ETHERMTU; ifp->if_init = tunifinit; ifp->if_start = tunstart_l2; ether_gen_addr(ifp, &eaddr); ether_ifattach(ifp, eaddr.octet); } else { ifp->if_mtu = TUNMTU; ifp->if_start = tunstart; ifp->if_output = tunoutput; ifp->if_snd.ifq_drv_maxlen = 0; IFQ_SET_READY(&ifp->if_snd); if_attach(ifp); bpfattach(ifp, DLT_NULL, sizeof(u_int32_t)); } dev->si_drv1 = sc; TUN_LOCK(sc); sc->tun_flags |= TUN_INITED; TUN_UNLOCK(sc); TUNDEBUG(ifp, "interface %s is created, minor = %#x\n", ifp->if_xname, dev2unit(dev)); } static int tunopen(struct cdev *dev, int flag, int mode, struct thread *td) { struct ifnet *ifp; struct tuntap_driver *drv; struct tuntap_softc *tp; int error, tunflags; tunflags = 0; CURVNET_SET(TD_TO_VNET(td)); error = tuntap_name2info(dev->si_name, NULL, &tunflags); if (error != 0) { CURVNET_RESTORE(); return (error); /* Shouldn't happen */ } if ((tunflags & TUN_L2) != 0) { /* Restrict? */ if (tap_allow_uopen == 0) { error = priv_check(td, PRIV_NET_TAP); if (error != 0) { CURVNET_RESTORE(); return (error); } } } /* * XXXRW: Non-atomic test and set of dev->si_drv1 requires * synchronization. */ tp = dev->si_drv1; if (!tp) { drv = tuntap_driver_from_flags(tunflags); if (drv == NULL) { CURVNET_RESTORE(); return (ENXIO); } tuncreate(dev, drv); tp = dev->si_drv1; } TUN_LOCK(tp); if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) { TUN_UNLOCK(tp); CURVNET_RESTORE(); return (EBUSY); } ifp = TUN2IFP(tp); if ((tp->tun_flags & TUN_L2) != 0) { bcopy(IF_LLADDR(ifp), tp->tun_ether.octet, sizeof(tp->tun_ether.octet)); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if (tapuponopen) ifp->if_flags |= IFF_UP; } tp->tun_pid = td->td_proc->p_pid; tp->tun_flags |= TUN_OPEN; if_link_state_change(ifp, LINK_STATE_UP); TUNDEBUG(ifp, "open\n"); TUN_UNLOCK(tp); CURVNET_RESTORE(); return (0); } /* * tunclose - close the device - mark i/f down & delete * routing info */ static int tunclose(struct cdev *dev, int foo, int bar, struct thread *td) { struct tuntap_softc *tp; struct ifnet *ifp; bool l2tun; tp = dev->si_drv1; ifp = TUN2IFP(tp); TUN_LOCK(tp); /* * Simply close the device if this isn't the controlling process. This * may happen if, for instance, the tunnel has been handed off to * another process. The original controller should be able to close it * without putting us into an inconsistent state. */ if (td->td_proc->p_pid != tp->tun_pid) { TUN_UNLOCK(tp); return (0); } /* * junk all pending output */ CURVNET_SET(ifp->if_vnet); l2tun = false; if ((tp->tun_flags & TUN_L2) != 0) { l2tun = true; IF_DRAIN(&ifp->if_snd); } else { IFQ_PURGE(&ifp->if_snd); } /* For vmnet, we won't do most of the address/route bits */ if ((tp->tun_flags & TUN_VMNET) != 0 || (l2tun && (ifp->if_flags & IFF_LINK0) != 0)) goto out; if (ifp->if_flags & IFF_UP) { TUN_UNLOCK(tp); if_down(ifp); TUN_LOCK(tp); } /* Delete all addresses and routes which reference this interface. */ if (ifp->if_drv_flags & IFF_DRV_RUNNING) { struct ifaddr *ifa; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; TUN_UNLOCK(tp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { /* deal w/IPv4 PtP destination; unlocked read */ if (!l2tun && ifa->ifa_addr->sa_family == AF_INET) { rtinit(ifa, (int)RTM_DELETE, tp->tun_flags & TUN_DSTADDR ? RTF_HOST : 0); } else { rtinit(ifa, (int)RTM_DELETE, 0); } } if_purgeaddrs(ifp); TUN_LOCK(tp); } out: if_link_state_change(ifp, LINK_STATE_DOWN); CURVNET_RESTORE(); funsetown(&tp->tun_sigio); selwakeuppri(&tp->tun_rsel, PZERO + 1); KNOTE_LOCKED(&tp->tun_rsel.si_note, 0); TUNDEBUG (ifp, "closed\n"); tp->tun_flags &= ~TUN_OPEN; tp->tun_pid = 0; cv_broadcast(&tp->tun_cv); TUN_UNLOCK(tp); return (0); } static void tuninit(struct ifnet *ifp) { struct tuntap_softc *tp = ifp->if_softc; #ifdef INET struct ifaddr *ifa; #endif TUNDEBUG(ifp, "tuninit\n"); TUN_LOCK(tp); ifp->if_drv_flags |= IFF_DRV_RUNNING; if ((tp->tun_flags & TUN_L2) == 0) { ifp->if_flags |= IFF_UP; getmicrotime(&ifp->if_lastchange); #ifdef INET if_addr_rlock(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) { struct sockaddr_in *si; si = (struct sockaddr_in *)ifa->ifa_addr; if (si->sin_addr.s_addr) tp->tun_flags |= TUN_IASET; si = (struct sockaddr_in *)ifa->ifa_dstaddr; if (si && si->sin_addr.s_addr) tp->tun_flags |= TUN_DSTADDR; } } if_addr_runlock(ifp); #endif TUN_UNLOCK(tp); } else { ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; TUN_UNLOCK(tp); /* attempt to start output */ tunstart_l2(ifp); } } /* * Used only for l2 tunnel. */ static void tunifinit(void *xtp) { struct tuntap_softc *tp; tp = (struct tuntap_softc *)xtp; tuninit(tp->tun_ifp); } /* * Process an ioctl request. */ static int tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct ifreq *ifr = (struct ifreq *)data; struct tuntap_softc *tp; struct ifstat *ifs; struct ifmediareq *ifmr; int dummy, error = 0; bool l2tun; ifmr = NULL; sx_xlock(&tun_ioctl_sx); tp = ifp->if_softc; if (tp == NULL) { error = ENXIO; goto bad; } l2tun = (tp->tun_flags & TUN_L2) != 0; switch(cmd) { case SIOCGIFSTATUS: ifs = (struct ifstat *)data; TUN_LOCK(tp); if (tp->tun_pid) snprintf(ifs->ascii, sizeof(ifs->ascii), "\tOpened by PID %d\n", tp->tun_pid); else ifs->ascii[0] = '\0'; TUN_UNLOCK(tp); break; case SIOCSIFADDR: if (l2tun) error = ether_ioctl(ifp, cmd, data); else tuninit(ifp); if (error == 0) TUNDEBUG(ifp, "address set\n"); break; case SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; TUNDEBUG(ifp, "mtu set\n"); break; case SIOCSIFFLAGS: case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCGIFMEDIA: if (!l2tun) { error = EINVAL; break; } ifmr = (struct ifmediareq *)data; dummy = ifmr->ifm_count; ifmr->ifm_count = 1; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (tp->tun_flags & TUN_OPEN) ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_current = ifmr->ifm_active; if (dummy >= 1) { int media = IFM_ETHER; error = copyout(&media, ifmr->ifm_ulist, sizeof(int)); } break; default: if (l2tun) { error = ether_ioctl(ifp, cmd, data); } else { error = EINVAL; } } bad: sx_xunlock(&tun_ioctl_sx); return (error); } /* * tunoutput - queue packets from higher level ready to put out. */ static int tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst, struct route *ro) { struct tuntap_softc *tp = ifp->if_softc; u_short cached_tun_flags; int error; u_int32_t af; TUNDEBUG (ifp, "tunoutput\n"); #ifdef MAC error = mac_ifnet_check_transmit(ifp, m0); if (error) { m_freem(m0); return (error); } #endif /* Could be unlocked read? */ TUN_LOCK(tp); cached_tun_flags = tp->tun_flags; TUN_UNLOCK(tp); if ((cached_tun_flags & TUN_READY) != TUN_READY) { TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); m_freem (m0); return (EHOSTDOWN); } if ((ifp->if_flags & IFF_UP) != IFF_UP) { m_freem (m0); return (EHOSTDOWN); } /* BPF writes need to be handled specially. */ if (dst->sa_family == AF_UNSPEC) bcopy(dst->sa_data, &af, sizeof(af)); else af = dst->sa_family; if (bpf_peers_present(ifp->if_bpf)) bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0); /* prepend sockaddr? this may abort if the mbuf allocation fails */ if (cached_tun_flags & TUN_LMODE) { /* allocate space for sockaddr */ M_PREPEND(m0, dst->sa_len, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else { bcopy(dst, m0->m_data, dst->sa_len); } } if (cached_tun_flags & TUN_IFHEAD) { /* Prepend the address family */ M_PREPEND(m0, 4, M_NOWAIT); /* if allocation failed drop packet */ if (m0 == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (ENOBUFS); } else *(u_int32_t *)m0->m_data = htonl(af); } else { #ifdef INET if (af != AF_INET) #endif { m_freem(m0); return (EAFNOSUPPORT); } } error = (ifp->if_transmit)(ifp, m0); if (error) return (ENOBUFS); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); return (0); } /* * the cdevsw interface is now pretty minimal. */ static int tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { struct ifreq ifr, *ifrp; struct tuntap_softc *tp = dev->si_drv1; struct tuninfo *tunp; int error, iflags; #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) int ival; #endif bool l2tun; l2tun = (tp->tun_flags & TUN_L2) != 0; if (l2tun) { /* tap specific ioctls */ switch(cmd) { - case TAPGIFNAME: - ifrp = (struct ifreq *)data; - strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname, - IFNAMSIZ); - - return (0); /* VMware/VMnet port ioctl's */ #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD4) case _IO('V', 0): ival = IOCPARM_IVAL(data); data = (caddr_t)&ival; /* FALLTHROUGH */ #endif case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */ iflags = *(int *)data; iflags &= TUN_VMIO_FLAG_MASK; iflags &= ~IFF_CANTCHANGE; iflags |= IFF_UP; TUN_LOCK(tp); TUN2IFP(tp)->if_flags = iflags | (TUN2IFP(tp)->if_flags & IFF_CANTCHANGE); TUN_UNLOCK(tp); return (0); case SIOCGIFADDR: /* get MAC address of the remote side */ TUN_LOCK(tp); bcopy(&tp->tun_ether.octet, data, sizeof(tp->tun_ether.octet)); TUN_UNLOCK(tp); return (0); case SIOCSIFADDR: /* set MAC address of the remote side */ TUN_LOCK(tp); bcopy(data, &tp->tun_ether.octet, sizeof(tp->tun_ether.octet)); TUN_UNLOCK(tp); return (0); } /* Fall through to the common ioctls if unhandled */ } else { switch (cmd) { case TUNSLMODE: TUN_LOCK(tp); if (*(int *)data) { tp->tun_flags |= TUN_LMODE; tp->tun_flags &= ~TUN_IFHEAD; } else tp->tun_flags &= ~TUN_LMODE; TUN_UNLOCK(tp); return (0); case TUNSIFHEAD: TUN_LOCK(tp); if (*(int *)data) { tp->tun_flags |= TUN_IFHEAD; tp->tun_flags &= ~TUN_LMODE; } else tp->tun_flags &= ~TUN_IFHEAD; TUN_UNLOCK(tp); return (0); case TUNGIFHEAD: TUN_LOCK(tp); *(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 1 : 0; TUN_UNLOCK(tp); return (0); case TUNSIFMODE: /* deny this if UP */ if (TUN2IFP(tp)->if_flags & IFF_UP) return (EBUSY); switch (*(int *)data & ~IFF_MULTICAST) { case IFF_POINTOPOINT: case IFF_BROADCAST: TUN_LOCK(tp); TUN2IFP(tp)->if_flags &= ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST); TUN2IFP(tp)->if_flags |= *(int *)data; TUN_UNLOCK(tp); break; default: return (EINVAL); } return (0); case TUNSIFPID: TUN_LOCK(tp); tp->tun_pid = curthread->td_proc->p_pid; TUN_UNLOCK(tp); return (0); } /* Fall through to the common ioctls if unhandled */ } switch (cmd) { + case TUNGIFNAME: + ifrp = (struct ifreq *)data; + strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname, IFNAMSIZ); + + return (0); case TUNSIFINFO: tunp = (struct tuninfo *)data; if (TUN2IFP(tp)->if_type != tunp->type) return (EPROTOTYPE); TUN_LOCK(tp); if (TUN2IFP(tp)->if_mtu != tunp->mtu) { strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ); ifr.ifr_mtu = tunp->mtu; CURVNET_SET(TUN2IFP(tp)->if_vnet); error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp), (caddr_t)&ifr, td); CURVNET_RESTORE(); if (error) { TUN_UNLOCK(tp); return (error); } } TUN2IFP(tp)->if_baudrate = tunp->baudrate; TUN_UNLOCK(tp); break; case TUNGIFINFO: tunp = (struct tuninfo *)data; TUN_LOCK(tp); tunp->mtu = TUN2IFP(tp)->if_mtu; tunp->type = TUN2IFP(tp)->if_type; tunp->baudrate = TUN2IFP(tp)->if_baudrate; TUN_UNLOCK(tp); break; case TUNSDEBUG: tundebug = *(int *)data; break; case TUNGDEBUG: *(int *)data = tundebug; break; case FIONBIO: break; case FIOASYNC: TUN_LOCK(tp); if (*(int *)data) tp->tun_flags |= TUN_ASYNC; else tp->tun_flags &= ~TUN_ASYNC; TUN_UNLOCK(tp); break; case FIONREAD: if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) { struct mbuf *mb; IFQ_LOCK(&TUN2IFP(tp)->if_snd); IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb); for (*(int *)data = 0; mb != NULL; mb = mb->m_next) *(int *)data += mb->m_len; IFQ_UNLOCK(&TUN2IFP(tp)->if_snd); } else *(int *)data = 0; break; case FIOSETOWN: return (fsetown(*(int *)data, &tp->tun_sigio)); case FIOGETOWN: *(int *)data = fgetown(&tp->tun_sigio); return (0); /* This is deprecated, FIOSETOWN should be used instead. */ case TIOCSPGRP: return (fsetown(-(*(int *)data), &tp->tun_sigio)); /* This is deprecated, FIOGETOWN should be used instead. */ case TIOCGPGRP: *(int *)data = -fgetown(&tp->tun_sigio); return (0); default: return (ENOTTY); } return (0); } /* * The cdevsw read interface - reads a packet at a time, or at * least as much of a packet as can be read. */ static int tunread(struct cdev *dev, struct uio *uio, int flag) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); struct mbuf *m; int error=0, len; TUNDEBUG (ifp, "read\n"); TUN_LOCK(tp); if ((tp->tun_flags & TUN_READY) != TUN_READY) { TUN_UNLOCK(tp); TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags); return (EHOSTDOWN); } tp->tun_flags &= ~TUN_RWAIT; do { IFQ_DEQUEUE(&ifp->if_snd, m); if (m == NULL) { if (flag & O_NONBLOCK) { TUN_UNLOCK(tp); return (EWOULDBLOCK); } tp->tun_flags |= TUN_RWAIT; error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1), "tunread", 0); if (error != 0) { TUN_UNLOCK(tp); return (error); } } } while (m == NULL); TUN_UNLOCK(tp); if ((tp->tun_flags & TUN_L2) != 0) BPF_MTAP(ifp, m); while (m && uio->uio_resid > 0 && error == 0) { len = min(uio->uio_resid, m->m_len); if (len != 0) error = uiomove(mtod(m, void *), len, uio); m = m_free(m); } if (m) { TUNDEBUG(ifp, "Dropping mbuf\n"); m_freem(m); } return (error); } static int tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m) { struct ether_header *eh; struct ifnet *ifp; ifp = TUN2IFP(tp); /* * Only pass a unicast frame to ether_input(), if it would * actually have been received by non-virtual hardware. */ if (m->m_len < sizeof(struct ether_header)) { m_freem(m); return (0); } eh = mtod(m, struct ether_header *); if (eh && (ifp->if_flags & IFF_PROMISC) == 0 && !ETHER_IS_MULTICAST(eh->ether_dhost) && bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) { m_freem(m); return (0); } /* Pass packet up to parent. */ CURVNET_SET(ifp->if_vnet); (*ifp->if_input)(ifp, m); CURVNET_RESTORE(); /* ibytes are counted in parent */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); return (0); } static int tunwrite_l3(struct tuntap_softc *tp, struct mbuf *m) { struct ifnet *ifp; int family, isr; ifp = TUN2IFP(tp); /* Could be unlocked read? */ TUN_LOCK(tp); if (tp->tun_flags & TUN_IFHEAD) { TUN_UNLOCK(tp); if (m->m_len < sizeof(family) && (m = m_pullup(m, sizeof(family))) == NULL) return (ENOBUFS); family = ntohl(*mtod(m, u_int32_t *)); m_adj(m, sizeof(family)); } else { TUN_UNLOCK(tp); family = AF_INET; } BPF_MTAP2(ifp, &family, sizeof(family), m); switch (family) { #ifdef INET case AF_INET: isr = NETISR_IP; break; #endif #ifdef INET6 case AF_INET6: isr = NETISR_IPV6; break; #endif default: m_freem(m); return (EAFNOSUPPORT); } random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN); if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); CURVNET_SET(ifp->if_vnet); M_SETFIB(m, ifp->if_fib); netisr_dispatch(isr, m); CURVNET_RESTORE(); return (0); } /* * the cdevsw write interface - an atomic write is a packet - or else! */ static int tunwrite(struct cdev *dev, struct uio *uio, int flag) { struct tuntap_softc *tp; struct ifnet *ifp; struct mbuf *m; uint32_t mru; int align; bool l2tun; tp = dev->si_drv1; ifp = TUN2IFP(tp); TUNDEBUG(ifp, "tunwrite\n"); if ((ifp->if_flags & IFF_UP) != IFF_UP) /* ignore silently */ return (0); if (uio->uio_resid == 0) return (0); l2tun = (tp->tun_flags & TUN_L2) != 0; align = 0; mru = l2tun ? TAPMRU : TUNMRU; if (l2tun) align = ETHER_ALIGN; else if ((tp->tun_flags & TUN_IFHEAD) != 0) mru += sizeof(uint32_t); /* family */ if (uio->uio_resid < 0 || uio->uio_resid > mru) { TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid); return (EIO); } if ((m = m_uiotombuf(uio, M_NOWAIT, 0, align, M_PKTHDR)) == NULL) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); return (ENOBUFS); } m->m_pkthdr.rcvif = ifp; #ifdef MAC mac_ifnet_create_mbuf(ifp, m); #endif if (l2tun) return (tunwrite_l2(tp, m)); return (tunwrite_l3(tp, m)); } /* * tunpoll - the poll interface, this is only useful on reads * really. The write detect always returns true, write never blocks * anyway, it either accepts the packet or drops it. */ static int tunpoll(struct cdev *dev, int events, struct thread *td) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); int revents = 0; TUNDEBUG(ifp, "tunpoll\n"); if (events & (POLLIN | POLLRDNORM)) { IFQ_LOCK(&ifp->if_snd); if (!IFQ_IS_EMPTY(&ifp->if_snd)) { TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len); revents |= events & (POLLIN | POLLRDNORM); } else { TUNDEBUG(ifp, "tunpoll waiting\n"); selrecord(td, &tp->tun_rsel); } IFQ_UNLOCK(&ifp->if_snd); } if (events & (POLLOUT | POLLWRNORM)) revents |= events & (POLLOUT | POLLWRNORM); return (revents); } /* * tunkqfilter - support for the kevent() system call. */ static int tunkqfilter(struct cdev *dev, struct knote *kn) { struct tuntap_softc *tp = dev->si_drv1; struct ifnet *ifp = TUN2IFP(tp); switch(kn->kn_filter) { case EVFILT_READ: TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_read_filterops; break; case EVFILT_WRITE: TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n", ifp->if_xname, dev2unit(dev)); kn->kn_fop = &tun_write_filterops; break; default: TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n", ifp->if_xname, dev2unit(dev)); return(EINVAL); } kn->kn_hook = tp; knlist_add(&tp->tun_rsel.si_note, kn, 0); return (0); } /* * Return true of there is data in the interface queue. */ static int tunkqread(struct knote *kn, long hint) { int ret; struct tuntap_softc *tp = kn->kn_hook; struct cdev *dev = tp->tun_dev; struct ifnet *ifp = TUN2IFP(tp); if ((kn->kn_data = ifp->if_snd.ifq_len) > 0) { TUNDEBUG(ifp, "%s have data in the queue. Len = %d, minor = %#x\n", ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev)); ret = 1; } else { TUNDEBUG(ifp, "%s waiting for data, minor = %#x\n", ifp->if_xname, dev2unit(dev)); ret = 0; } return (ret); } /* * Always can write, always return MTU in kn->data. */ static int tunkqwrite(struct knote *kn, long hint) { struct tuntap_softc *tp = kn->kn_hook; struct ifnet *ifp = TUN2IFP(tp); kn->kn_data = ifp->if_mtu; return (1); } static void tunkqdetach(struct knote *kn) { struct tuntap_softc *tp = kn->kn_hook; knlist_remove(&tp->tun_rsel.si_note, kn, 0); } Index: projects/nfsv42/sys/riscv/riscv/pmap.c =================================================================== --- projects/nfsv42/sys/riscv/riscv/pmap.c (revision 350367) +++ projects/nfsv42/sys/riscv/riscv/pmap.c (revision 350368) @@ -1,4482 +1,4473 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * Copyright (c) 2014 Andrew Turner * All rights reserved. * Copyright (c) 2014 The FreeBSD Foundation * All rights reserved. * Copyright (c) 2015-2018 Ruslan Bukin * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Portions of this software were developed by Andrew Turner under * sponsorship from The FreeBSD Foundation. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NUL1E (Ln_ENTRIES * Ln_ENTRIES) #define NUL2E (Ln_ENTRIES * NUL1E) #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pmap_l2_pindex(v) ((v) >> L2_SHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) #define NPV_LIST_LOCKS MAXCPU #define PHYS_TO_PV_LIST_LOCK(pa) \ (&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS]) #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ struct rwlock **_lockp = (lockp); \ struct rwlock *_new_lock; \ \ _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ if (_new_lock != *_lockp) { \ if (*_lockp != NULL) \ rw_wunlock(*_lockp); \ *_lockp = _new_lock; \ rw_wlock(*_lockp); \ } \ } while (0) #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) #define RELEASE_PV_LIST_LOCK(lockp) do { \ struct rwlock **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ rw_wunlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) #define VM_PAGE_TO_PV_LIST_LOCK(m) \ PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) /* The list of all the user pmaps */ LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER(); struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ vm_offset_t kernel_vm_end = 0; vm_paddr_t dmap_phys_base; /* The start of the dmap region */ vm_paddr_t dmap_phys_max; /* The limit of the dmap region */ vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */ /* This code assumes all L1 DMAP entries will be used */ CTASSERT((DMAP_MIN_ADDRESS & ~L1_OFFSET) == DMAP_MIN_ADDRESS); CTASSERT((DMAP_MAX_ADDRESS & ~L1_OFFSET) == DMAP_MAX_ADDRESS); static struct rwlock_padalign pvh_global_lock; static struct mtx_padalign allpmaps_lock; static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int superpages_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled, CTLFLAG_RDTUN, &superpages_enabled, 0, "Enable support for transparent superpages"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0, "2MB page mapping counters"); static u_long pmap_l2_demotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD, &pmap_l2_demotions, 0, "2MB page demotions"); static u_long pmap_l2_mappings; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD, &pmap_l2_mappings, 0, "2MB page mappings"); static u_long pmap_l2_p_failures; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_l2_p_failures, 0, "2MB page promotion failures"); static u_long pmap_l2_promotions; SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD, &pmap_l2_promotions, 0, "2MB page promotions"); /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static struct mtx pv_chunks_mutex; static struct rwlock pv_list_locks[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; extern cpuset_t all_harts; /* * Internal flags for pmap_enter()'s helper functions. */ #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static bool pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va); static bool pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp); static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva, pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp); static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *); #define pmap_clear(pte) pmap_store(pte, 0) #define pmap_clear_bits(pte, bits) atomic_clear_64(pte, bits) #define pmap_load_store(pte, entry) atomic_swap_64(pte, entry) #define pmap_load_clear(pte) pmap_load_store(pte, 0) #define pmap_load(pte) atomic_load_64(pte) #define pmap_store(pte, entry) atomic_store_64(pte, entry) #define pmap_store_bits(pte, bits) atomic_set_64(pte, bits) /********************/ /* Inline functions */ /********************/ static __inline void pagecopy(void *s, void *d) { memcpy(d, s, PAGE_SIZE); } static __inline void pagezero(void *p) { bzero(p, PAGE_SIZE); } #define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK) #define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK) #define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK) #define PTE_TO_PHYS(pte) ((pte >> PTE_PPN0_S) * PAGE_SIZE) static __inline pd_entry_t * pmap_l1(pmap_t pmap, vm_offset_t va) { return (&pmap->pm_l1[pmap_l1_index(va)]); } static __inline pd_entry_t * pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) { vm_paddr_t phys; pd_entry_t *l2; phys = PTE_TO_PHYS(pmap_load(l1)); l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); return (&l2[pmap_l2_index(va)]); } static __inline pd_entry_t * pmap_l2(pmap_t pmap, vm_offset_t va) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); if ((pmap_load(l1) & PTE_V) == 0) return (NULL); if ((pmap_load(l1) & PTE_RX) != 0) return (NULL); return (pmap_l1_to_l2(l1, va)); } static __inline pt_entry_t * pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) { vm_paddr_t phys; pt_entry_t *l3; phys = PTE_TO_PHYS(pmap_load(l2)); l3 = (pd_entry_t *)PHYS_TO_DMAP(phys); return (&l3[pmap_l3_index(va)]); } static __inline pt_entry_t * pmap_l3(pmap_t pmap, vm_offset_t va) { pd_entry_t *l2; l2 = pmap_l2(pmap, va); if (l2 == NULL) return (NULL); if ((pmap_load(l2) & PTE_V) == 0) return (NULL); if ((pmap_load(l2) & PTE_RX) != 0) return (NULL); return (pmap_l2_to_l3(l2, va)); } static __inline void pmap_resident_count_inc(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); pmap->pm_stats.resident_count += count; } static __inline void pmap_resident_count_dec(pmap_t pmap, int count) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(pmap->pm_stats.resident_count >= count, ("pmap %p resident count underflow %ld %d", pmap, pmap->pm_stats.resident_count, count)); pmap->pm_stats.resident_count -= count; } static void pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index, pt_entry_t entry) { struct pmap *user_pmap; pd_entry_t *l1; /* Distribute new kernel L1 entry to all the user pmaps */ if (pmap != kernel_pmap) return; mtx_lock(&allpmaps_lock); LIST_FOREACH(user_pmap, &allpmaps, pm_list) { l1 = &user_pmap->pm_l1[l1index]; pmap_store(l1, entry); } mtx_unlock(&allpmaps_lock); } static pt_entry_t * pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot, u_int *l2_slot) { pt_entry_t *l2; pd_entry_t *l1; l1 = (pd_entry_t *)l1pt; *l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK; /* Check locore has used a table L1 map */ KASSERT((l1[*l1_slot] & PTE_RX) == 0, ("Invalid bootstrap L1 table")); /* Find the address of the L2 table */ l2 = (pt_entry_t *)init_pt_va; *l2_slot = pmap_l2_index(va); return (l2); } static vm_paddr_t pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va) { u_int l1_slot, l2_slot; pt_entry_t *l2; vm_paddr_t ret; l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot); /* Check locore has used L2 superpages */ KASSERT((l2[l2_slot] & PTE_RX) != 0, ("Invalid bootstrap L2 table")); /* L2 is superpages */ ret = (l2[l2_slot] >> PTE_PPN1_S) << L2_SHIFT; ret += (va & L2_OFFSET); return (ret); } static void pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa) { vm_offset_t va; vm_paddr_t pa; pd_entry_t *l1; u_int l1_slot; pt_entry_t entry; pn_t pn; pa = dmap_phys_base = min_pa & ~L1_OFFSET; va = DMAP_MIN_ADDRESS; l1 = (pd_entry_t *)kern_l1; l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS); for (; va < DMAP_MAX_ADDRESS && pa < max_pa; pa += L1_SIZE, va += L1_SIZE, l1_slot++) { KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index")); /* superpages */ pn = (pa / PAGE_SIZE); entry = PTE_KERN; entry |= (pn << PTE_PPN0_S); pmap_store(&l1[l1_slot], entry); } /* Set the upper limit of the DMAP region */ dmap_phys_max = pa; dmap_max_addr = va; sfence_vma(); } static vm_offset_t pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start) { vm_offset_t l3pt; pt_entry_t entry; pd_entry_t *l2; vm_paddr_t pa; u_int l2_slot; pn_t pn; KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address")); l2 = pmap_l2(kernel_pmap, va); l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1)); l2_slot = pmap_l2_index(va); l3pt = l3_start; for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) { KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index")); pa = pmap_early_vtophys(l1pt, l3pt); pn = (pa / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(&l2[l2_slot], entry); l3pt += PAGE_SIZE; } /* Clean the L2 page table */ memset((void *)l3_start, 0, l3pt - l3_start); return (l3pt); } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen) { u_int l1_slot, l2_slot, avail_slot, map_slot; vm_offset_t freemempos; vm_offset_t dpcpu, msgbufpv; vm_paddr_t end, max_pa, min_pa, pa, start; int i; printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen); printf("%lx\n", l1pt); printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK); /* Set this early so we can use the pagetable walking functions */ kernel_pmap_store.pm_l1 = (pd_entry_t *)l1pt; PMAP_LOCK_INIT(kernel_pmap); rw_init(&pvh_global_lock, "pmap pv global"); CPU_FILL(&kernel_pmap->pm_active); /* Assume the address we were loaded to is a valid physical address. */ min_pa = max_pa = kernstart; /* * Find the minimum physical address. physmap is sorted, * but may contain empty ranges. */ for (i = 0; i < physmap_idx * 2; i += 2) { if (physmap[i] == physmap[i + 1]) continue; if (physmap[i] <= min_pa) min_pa = physmap[i]; if (physmap[i + 1] > max_pa) max_pa = physmap[i + 1]; } printf("physmap_idx %lx\n", physmap_idx); printf("min_pa %lx\n", min_pa); printf("max_pa %lx\n", max_pa); /* Create a direct map region early so we can use it for pa -> va */ pmap_bootstrap_dmap(l1pt, min_pa, max_pa); /* * Read the page table to find out what is already mapped. * This assumes we have mapped a block of memory from KERNBASE * using a single L1 entry. */ (void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot); /* Sanity check the index, KERNBASE should be the first VA */ KASSERT(l2_slot == 0, ("The L2 index is non-zero")); freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE); /* Create the l3 tables for the early devmap */ freemempos = pmap_bootstrap_l3(l1pt, VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos); sfence_vma(); #define alloc_pages(var, np) \ (var) = freemempos; \ freemempos += (np * PAGE_SIZE); \ memset((char *)(var), 0, ((np) * PAGE_SIZE)); /* Allocate dynamic per-cpu area. */ alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE); dpcpu_init((void *)dpcpu, 0); /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */ alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE); msgbufp = (void *)msgbufpv; virtual_avail = roundup2(freemempos, L2_SIZE); virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE; kernel_vm_end = virtual_avail; pa = pmap_early_vtophys(l1pt, freemempos); /* Initialize phys_avail and dump_avail. */ for (avail_slot = map_slot = physmem = 0; map_slot < physmap_idx * 2; map_slot += 2) { start = physmap[map_slot]; end = physmap[map_slot + 1]; if (start == end) continue; dump_avail[map_slot] = start; dump_avail[map_slot + 1] = end; realmem += atop((vm_offset_t)(end - start)); if (start >= kernstart && end <= pa) continue; if (start < kernstart && end > kernstart) end = kernstart; else if (start < pa && end > pa) start = pa; phys_avail[avail_slot] = start; phys_avail[avail_slot + 1] = end; physmem += (end - start) >> PAGE_SHIFT; avail_slot += 2; if (end != physmap[map_slot + 1] && end > pa) { phys_avail[avail_slot] = pa; phys_avail[avail_slot + 1] = physmap[map_slot + 1]; physmem += (physmap[map_slot + 1] - pa) >> PAGE_SHIFT; avail_slot += 2; } } phys_avail[avail_slot] = 0; phys_avail[avail_slot + 1] = 0; /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". */ Maxmem = atop(phys_avail[avail_slot - 1]); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_memattr = VM_MEMATTR_WRITE_BACK; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { vm_size_t s; int i, pv_npg; /* * Initialize the pv chunk and pmap list mutexes. */ mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF); mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF); /* * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) rw_init(&pv_list_locks[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. */ pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE); /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); if (superpages_enabled) pagesizes[1] = L2_SIZE; } #ifdef SMP /* * For SMP, these functions have to use IPIs for coherence. * * In general, the calling thread uses a plain fence to order the * writes to the page tables before invoking an SBI callback to invoke * sfence_vma() on remote CPUs. */ static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { cpuset_t mask; sched_pin(); mask = pmap->pm_active; CPU_CLR(PCPU_GET(hart), &mask); fence(); if (!CPU_EMPTY(&mask) && smp_started) sbi_remote_sfence_vma(mask.__bits, va, 1); sfence_vma_page(va); sched_unpin(); } static void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t mask; sched_pin(); mask = pmap->pm_active; CPU_CLR(PCPU_GET(hart), &mask); fence(); if (!CPU_EMPTY(&mask) && smp_started) sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1); /* * Might consider a loop of sfence_vma_page() for a small * number of pages in the future. */ sfence_vma(); sched_unpin(); } static void pmap_invalidate_all(pmap_t pmap) { cpuset_t mask; sched_pin(); mask = pmap->pm_active; CPU_CLR(PCPU_GET(hart), &mask); /* * XXX: The SBI doc doesn't detail how to specify x0 as the * address to perform a global fence. BBL currently treats * all sfence_vma requests as global however. */ fence(); if (!CPU_EMPTY(&mask) && smp_started) sbi_remote_sfence_vma(mask.__bits, 0, 0); sfence_vma(); sched_unpin(); } #else /* * Normal, non-SMP, invalidation functions. * We inline these within pmap.c for speed. */ static __inline void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { sfence_vma_page(va); } static __inline void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { /* * Might consider a loop of sfence_vma_page() for a small * number of pages in the future. */ sfence_vma(); } static __inline void pmap_invalidate_all(pmap_t pmap) { sfence_vma(); } #endif /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { pd_entry_t *l2p, l2; pt_entry_t *l3p, l3; vm_paddr_t pa; pa = 0; PMAP_LOCK(pmap); /* * Start with the l2 tabel. We are unable to allocate * pages in the l1 table. */ l2p = pmap_l2(pmap, va); if (l2p != NULL) { l2 = pmap_load(l2p); if ((l2 & PTE_RX) == 0) { l3p = pmap_l2_to_l3(l2p, va); if (l3p != NULL) { l3 = pmap_load(l3p); pa = PTE_TO_PHYS(l3); pa |= (va & L3_OFFSET); } } else { /* L2 is superpages */ pa = (l2 >> PTE_PPN1_S) << L2_SHIFT; pa |= (va & L2_OFFSET); } } PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pt_entry_t *l3p, l3; vm_paddr_t phys; vm_paddr_t pa; vm_page_t m; pa = 0; m = NULL; PMAP_LOCK(pmap); retry: l3p = pmap_l3(pmap, va); if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) { if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) { phys = PTE_TO_PHYS(l3); if (vm_page_pa_tryrelock(pmap, phys, &pa)) goto retry; m = PHYS_TO_VM_PAGE(phys); vm_page_wire(m); } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t *l2; pt_entry_t *l3; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { l2 = pmap_l2(kernel_pmap, va); if (l2 == NULL) panic("pmap_kextract: No l2"); if ((pmap_load(l2) & PTE_RX) != 0) { /* superpages */ pa = (pmap_load(l2) >> PTE_PPN1_S) << L2_SHIFT; pa |= (va & L2_OFFSET); return (pa); } l3 = pmap_l2_to_l3(l2, va); if (l3 == NULL) panic("pmap_kextract: No l3..."); pa = PTE_TO_PHYS(pmap_load(l3)); pa |= (va & PAGE_MASK); } return (pa); } /*************************************************** * Low level mapping routines..... ***************************************************/ void pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa) { pt_entry_t entry; pt_entry_t *l3; vm_offset_t va; pn_t pn; KASSERT((pa & L3_OFFSET) == 0, ("pmap_kenter_device: Invalid physical address")); KASSERT((sva & L3_OFFSET) == 0, ("pmap_kenter_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kenter_device: Mapping is not page-sized")); va = sva; while (size != 0) { l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); pn = (pa / PAGE_SIZE); entry = PTE_KERN; entry |= (pn << PTE_PPN0_S); pmap_store(l3, entry); va += PAGE_SIZE; pa += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *l3; l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); pmap_clear(l3); sfence_vma(); } void pmap_kremove_device(vm_offset_t sva, vm_size_t size) { pt_entry_t *l3; vm_offset_t va; KASSERT((sva & L3_OFFSET) == 0, ("pmap_kremove_device: Invalid virtual address")); KASSERT((size & PAGE_MASK) == 0, ("pmap_kremove_device: Mapping is not page-sized")); va = sva; while (size != 0) { l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va)); pmap_clear(l3); va += PAGE_SIZE; size -= PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *l3, pa; vm_offset_t va; vm_page_t m; pt_entry_t entry; pn_t pn; int i; va = sva; for (i = 0; i < count; i++) { m = ma[i]; pa = VM_PAGE_TO_PHYS(m); pn = (pa / PAGE_SIZE); l3 = pmap_l3(kernel_pmap, va); entry = PTE_KERN; entry |= (pn << PTE_PPN0_S); pmap_store(l3, entry); va += L3_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { pt_entry_t *l3; vm_offset_t va; KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva)); for (va = sva; count-- > 0; va += PAGE_SIZE) { l3 = pmap_l3(kernel_pmap, va); KASSERT(l3 != NULL, ("pmap_kremove: Invalid address")); pmap_clear(l3); } pmap_invalidate_range(kernel_pmap, sva, va); } bool pmap_ps_enabled(pmap_t pmap __unused) { return (superpages_enabled); } /*************************************************** * Page table page management routines..... ***************************************************/ /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. * * If "promoted" is false, then the page table page "ml3" must be zero filled. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3, bool promoted) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); ml3->valid = promoted ? VM_PAGE_BITS_ALL : 0; return (vm_radix_insert(&pmap->pm_root, ml3)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va))); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, va, m, free); return (TRUE); } else { return (FALSE); } } static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { vm_paddr_t phys; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (m->pindex >= NUL1E) { pd_entry_t *l1; l1 = pmap_l1(pmap, va); pmap_clear(l1); pmap_distribute_l1(pmap, pmap_l1_index(va), 0); } else { pd_entry_t *l2; l2 = pmap_l2(pmap, va); pmap_clear(l2); } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUL1E) { pd_entry_t *l1; vm_page_t pdpg; l1 = pmap_l1(pmap, va); phys = PTE_TO_PHYS(pmap_load(l1)); pdpg = PHYS_TO_VM_PAGE(phys); pmap_unwire_ptp(pmap, va, pdpg, free); } pmap_invalidate_page(pmap, va); vm_wire_sub(1); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, struct spglist *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde)); return (pmap_unwire_ptp(pmap, va, mpte, free)); } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); pmap->pm_l1 = kernel_pmap->pm_l1; pmap->pm_satp = SATP_MODE_SV39 | (vtophys(pmap->pm_l1) >> PAGE_SHIFT); CPU_ZERO(&pmap->pm_active); pmap_activate_boot(pmap); } int pmap_pinit(pmap_t pmap) { vm_paddr_t l1phys; vm_page_t l1pt; /* * allocate the l1 page */ while ((l1pt = vm_page_alloc(NULL, 0xdeadbeef, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) vm_wait(NULL); l1phys = VM_PAGE_TO_PHYS(l1pt); pmap->pm_l1 = (pd_entry_t *)PHYS_TO_DMAP(l1phys); pmap->pm_satp = SATP_MODE_SV39 | (l1phys >> PAGE_SHIFT); if ((l1pt->flags & PG_ZERO) == 0) pagezero(pmap->pm_l1); bzero(&pmap->pm_stats, sizeof(pmap->pm_stats)); CPU_ZERO(&pmap->pm_active); /* Install kernel pagetables */ memcpy(pmap->pm_l1, kernel_pmap->pm_l1, PAGE_SIZE); /* Add to the list of all user pmaps */ mtx_lock(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock(&allpmaps_lock); vm_radix_init(&pmap->pm_root); return (1); } /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) { vm_page_t m, /*pdppg, */pdpg; pt_entry_t entry; vm_paddr_t phys; pn_t pn; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (lockp != NULL) { RELEASE_PV_LIST_LOCK(lockp); PMAP_UNLOCK(pmap); rw_runlock(&pvh_global_lock); vm_wait(NULL); rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ if (ptepindex >= NUL1E) { pd_entry_t *l1; vm_pindex_t l1index; l1index = ptepindex - NUL1E; l1 = &pmap->pm_l1[l1index]; pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(l1, entry); pmap_distribute_l1(pmap, l1index, entry); } else { vm_pindex_t l1index; pd_entry_t *l1, *l2; l1index = ptepindex >> (L1_SHIFT - L2_SHIFT); l1 = &pmap->pm_l1[l1index]; if (pmap_load(l1) == 0) { /* recurse for allocating page dir */ if (_pmap_alloc_l3(pmap, NUL1E + l1index, lockp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } } else { phys = PTE_TO_PHYS(pmap_load(l1)); pdpg = PHYS_TO_VM_PAGE(phys); pdpg->wire_count++; } phys = PTE_TO_PHYS(pmap_load(l1)); l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); l2 = &l2[ptepindex & Ln_ADDR_MASK]; pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(l2, entry); } pmap_resident_count_inc(pmap, 1); return (m); } static vm_page_t pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { pd_entry_t *l1; vm_page_t l2pg; vm_pindex_t l2pindex; retry: l1 = pmap_l1(pmap, va); if (l1 != NULL && (pmap_load(l1) & PTE_RWX) == 0) { /* Add a reference to the L2 page. */ l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1))); l2pg->wire_count++; } else { /* Allocate a L2 page. */ l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT; l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp); if (l2pg == NULL && lockp != NULL) goto retry; } return (l2pg); } static vm_page_t pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) { vm_pindex_t ptepindex; pd_entry_t *l2; vm_paddr_t phys; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = pmap_l2_pindex(va); retry: /* * Get the page directory entry */ l2 = pmap_l2(pmap, va); /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (l2 != NULL && pmap_load(l2) != 0) { phys = PTE_TO_PHYS(pmap_load(l2)); m = PHYS_TO_VM_PAGE(phys); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_alloc_l3(pmap, ptepindex, lockp); if (m == NULL && lockp != NULL) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); mtx_lock(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock(&allpmaps_lock); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l1)); vm_page_unwire_noq(m); vm_page_free(m); } #if 0 static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); #endif /* 0 */ /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *l1, *l2; pt_entry_t entry; pn_t pn; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, L2_SIZE); if (addr - 1 >= vm_map_max(kernel_map)) addr = vm_map_max(kernel_map); while (kernel_vm_end < addr) { l1 = pmap_l1(kernel_pmap, kernel_vm_end); if (pmap_load(l1) == 0) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); pn = (paddr / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(l1, entry); pmap_distribute_l1(kernel_pmap, pmap_l1_index(kernel_vm_end), entry); continue; /* try again */ } l2 = pmap_l1_to_l2(l1, kernel_vm_end); if ((pmap_load(l2) & PTE_V) != 0 && (pmap_load(l2) & PTE_RWX) == 0) { kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); if ((nkpg->flags & PG_ZERO) == 0) { pmap_zero_page(nkpg); } paddr = VM_PAGE_TO_PHYS(nkpg); pn = (paddr / PAGE_SIZE); entry = (PTE_V); entry |= (pn << PTE_PPN0_S); pmap_store(l2, entry); pmap_invalidate_page(kernel_pmap, kernel_vm_end); kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET; if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { kernel_vm_end = vm_map_max(kernel_map); break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #if 0 #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs, pv_entry_count; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif #endif /* 0 */ /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. * * Returns NULL if PV entries were reclaimed from the specified pmap. * * We do not, however, unmap 2mpages because subsequent accesses will * allocate per-page pv entries until repromotion occurs, thereby * exacerbating the shortage of free pv entries. */ static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) { panic("RISCVTODO: reclaim_pv_chunk"); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_frees, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, 1)); PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) { /* 98% of the time, pc is already at the head of the list. */ if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; mtx_lock(&pv_chunks_mutex); TAILQ_REMOVE(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_unwire_noq(m); vm_page_free(m); } /* * Returns a new PV entry, allocating a new PV chunk from the system when * needed. If this PV chunk allocation fails and a PV list lock pointer was * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is * returned. * * The given PV list lock may be released. */ static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp) { int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = ffsl(pc->pc_map[field]) - 1; break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); return (pv); } } /* No free items, allocate another chunk */ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { if (lockp == NULL) { PV_STAT(pc_chunk_tryfail++); return (NULL); } m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; } PV_STAT(atomic_add_int(&pc_chunk_count, 1)); PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; mtx_lock(&pv_chunks_mutex); TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); mtx_unlock(&pv_chunks_mutex); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); return (pv); } /* * Ensure that the number of spare PV entries in the specified pmap meets or * exceeds the given count, "needed". * * The given PV list lock may be released. */ static void reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) { struct pch new_tail; struct pv_chunk *pc; vm_page_t m; int avail, free; bool reclaimed; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); /* * Newly allocated PV chunks must be stored in a private list until * the required number of PV chunks have been allocated. Otherwise, * reclaim_pv_chunk() could recycle one of these chunks. In * contrast, these chunks must be added to the pmap upon allocation. */ TAILQ_INIT(&new_tail); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { bit_count((bitstr_t *)pc->pc_map, 0, sizeof(pc->pc_map) * NBBY, &free); if (free == 0) break; avail += free; if (avail >= needed) break; } for (reclaimed = false; avail < needed; avail += _NPCPV) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { m = reclaim_pv_chunk(pmap, lockp); if (m == NULL) goto retry; reclaimed = true; } /* XXX PV STATS */ #if 0 dump_add_page(m->phys_addr); #endif pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0; pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru); /* * The reclaim might have freed a chunk from the current pmap. * If that chunk contained available entries, we need to * re-count the number of available entries. */ if (reclaimed) goto retry; } if (!TAILQ_EMPTY(&new_tail)) { mtx_lock(&pv_chunks_mutex); TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru); mtx_unlock(&pv_chunks_mutex); } } /* * First find and then remove the pv entry for the specified pmap and virtual * address from the specified pv list. Returns the pv entry if found and NULL * otherwise. This operation can be performed on pv lists for either 4KB or * 2MB page mappings. */ static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_LOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; break; } } return (pv); } /* * First find and then destroy the pv entry for the specified pmap and virtual * address. This operation can be performed on pv lists for either 4KB or 2MB * page mappings. */ static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va)); free_pv_entry(pmap, pv); } /* * Conditionally create the PV entry for a 4KB page mapping if the required * memory can be allocated without resorting to reclamation. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, struct rwlock **lockp) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, NULL)) != NULL) { pv->pv_va = va; CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); } else return (FALSE); } /* * After demotion from a 2MB page mapping to 512 4KB page mappings, * destroy the pv entry for the 2MB page mapping and reinstantiate the pv * entries for each of the 4KB page mappings. */ static void __unused pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; struct pv_chunk *pc; pv_entry_t pv; vm_page_t m; vm_offset_t va_last; int bit, field; rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ pvh = pa_to_pvh(pa); va &= ~L2_OFFSET; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; /* Instantiate the remaining 511 pv entries. */ va_last = va + L2_SIZE - PAGE_SIZE; for (;;) { pc = TAILQ_FIRST(&pmap->pm_pvchunk); KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare")); for (field = 0; field < _NPCM; field++) { while (pc->pc_map[field] != 0) { bit = ffsl(pc->pc_map[field]) - 1; pc->pc_map[field] &= ~(1ul << bit); pv = &pc->pc_pventry[field * 64 + bit]; va += PAGE_SIZE; pv->pv_va = va; m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_l2: page %p is not managed", m)); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (va == va_last) goto out; } } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } out: if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } /* XXX PV stats */ } #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_page_t m; vm_offset_t va_last; rw_assert(&pvh_global_lock, RA_LOCKED); KASSERT((va & L2_OFFSET) == 0, ("pmap_pv_promote_l2: misaligned va %#lx", va)); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); m = PHYS_TO_VM_PAGE(pa); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va)); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; va_last = va + L2_SIZE - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ /* * Create the PV entry for a 2MB page mapping. Always returns true unless the * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns * false if the PV entry cannot be allocated without resorting to reclamation. */ static bool pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags, struct rwlock **lockp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* Pass NULL instead of the lock pointer to disable reclamation. */ if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? NULL : lockp)) == NULL) return (false); pv->pv_va = va; pa = PTE_TO_PHYS(l2e); CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); } static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va) { pt_entry_t newl2, oldl2; vm_page_t ml3; vm_paddr_t ml3pa; KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va)); KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); ml3 = pmap_remove_pt_page(pmap, va); if (ml3 == NULL) panic("pmap_remove_kernel_l2: Missing pt page"); ml3pa = VM_PAGE_TO_PHYS(ml3); newl2 = ml3pa | PTE_V; /* * If this page table page was unmapped by a promotion, then it * contains valid mappings. Zero it to invalidate those mappings. */ if (ml3->valid != 0) pagezero((void *)PHYS_TO_DMAP(ml3pa)); /* * Demote the mapping. */ oldl2 = pmap_load_store(l2, newl2); KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx", __func__, l2, oldl2)); } /* * pmap_remove_l2: Do the things to unmap a level 2 superpage. */ static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pd_entry_t l1e, struct spglist *free, struct rwlock **lockp) { struct md_page *pvh; pt_entry_t oldl2; vm_offset_t eva, va; vm_page_t m, ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned")); oldl2 = pmap_load_clear(l2); KASSERT((oldl2 & PTE_RWX) != 0, ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2)); /* * The sfence.vma documentation states that it is sufficient to specify * a single address within a superpage mapping. However, since we do * not perform any invalidation upon promotion, TLBs may still be * caching 4KB mappings within the superpage, so we must invalidate the * entire range. */ pmap_invalidate_range(pmap, sva, sva + L2_SIZE); if ((oldl2 & PTE_SW_WIRED) != 0) pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE; pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE); if ((oldl2 & PTE_SW_MANAGED) != 0) { CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2)); pvh = pa_to_pvh(PTE_TO_PHYS(oldl2)); pmap_pvh_free(pvh, pmap, sva); eva = sva + L2_SIZE; for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2)); va < eva; va += PAGE_SIZE, m++) { if ((oldl2 & PTE_D) != 0) vm_page_dirty(m); if ((oldl2 & PTE_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_l2(pmap, l2, sva); } else { ml3 = pmap_remove_pt_page(pmap, sva); if (ml3 != NULL) { KASSERT(ml3->valid == VM_PAGE_BITS_ALL, ("pmap_remove_l2: l3 page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(ml3->wire_count == Ln_ENTRIES, ("pmap_remove_l2: l3 page wire count error")); ml3->wire_count = 1; vm_page_unwire_noq(ml3); pmap_add_delayed_free_list(ml3, free, FALSE); } } return (pmap_unuse_pt(pmap, sva, l1e, free)); } /* * pmap_remove_l3: do the things to unmap a page in a process */ static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va, pd_entry_t l2e, struct spglist *free, struct rwlock **lockp) { pt_entry_t old_l3; vm_paddr_t phys; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); old_l3 = pmap_load_clear(l3); pmap_invalidate_page(pmap, va); if (old_l3 & PTE_SW_WIRED) pmap->pm_stats.wired_count -= 1; pmap_resident_count_dec(pmap, 1); if (old_l3 & PTE_SW_MANAGED) { phys = PTE_TO_PHYS(old_l3); m = PHYS_TO_VM_PAGE(phys); if ((old_l3 & PTE_D) != 0) vm_page_dirty(m); if (old_l3 & PTE_A) vm_page_aflag_set(m, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); pmap_pvh_free(&m->md, pmap, va); } return (pmap_unuse_pt(pmap, va, l2e, free)); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { struct spglist free; struct rwlock *lock; vm_offset_t va, va_next; pd_entry_t *l1, *l2, l2e; pt_entry_t *l3; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; SLIST_INIT(&free); rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); lock = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; l1 = pmap_l1(pmap, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } /* * Calculate index for next page table. */ va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL) continue; if ((l2e = pmap_load(l2)) == 0) continue; if ((l2e & PTE_RWX) != 0) { if (sva + L2_SIZE == va_next && eva >= va_next) { (void)pmap_remove_l2(pmap, l2, sva, pmap_load(l1), &free, &lock); continue; } else if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) { /* * The large page mapping was destroyed. */ continue; } l2e = pmap_load(l2); } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; va = va_next; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if (pmap_load(l3) == 0) { if (va != va_next) { pmap_invalidate_range(pmap, va, sva); va = va_next; } continue; } if (va == va_next) va = sva; if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) { sva += L3_SIZE; break; } } if (va != va_next) pmap_invalidate_range(pmap, va, sva); } if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, false); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct spglist free; struct md_page *pvh; pmap_t pmap; pt_entry_t *l3, l3e; pd_entry_t *l2, l2e; pv_entry_t pv; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); rw_wlock(&pvh_global_lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); va = pv->pv_va; l2 = pmap_l2(pmap, va); (void)pmap_demote_l2(pmap, l2, va); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap_resident_count_dec(pmap, 1); l2 = pmap_l2(pmap, pv->pv_va); KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found")); l2e = pmap_load(l2); KASSERT((l2e & PTE_RX) == 0, ("pmap_remove_all: found a superpage in %p's pv list", m)); l3 = pmap_l2_to_l3(l2, pv->pv_va); l3e = pmap_load_clear(l3); pmap_invalidate_page(pmap, pv->pv_va); if (l3e & PTE_SW_WIRED) pmap->pm_stats.wired_count--; if ((l3e & PTE_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((l3e & PTE_D) != 0) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); rw_wunlock(&pvh_global_lock); vm_page_free_pages_toq(&free, false); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { pd_entry_t *l1, *l2, l2e; pt_entry_t *l3, l3e, mask; vm_page_t m, mt; vm_paddr_t pa; vm_offset_t va_next; bool anychanged, pv_lists_locked; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) == (VM_PROT_WRITE | VM_PROT_EXECUTE)) return; anychanged = false; pv_lists_locked = false; mask = 0; if ((prot & VM_PROT_WRITE) == 0) mask |= PTE_W | PTE_D; if ((prot & VM_PROT_EXECUTE) == 0) mask |= PTE_X; resume: PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l1 = pmap_l1(pmap, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if (l2 == NULL || (l2e = pmap_load(l2)) == 0) continue; if ((l2e & PTE_RWX) != 0) { if (sva + L2_SIZE == va_next && eva >= va_next) { retryl2: if ((prot & VM_PROT_WRITE) == 0 && (l2e & (PTE_SW_MANAGED | PTE_D)) == (PTE_SW_MANAGED | PTE_D)) { pa = PTE_TO_PHYS(l2e); m = PHYS_TO_VM_PAGE(pa); for (mt = m; mt < &m[Ln_ENTRIES]; mt++) vm_page_dirty(mt); } if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask)) goto retryl2; anychanged = true; } else { if (!pv_lists_locked) { pv_lists_locked = true; if (!rw_try_rlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all( pmap); PMAP_UNLOCK(pmap); rw_rlock(&pvh_global_lock); goto resume; } } if (!pmap_demote_l2(pmap, l2, sva)) { /* * The large page mapping was destroyed. */ continue; } } } if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { l3e = pmap_load(l3); retryl3: if ((l3e & PTE_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0 && (l3e & (PTE_SW_MANAGED | PTE_D)) == (PTE_SW_MANAGED | PTE_D)) { m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e)); vm_page_dirty(m); } if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask)) goto retryl3; anychanged = true; } } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } int pmap_fault_fixup(pmap_t pmap, vm_offset_t va, vm_prot_t ftype) { pd_entry_t *l2, l2e; pt_entry_t bits, *pte, oldpte; int rv; rv = 0; PMAP_LOCK(pmap); l2 = pmap_l2(pmap, va); if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0) goto done; if ((l2e & PTE_RWX) == 0) { pte = pmap_l2_to_l3(l2, va); if (pte == NULL || ((oldpte = pmap_load(pte) & PTE_V)) == 0) goto done; } else { pte = l2; oldpte = l2e; } if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) || (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) || (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) || (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0)) goto done; bits = PTE_A; if (ftype == VM_PROT_WRITE) bits |= PTE_D; /* * Spurious faults can occur if the implementation caches invalid * entries in the TLB, or if simultaneous accesses on multiple CPUs * race with each other. */ if ((oldpte & bits) != bits) pmap_store_bits(pte, bits); sfence_vma(); rv = 1; done: PMAP_UNLOCK(pmap); return (rv); } static bool pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va) { struct rwlock *lock; bool rv; lock = NULL; rv = pmap_demote_l2_locked(pmap, l2, va, &lock); if (lock != NULL) rw_wunlock(lock); return (rv); } /* * Tries to demote a 2MB page mapping. If demotion fails, the 2MB page * mapping is invalidated. */ static bool pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { struct spglist free; vm_page_t mpte; pd_entry_t newl2, oldl2; pt_entry_t *firstl3, newl3; vm_paddr_t mptepa; int i; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldl2 = pmap_load(l2); KASSERT((oldl2 & PTE_RWX) != 0, ("pmap_demote_l2_locked: oldl2 is not a leaf entry")); if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == NULL) { if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc(NULL, pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); (void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET, pmap_load(pmap_l1(pmap, va)), &free, lockp); vm_page_free_pages_toq(&free, true); CTR2(KTR_PMAP, "pmap_demote_l2_locked: " "failure for va %#lx in pmap %p", va, pmap); return (false); } if (va < VM_MAXUSER_ADDRESS) { mpte->wire_count = Ln_ENTRIES; pmap_resident_count_inc(pmap, 1); } } mptepa = VM_PAGE_TO_PHYS(mpte); firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa); newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V; KASSERT((oldl2 & PTE_A) != 0, ("pmap_demote_l2_locked: oldl2 is missing PTE_A")); KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W, ("pmap_demote_l2_locked: oldl2 is missing PTE_D")); newl3 = oldl2; /* * If the page table page is not leftover from an earlier promotion, * initialize it. */ if (mpte->valid == 0) { for (i = 0; i < Ln_ENTRIES; i++) pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); } KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3), ("pmap_demote_l2_locked: firstl3 and newl3 map different physical " "addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE)) for (i = 0; i < Ln_ENTRIES; i++) pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S)); /* * The spare PV entries must be reserved prior to demoting the * mapping, that is, prior to changing the L2 entry. Otherwise, the * state of the L2 entry and the PV lists will be inconsistent, which * can result in reclaim_pv_chunk() attempting to remove a PV entry from * the wrong PV list and pmap_pv_demote_l2() failing to find the * expected PV entry for the 2MB page mapping that is being demoted. */ if ((oldl2 & PTE_SW_MANAGED) != 0) reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp); /* * Demote the mapping. */ pmap_store(l2, newl2); /* * Demote the PV entry. */ if ((oldl2 & PTE_SW_MANAGED) != 0) pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp); atomic_add_long(&pmap_l2_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p", va, pmap); return (true); } #if VM_NRESERVLEVEL > 0 static void pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, struct rwlock **lockp) { pt_entry_t *firstl3, *l3; vm_paddr_t pa; vm_page_t ml3; PMAP_LOCK_ASSERT(pmap, MA_OWNED); va &= ~L2_OFFSET; KASSERT((pmap_load(l2) & PTE_RWX) == 0, ("pmap_promote_l2: invalid l2 entry %p", l2)); firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2))); pa = PTE_TO_PHYS(pmap_load(firstl3)); if ((pa & L2_OFFSET) != 0) { CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", va, pmap); atomic_add_long(&pmap_l2_p_failures, 1); return; } pa += PAGE_SIZE; for (l3 = firstl3 + 1; l3 < firstl3 + Ln_ENTRIES; l3++) { if (PTE_TO_PHYS(pmap_load(l3)) != pa) { CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", va, pmap); atomic_add_long(&pmap_l2_p_failures, 1); return; } if ((pmap_load(l3) & PTE_PROMOTE) != (pmap_load(firstl3) & PTE_PROMOTE)) { CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", va, pmap); atomic_add_long(&pmap_l2_p_failures, 1); return; } pa += PAGE_SIZE; } ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); KASSERT(ml3->pindex == pmap_l2_pindex(va), ("pmap_promote_l2: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, ml3, true)) { CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p", va, pmap); atomic_add_long(&pmap_l2_p_failures, 1); return; } if ((pmap_load(firstl3) & PTE_SW_MANAGED) != 0) pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(pmap_load(firstl3)), lockp); pmap_store(l2, pmap_load(firstl3)); atomic_add_long(&pmap_l2_promotions, 1); CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va, pmap); } #endif /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { struct rwlock *lock; pd_entry_t *l1, *l2, l2e; pt_entry_t new_l3, orig_l3; pt_entry_t *l3; pv_entry_t pv; vm_paddr_t opa, pa, l2_pa, l3_pa; vm_page_t mpte, om, l2_m, l3_m; pt_entry_t entry; pn_t l2_pn, l3_pn, pn; int rv; bool nosleep; va = trunc_page(va); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); pa = VM_PAGE_TO_PHYS(m); pn = (pa / PAGE_SIZE); new_l3 = PTE_V | PTE_R | PTE_A; if (prot & VM_PROT_EXECUTE) new_l3 |= PTE_X; if (flags & VM_PROT_WRITE) new_l3 |= PTE_D; if (prot & VM_PROT_WRITE) new_l3 |= PTE_W; if (va < VM_MAX_USER_ADDRESS) new_l3 |= PTE_U; new_l3 |= (pn << PTE_PPN0_S); if ((flags & PMAP_ENTER_WIRED) != 0) new_l3 |= PTE_SW_WIRED; /* * Set modified bit gratuitously for writeable mappings if * the page is unmanaged. We do not want to take a fault * to do the dirty bit accounting for these mappings. */ if ((m->oflags & VPO_UNMANAGED) != 0) { if (prot & VM_PROT_WRITE) new_l3 |= PTE_D; } else new_l3 |= PTE_SW_MANAGED; CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); lock = NULL; mpte = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va %#lx unaligned", va)); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock); goto out; } l2 = pmap_l2(pmap, va); if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2, va, &lock))) { l3 = pmap_l2_to_l3(l2, va); if (va < VM_MAXUSER_ADDRESS) { mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); mpte->wire_count++; } } else if (va < VM_MAXUSER_ADDRESS) { nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock); if (mpte == NULL && nosleep) { CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } l3 = pmap_l3(pmap, va); } else { l3 = pmap_l3(pmap, va); /* TODO: This is not optimal, but should mostly work */ if (l3 == NULL) { if (l2 == NULL) { l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (l2_m == NULL) panic("pmap_enter: l2 pte_m == NULL"); if ((l2_m->flags & PG_ZERO) == 0) pmap_zero_page(l2_m); l2_pa = VM_PAGE_TO_PHYS(l2_m); l2_pn = (l2_pa / PAGE_SIZE); l1 = pmap_l1(pmap, va); entry = (PTE_V); entry |= (l2_pn << PTE_PPN0_S); pmap_store(l1, entry); pmap_distribute_l1(pmap, pmap_l1_index(va), entry); l2 = pmap_l1_to_l2(l1, va); } l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (l3_m == NULL) panic("pmap_enter: l3 pte_m == NULL"); if ((l3_m->flags & PG_ZERO) == 0) pmap_zero_page(l3_m); l3_pa = VM_PAGE_TO_PHYS(l3_m); l3_pn = (l3_pa / PAGE_SIZE); entry = (PTE_V); entry |= (l3_pn << PTE_PPN0_S); pmap_store(l2, entry); l3 = pmap_l2_to_l3(l2, va); } pmap_invalidate_page(pmap, va); } orig_l3 = pmap_load(l3); opa = PTE_TO_PHYS(orig_l3); pv = NULL; /* * Is the specified virtual address already mapped? */ if ((orig_l3 & PTE_V) != 0) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if ((flags & PMAP_ENTER_WIRED) != 0 && (orig_l3 & PTE_SW_WIRED) == 0) pmap->pm_stats.wired_count++; else if ((flags & PMAP_ENTER_WIRED) == 0 && (orig_l3 & PTE_SW_WIRED) != 0) pmap->pm_stats.wired_count--; /* * Remove the extra PT page reference. */ if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } /* * Has the physical page changed? */ if (opa == pa) { /* * No, might be a protection or wiring change. */ if ((orig_l3 & PTE_SW_MANAGED) != 0 && (new_l3 & PTE_W) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); goto validate; } /* * The physical page has changed. Temporarily invalidate * the mapping. This ensures that all threads sharing the * pmap keep a consistent view of the mapping, which is * necessary for the correct handling of COW faults. It * also permits reuse of the old mapping's PV entry, * avoiding an allocation. * * For consistency, handle unmanaged mappings the same way. */ orig_l3 = pmap_load_clear(l3); KASSERT(PTE_TO_PHYS(orig_l3) == opa, ("pmap_enter: unexpected pa update for %#lx", va)); if ((orig_l3 & PTE_SW_MANAGED) != 0) { om = PHYS_TO_VM_PAGE(opa); /* * The pmap lock is sufficient to synchronize with * concurrent calls to pmap_page_test_mappings() and * pmap_ts_referenced(). */ if ((orig_l3 & PTE_D) != 0) vm_page_dirty(om); if ((orig_l3 & PTE_A) != 0) vm_page_aflag_set(om, PGA_REFERENCED); CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); pv = pmap_pvh_remove(&om->md, pmap, va); KASSERT(pv != NULL, ("pmap_enter: no PV entry for %#lx", va)); if ((new_l3 & PTE_SW_MANAGED) == 0) free_pv_entry(pmap, pv); if ((om->aflags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list)) vm_page_aflag_clear(om, PGA_WRITEABLE); } pmap_invalidate_page(pmap, va); orig_l3 = 0; } else { /* * Increment the counters. */ if ((new_l3 & PTE_SW_WIRED) != 0) pmap->pm_stats.wired_count++; pmap_resident_count_inc(pmap, 1); } /* * Enter on the PV list if part of our managed memory. */ if ((new_l3 & PTE_SW_MANAGED) != 0) { if (pv == NULL) { pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((new_l3 & PTE_W) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } validate: /* * Sync the i-cache on all harts before updating the PTE * if the new PTE is executable. */ if (prot & VM_PROT_EXECUTE) pmap_sync_icache(pmap, va, PAGE_SIZE); /* * Update the L3 entry. */ if (orig_l3 != 0) { orig_l3 = pmap_load_store(l3, new_l3); pmap_invalidate_page(pmap, va); KASSERT(PTE_TO_PHYS(orig_l3) == pa, ("pmap_enter: invalid update")); if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) == (PTE_D | PTE_SW_MANAGED)) vm_page_dirty(m); } else { pmap_store(l3, new_l3); } #if VM_NRESERVLEVEL > 0 if (mpte != NULL && mpte->wire_count == Ln_ENTRIES && pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_l2(pmap, l2, va, &lock); #endif rv = KERN_SUCCESS; out: if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (rv); } /* * Tries to create a read- and/or execute-only 2MB page mapping. Returns true * if successful. Returns false if (1) a page table page cannot be allocated * without sleeping, (2) a mapping already exists at the specified virtual * address, or (3) a PV entry cannot be allocated without reclaiming another * PV entry. */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, struct rwlock **lockp) { pd_entry_t new_l2; pn_t pn; PMAP_LOCK_ASSERT(pmap, MA_OWNED); pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE; new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V); if ((m->oflags & VPO_UNMANAGED) == 0) new_l2 |= PTE_SW_MANAGED; if ((prot & VM_PROT_EXECUTE) != 0) new_l2 |= PTE_X; if (va < VM_MAXUSER_ADDRESS) new_l2 |= PTE_U; return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == KERN_SUCCESS); } /* * Tries to create the specified 2MB page mapping. Returns KERN_SUCCESS if * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags, vm_page_t m, struct rwlock **lockp) { struct spglist free; pd_entry_t *l2, *l3, oldl2; vm_offset_t sva; vm_page_t l2pg, mt; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg)); l2 = &l2[pmap_l2_index(va)]; if ((oldl2 = pmap_load(l2)) != 0) { KASSERT(l2pg->wire_count > 1, ("pmap_enter_l2: l2pg's wire count is too low")); if ((flags & PMAP_ENTER_NOREPLACE) != 0) { l2pg->wire_count--; CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_FAILURE); } SLIST_INIT(&free); if ((oldl2 & PTE_RWX) != 0) (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free, lockp); else for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) { l3 = pmap_l2_to_l3(l2, sva); if ((pmap_load(l3) & PTE_V) != 0 && pmap_remove_l3(pmap, l3, sva, oldl2, &free, lockp) != 0) break; } vm_page_free_pages_toq(&free, true); if (va >= VM_MAXUSER_ADDRESS) { /* * Both pmap_remove_l2() and pmap_remove_l3() will * leave the kernel page table page zero filled. */ mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2))); if (pmap_insert_pt_page(pmap, mt, false)) panic("pmap_enter_l2: trie insert failed"); } else KASSERT(pmap_load(l2) == 0, ("pmap_enter_l2: non-zero L2 entry %p", l2)); } if ((new_l2 & PTE_SW_MANAGED) != 0) { /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, l2pg, &free)) { /* * Although "va" is not mapped, paging-structure * caches could nonetheless have entries that * refer to the freed page table pages. * Invalidate those entries. */ pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, true); } CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); } if ((new_l2 & PTE_W) != 0) for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) vm_page_aflag_set(mt, PGA_WRITEABLE); } /* * Increment counters. */ if ((new_l2 & PTE_SW_WIRED) != 0) pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE; pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE; /* * Map the superpage. */ pmap_store(l2, new_l2); atomic_add_long(&pmap_l2_mappings, 1); CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p", va, pmap); return (KERN_SUCCESS); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { struct rwlock *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; lock = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_enter_2mpage(pmap, va, m, prot, &lock)) m = &m[L2_SIZE / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte, &lock); m = TAILQ_NEXT(m, listq); } if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct rwlock *lock; lock = NULL; rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) rw_wunlock(lock); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) { struct spglist free; vm_paddr_t phys; pd_entry_t *l2; pt_entry_t *l3, newl3; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_LOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t l2pindex; /* * Calculate pagetable page index */ l2pindex = pmap_l2_pindex(va); if (mpte && (mpte->pindex == l2pindex)) { mpte->wire_count++; } else { /* * Get the l2 entry */ l2 = pmap_l2(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. Otherwise, we * attempt to allocate a page table page. If this * attempt fails, we don't retry. Instead, we give up. */ if (l2 != NULL && pmap_load(l2) != 0) { phys = PTE_TO_PHYS(pmap_load(l2)); mpte = PHYS_TO_VM_PAGE(phys); mpte->wire_count++; } else { /* * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. */ mpte = _pmap_alloc_l3(pmap, l2pindex, NULL); if (mpte == NULL) return (mpte); } } l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte)); l3 = &l3[pmap_l3_index(va)]; } else { mpte = NULL; l3 = pmap_l3(kernel_pmap, va); } if (l3 == NULL) panic("pmap_enter_quick_locked: No l3"); if (pmap_load(l3) != 0) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, va, mpte, &free)) { pmap_invalidate_page(pmap, va); vm_page_free_pages_toq(&free, false); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap_resident_count_inc(pmap, 1); newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) | PTE_V | PTE_R; if ((prot & VM_PROT_EXECUTE) != 0) newl3 |= PTE_X; if ((m->oflags & VPO_UNMANAGED) == 0) newl3 |= PTE_SW_MANAGED; if (va < VM_MAX_USER_ADDRESS) newl3 |= PTE_U; /* * Sync the i-cache on all harts before updating the PTE * if the new PTE is executable. */ if (prot & VM_PROT_EXECUTE) pmap_sync_icache(pmap, va, PAGE_SIZE); pmap_store(l3, newl3); pmap_invalidate_page(pmap, va); return (mpte); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pd_entry_t *l1, *l2, l2e; pt_entry_t *l3, l3e; bool pv_lists_locked; pv_lists_locked = false; retry: PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { l1 = pmap_l1(pmap, sva); if (pmap_load(l1) == 0) { va_next = (sva + L1_SIZE) & ~L1_OFFSET; if (va_next < sva) va_next = eva; continue; } va_next = (sva + L2_SIZE) & ~L2_OFFSET; if (va_next < sva) va_next = eva; l2 = pmap_l1_to_l2(l1, sva); if ((l2e = pmap_load(l2)) == 0) continue; if ((l2e & PTE_RWX) != 0) { if (sva + L2_SIZE == va_next && eva >= va_next) { if ((l2e & PTE_SW_WIRED) == 0) panic("pmap_unwire: l2 %#jx is missing " "PTE_SW_WIRED", (uintmax_t)l2e); pmap_clear_bits(l2, PTE_SW_WIRED); continue; } else { if (!pv_lists_locked) { pv_lists_locked = true; if (!rw_try_rlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); rw_rlock(&pvh_global_lock); /* Repeat sva. */ goto retry; } } if (!pmap_demote_l2(pmap, l2, sva)) panic("pmap_unwire: demotion failed"); } } if (va_next > eva) va_next = eva; for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++, sva += L3_SIZE) { if ((l3e = pmap_load(l3)) == 0) continue; if ((l3e & PTE_SW_WIRED) == 0) panic("pmap_unwire: l3 %#jx is missing " "PTE_SW_WIRED", (uintmax_t)l3e); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. */ pmap_clear_bits(l3, PTE_SW_WIRED); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { void *a_cp, *b_cp; vm_page_t m_a, m_b; vm_paddr_t p_a, p_b; vm_offset_t a_pg_offset, b_pg_offset; int cnt; while (xfersize > 0) { a_pg_offset = a_offset & PAGE_MASK; m_a = ma[a_offset >> PAGE_SHIFT]; p_a = m_a->phys_addr; b_pg_offset = b_offset & PAGE_MASK; m_b = mb[b_offset >> PAGE_SHIFT]; p_b = m_b->phys_addr; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); cnt = min(cnt, PAGE_SIZE - b_pg_offset); if (__predict_false(!PHYS_IN_DMAP(p_a))) { panic("!DMAP a %lx", p_a); } else { a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset; } if (__predict_false(!PHYS_IN_DMAP(p_b))) { panic("!DMAP b %lx", p_b); } else { b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset; } bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } } vm_offset_t pmap_quick_enter_page(vm_page_t m) { return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m))); } void pmap_quick_remove_page(vm_offset_t addr) { } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_runlock(lock); rw_runlock(&pvh_global_lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pd_entry_t *l2; pt_entry_t *l3; pv_entry_t pv; int count, md_gen, pvh_gen; if ((m->oflags & VPO_UNMANAGED) != 0) return (0); rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l3 = pmap_l3(pmap, pv->pv_va); if ((pmap_load(l3) & PTE_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); if ((pmap_load(l2) & PTE_SW_WIRED) != 0) count++; PMAP_UNLOCK(pmap); } } rw_runlock(lock); rw_runlock(&pvh_global_lock); return (count); } static void pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv, struct spglist *free, bool superpage) { struct md_page *pvh; vm_page_t mpte, mt; if (superpage) { pmap_resident_count_dec(pmap, Ln_ENTRIES); pvh = pa_to_pvh(m->phys_addr); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[Ln_ENTRIES]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list) && (mt->aflags & PGA_WRITEABLE) != 0) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, ("pmap_remove_pages: pte page not promoted")); pmap_resident_count_dec(pmap, 1); KASSERT(mpte->wire_count == Ln_ENTRIES, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); } } else { pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if (TAILQ_EMPTY(&m->md.pv_list) && (m->aflags & PGA_WRITEABLE) != 0) { pvh = pa_to_pvh(m->phys_addr); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } } /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the * caller. * * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways * that make it faster than pmap_remove(). First, it can more quickly * destroy mappings by iterating over the pmap's collection of PV * entries, rather than searching the page table. Second, it doesn't * have to test and clear the page table entries atomically, because * no processor is currently accessing the user address space. In * particular, a page table entry's dirty bit won't change state once * this function starts. */ void pmap_remove_pages(pmap_t pmap) { struct spglist free; pd_entry_t ptepde; pt_entry_t *pte, tpte; vm_page_t m, mt; pv_entry_t pv; struct pv_chunk *pc, *npc; struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; int allfree, field, freed, idx; bool superpage; lock = NULL; SLIST_INIT(&free); rw_rlock(&pvh_global_lock); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; freed = 0; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = ffsl(inuse) - 1; bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_l1(pmap, pv->pv_va); ptepde = pmap_load(pte); pte = pmap_l1_to_l2(pte, pv->pv_va); tpte = pmap_load(pte); if ((tpte & PTE_RWX) != 0) { superpage = true; } else { ptepde = tpte; pte = pmap_l2_to_l3(pte, pv->pv_va); tpte = pmap_load(pte); superpage = false; } /* * We cannot remove wired pages from a * process' mapping at this time. */ if (tpte & PTE_SW_WIRED) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad pte %#jx", (uintmax_t)tpte)); pmap_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) { if (superpage) for (mt = m; mt < &m[Ln_ENTRIES]; mt++) vm_page_dirty(mt); else vm_page_dirty(m); } CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); /* Mark free */ pc->pc_map[field] |= bitmask; pmap_remove_pages_pv(pmap, m, pv, &free, superpage); pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); freed++; } } PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } if (lock != NULL) rw_wunlock(lock); pmap_invalidate_all(pmap); rw_runlock(&pvh_global_lock); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, false); } static bool pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { struct md_page *pvh; struct rwlock *lock; pd_entry_t *l2; pt_entry_t *l3, mask; pv_entry_t pv; pmap_t pmap; int md_gen, pvh_gen; bool rv; mask = 0; if (modified) mask |= PTE_D; if (accessed) mask |= PTE_A; rv = FALSE; rw_rlock(&pvh_global_lock); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l3 = pmap_l3(pmap, pv->pv_va); rv = (pmap_load(l3) & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } if ((m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_runlock(lock); PMAP_LOCK(pmap); rw_rlock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); rv = (pmap_load(l2) & mask) == mask; PMAP_UNLOCK(pmap); if (rv) goto out; } } out: rw_runlock(lock); rw_runlock(&pvh_global_lock); return (rv); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); return (pmap_page_test_mappings(m, FALSE, TRUE)); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *l3; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); l3 = pmap_l3(pmap, addr); if (l3 != NULL && pmap_load(l3) != 0) { rv = TRUE; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); return (pmap_page_test_mappings(m, TRUE, FALSE)); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pd_entry_t *l2; pt_entry_t *l3, oldl3, newl3; pv_entry_t next_pv, pv; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; lock = VM_PAGE_TO_PV_LIST_LOCK(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); rw_rlock(&pvh_global_lock); retry_pv_loop: rw_wlock(lock); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); if ((pmap_load(l2) & PTE_W) != 0) (void)pmap_demote_l2_locked(pmap, l2, va, &lock); KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), ("inconsistent pv lock %p %p for page %p", lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); rw_wunlock(lock); goto retry_pv_loop; } } l3 = pmap_l3(pmap, pv->pv_va); oldl3 = pmap_load(l3); retry: if ((oldl3 & PTE_W) != 0) { newl3 = oldl3 & ~(PTE_D | PTE_W); if (!atomic_fcmpset_long(l3, &oldl3, newl3)) goto retry; if ((oldl3 & PTE_D) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); rw_runlock(&pvh_global_lock); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct spglist free; struct md_page *pvh; struct rwlock *lock; pv_entry_t pv, pvf; pmap_t pmap; pd_entry_t *l2, l2e; pt_entry_t *l3, l3e; vm_paddr_t pa; vm_offset_t va; int cleared, md_gen, not_cleared, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa); lock = PHYS_TO_PV_LIST_LOCK(pa); rw_rlock(&pvh_global_lock); rw_wlock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); l2e = pmap_load(l2); if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) { /* * Although l2e is mapping a 2MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((l2e & PTE_A) != 0) { /* * Since this reference bit is shared by 512 4KB * pages, it should not be cleared every time it is * tested. Apply a simple "hash" function on the * physical page number, the virtual superpage number, * and the pmap address to select one 4KB page out of * the 512 on which testing the reference bit will * result in clearing that reference bit. This * function is designed to avoid the selection of the * same 4KB page for every 2MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^ (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 && (l2e & PTE_SW_WIRED) == 0) { pmap_clear_bits(l2, PTE_A); pmap_invalidate_page(pmap, va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; } if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; } } l2 = pmap_l2(pmap, pv->pv_va); KASSERT((pmap_load(l2) & PTE_RX) == 0, ("pmap_ts_referenced: found an invalid l2 table")); l3 = pmap_l2_to_l3(l2, pv->pv_va); l3e = pmap_load(l3); if ((l3e & PTE_D) != 0) vm_page_dirty(m); if ((l3e & PTE_A) != 0) { if ((l3e & PTE_SW_WIRED) == 0) { /* * Wired pages cannot be paged out so * doing accessed bit emulation for * them is wasted effort. We do the * hard work for unwired pages only. */ pmap_clear_bits(l3, PTE_A); pmap_invalidate_page(pmap, pv->pv_va); cleared++; } else not_cleared++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: rw_wunlock(lock); rw_runlock(&pvh_global_lock); vm_page_free_pages_toq(&free, false); return (cleared + not_cleared); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; struct rwlock *lock; pmap_t pmap; pv_entry_t next_pv, pv; pd_entry_t *l2, oldl2; - pt_entry_t *l3, oldl3; + pt_entry_t *l3; vm_offset_t va; int md_gen, pvh_gen; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); rw_rlock(&pvh_global_lock); rw_wlock(lock); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } va = pv->pv_va; l2 = pmap_l2(pmap, va); oldl2 = pmap_load(l2); - if ((oldl2 & PTE_W) != 0) { - if (pmap_demote_l2_locked(pmap, l2, va, &lock)) { - if ((oldl2 & PTE_SW_WIRED) == 0) { - /* - * Write protect the mapping to a - * single page so that a subsequent - * write access may repromote. - */ - va += VM_PAGE_TO_PHYS(m) - - PTE_TO_PHYS(oldl2); - l3 = pmap_l2_to_l3(l2, va); - oldl3 = pmap_load(l3); - if ((oldl3 & PTE_V) != 0) { - while (!atomic_fcmpset_long(l3, - &oldl3, oldl3 & ~(PTE_D | - PTE_W))) - cpu_spinwait(); - vm_page_dirty(m); - pmap_invalidate_page(pmap, va); - } - } - } + /* If oldl2 has PTE_W set, then it also has PTE_D set. */ + if ((oldl2 & PTE_W) != 0 && + pmap_demote_l2_locked(pmap, l2, va, &lock) && + (oldl2 & PTE_SW_WIRED) == 0) { + /* + * Write protect the mapping to a single page so that + * a subsequent write access may repromote. + */ + va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2); + l3 = pmap_l2_to_l3(l2, va); + pmap_clear_bits(l3, PTE_D | PTE_W); + vm_page_dirty(m); + pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; rw_wunlock(lock); PMAP_LOCK(pmap); rw_wlock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; } } l2 = pmap_l2(pmap, pv->pv_va); KASSERT((pmap_load(l2) & PTE_RWX) == 0, ("pmap_clear_modify: found a 2mpage in page %p's pv list", m)); l3 = pmap_l2_to_l3(l2, pv->pv_va); if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) { pmap_clear_bits(l3, PTE_D | PTE_W); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } rw_wunlock(lock); rw_runlock(&pvh_global_lock); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return ((void *)PHYS_TO_DMAP(pa)); } void pmap_unmapbios(vm_paddr_t pa, vm_size_t size) { } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pv_memattr = ma; } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pt_entry_t *l2, *l3, tpte; vm_paddr_t pa; int val; bool managed; PMAP_LOCK(pmap); retry: managed = false; val = 0; l2 = pmap_l2(pmap, addr); if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) { if ((tpte & PTE_RWX) != 0) { pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET); val = MINCORE_INCORE | MINCORE_SUPER; } else { l3 = pmap_l2_to_l3(l2, addr); tpte = pmap_load(l3); if ((tpte & PTE_V) == 0) goto done; pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET); val = MINCORE_INCORE; } if ((tpte & PTE_D) != 0) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((tpte & PTE_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED; } done: if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate_sw(struct thread *td) { pmap_t oldpmap, pmap; u_int hart; oldpmap = PCPU_GET(curpmap); pmap = vmspace_pmap(td->td_proc->p_vmspace); if (pmap == oldpmap) return; load_satp(pmap->pm_satp); hart = PCPU_GET(hart); #ifdef SMP CPU_SET_ATOMIC(hart, &pmap->pm_active); CPU_CLR_ATOMIC(hart, &oldpmap->pm_active); #else CPU_SET(hart, &pmap->pm_active); CPU_CLR(hart, &oldpmap->pm_active); #endif PCPU_SET(curpmap, pmap); sfence_vma(); } void pmap_activate(struct thread *td) { critical_enter(); pmap_activate_sw(td); critical_exit(); } void pmap_activate_boot(pmap_t pmap) { u_int hart; hart = PCPU_GET(hart); #ifdef SMP CPU_SET_ATOMIC(hart, &pmap->pm_active); #else CPU_SET(hart, &pmap->pm_active); #endif PCPU_SET(curpmap, pmap); } void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) { cpuset_t mask; /* * From the RISC-V User-Level ISA V2.2: * * "To make a store to instruction memory visible to all * RISC-V harts, the writing hart has to execute a data FENCE * before requesting that all remote RISC-V harts execute a * FENCE.I." */ sched_pin(); mask = all_harts; CPU_CLR(PCPU_GET(hart), &mask); fence(); if (!CPU_EMPTY(&mask) && smp_started) sbi_remote_fence_i(mask.__bits); sched_unpin(); } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < L2_SIZE) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & L2_OFFSET; if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE || (*addr & L2_OFFSET) == superpage_offset) return; if ((*addr & L2_OFFSET) < superpage_offset) *addr = (*addr & ~L2_OFFSET) + superpage_offset; else *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset; } /** * Get the kernel virtual address of a set of physical pages. If there are * physical addresses not covered by the DMAP perform a transient mapping * that will be removed when calling pmap_unmap_io_transient. * * \param page The pages the caller wishes to obtain the virtual * address on the kernel memory map. * \param vaddr On return contains the kernel virtual memory address * of the pages passed in the page parameter. * \param count Number of pages passed in. * \param can_fault TRUE if the thread using the mapped pages can take * page faults, FALSE otherwise. * * \returns TRUE if the caller must call pmap_unmap_io_transient when * finished or FALSE otherwise. * */ boolean_t pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; boolean_t needs_mapping; int error, i; /* * Allocate any KVA space that we need, this is done in a separate * loop to prevent calling vmem_alloc while pinned. */ needs_mapping = FALSE; for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) { error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK, &vaddr[i]); KASSERT(error == 0, ("vmem_alloc failed: %d", error)); needs_mapping = TRUE; } else { vaddr[i] = PHYS_TO_DMAP(paddr); } } /* Exit early if everything is covered by the DMAP */ if (!needs_mapping) return (FALSE); if (!can_fault) sched_pin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= DMAP_MAX_PHYSADDR) { panic( "pmap_map_io_transient: TODO: Map out of DMAP data"); } } return (needs_mapping); } void pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count, boolean_t can_fault) { vm_paddr_t paddr; int i; if (!can_fault) sched_unpin(); for (i = 0; i < count; i++) { paddr = VM_PAGE_TO_PHYS(page[i]); if (paddr >= DMAP_MAX_PHYSADDR) { panic("RISCVTODO: pmap_unmap_io_transient: Unmap data"); } } } boolean_t pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode) { return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_BACK); } bool pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2, pt_entry_t **l3) { pd_entry_t *l1p, *l2p; /* Get l1 directory entry. */ l1p = pmap_l1(pmap, va); *l1 = l1p; if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0) return (false); if ((pmap_load(l1p) & PTE_RX) != 0) { *l2 = NULL; *l3 = NULL; return (true); } /* Get l2 directory entry. */ l2p = pmap_l1_to_l2(l1p, va); *l2 = l2p; if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0) return (false); if ((pmap_load(l2p) & PTE_RX) != 0) { *l3 = NULL; return (true); } /* Get l3 page table entry. */ *l3 = pmap_l2_to_l3(l2p, va); return (true); } Index: projects/nfsv42/sys/sys/ata.h =================================================================== --- projects/nfsv42/sys/sys/ata.h (revision 350367) +++ projects/nfsv42/sys/sys/ata.h (revision 350368) @@ -1,1053 +1,1056 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000 - 2008 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_ATA_H_ #define _SYS_ATA_H_ #include /* ATA/ATAPI device parameters */ struct ata_params { /*000*/ u_int16_t config; /* configuration info */ #define ATA_PROTO_MASK 0x8003 #define ATA_PROTO_ATAPI 0x8000 #define ATA_PROTO_ATAPI_12 0x8000 #define ATA_PROTO_ATAPI_16 0x8001 #define ATA_PROTO_CFA 0x848a #define ATA_ATAPI_TYPE_MASK 0x1f00 #define ATA_ATAPI_TYPE_DIRECT 0x0000 /* disk/floppy */ #define ATA_ATAPI_TYPE_TAPE 0x0100 /* streaming tape */ #define ATA_ATAPI_TYPE_CDROM 0x0500 /* CD-ROM device */ #define ATA_ATAPI_TYPE_OPTICAL 0x0700 /* optical disk */ #define ATA_DRQ_MASK 0x0060 #define ATA_DRQ_SLOW 0x0000 /* cpu 3 ms delay */ #define ATA_DRQ_INTR 0x0020 /* interrupt 10 ms delay */ #define ATA_DRQ_FAST 0x0040 /* accel 50 us delay */ #define ATA_RESP_INCOMPLETE 0x0004 /*001*/ u_int16_t cylinders; /* # of cylinders */ /*002*/ u_int16_t specconf; /* specific configuration */ /*003*/ u_int16_t heads; /* # heads */ u_int16_t obsolete4; u_int16_t obsolete5; /*006*/ u_int16_t sectors; /* # sectors/track */ /*007*/ u_int16_t vendor7[3]; /*010*/ u_int8_t serial[20]; /* serial number */ /*020*/ u_int16_t retired20; u_int16_t retired21; u_int16_t obsolete22; /*023*/ u_int8_t revision[8]; /* firmware revision */ /*027*/ u_int8_t model[40]; /* model name */ /*047*/ u_int16_t sectors_intr; /* sectors per interrupt */ /*048*/ u_int16_t tcg; /* Trusted Computing Group */ #define ATA_SUPPORT_TCG 0x0001 /*049*/ u_int16_t capabilities1; #define ATA_SUPPORT_DMA 0x0100 #define ATA_SUPPORT_LBA 0x0200 #define ATA_SUPPORT_IORDYDIS 0x0400 #define ATA_SUPPORT_IORDY 0x0800 #define ATA_SUPPORT_OVERLAP 0x4000 /*050*/ u_int16_t capabilities2; /*051*/ u_int16_t retired_piomode; /* PIO modes 0-2 */ #define ATA_RETIRED_PIO_MASK 0x0300 /*052*/ u_int16_t retired_dmamode; /* DMA modes */ #define ATA_RETIRED_DMA_MASK 0x0003 /*053*/ u_int16_t atavalid; /* fields valid */ #define ATA_FLAG_54_58 0x0001 /* words 54-58 valid */ #define ATA_FLAG_64_70 0x0002 /* words 64-70 valid */ #define ATA_FLAG_88 0x0004 /* word 88 valid */ /*054*/ u_int16_t current_cylinders; /*055*/ u_int16_t current_heads; /*056*/ u_int16_t current_sectors; /*057*/ u_int16_t current_size_1; /*058*/ u_int16_t current_size_2; /*059*/ u_int16_t multi; #define ATA_SUPPORT_BLOCK_ERASE_EXT 0x8000 #define ATA_SUPPORT_OVERWRITE_EXT 0x4000 #define ATA_SUPPORT_CRYPTO_SCRAMBLE_EXT 0x2000 #define ATA_SUPPORT_SANITIZE 0x1000 +#define ATA_SUPPORT_SANITIZE_ALLOWED 0x0800 +#define ATA_SUPPORT_ANTIFREEZE_LOCK_EXT 0x0400 #define ATA_MULTI_VALID 0x0100 /*060*/ u_int16_t lba_size_1; u_int16_t lba_size_2; u_int16_t obsolete62; /*063*/ u_int16_t mwdmamodes; /* multiword DMA modes */ /*064*/ u_int16_t apiomodes; /* advanced PIO modes */ /*065*/ u_int16_t mwdmamin; /* min. M/W DMA time/word ns */ /*066*/ u_int16_t mwdmarec; /* rec. M/W DMA time ns */ /*067*/ u_int16_t pioblind; /* min. PIO cycle w/o flow */ /*068*/ u_int16_t pioiordy; /* min. PIO cycle IORDY flow */ /*069*/ u_int16_t support3; #define ATA_SUPPORT_RZAT 0x0020 #define ATA_SUPPORT_DRAT 0x4000 #define ATA_ENCRYPTS_ALL_USER_DATA 0x0010 /* Self-encrypting drive */ #define ATA_SUPPORT_ZONE_MASK 0x0003 #define ATA_SUPPORT_ZONE_NR 0x0000 #define ATA_SUPPORT_ZONE_HOST_AWARE 0x0001 #define ATA_SUPPORT_ZONE_DEV_MANAGED 0x0002 u_int16_t reserved70; /*071*/ u_int16_t rlsovlap; /* rel time (us) for overlap */ /*072*/ u_int16_t rlsservice; /* rel time (us) for service */ u_int16_t reserved73; u_int16_t reserved74; /*075*/ u_int16_t queue; #define ATA_QUEUE_LEN(x) ((x) & 0x001f) /*76*/ u_int16_t satacapabilities; #define ATA_SATA_GEN1 0x0002 #define ATA_SATA_GEN2 0x0004 #define ATA_SATA_GEN3 0x0008 #define ATA_SUPPORT_NCQ 0x0100 #define ATA_SUPPORT_IFPWRMNGTRCV 0x0200 #define ATA_SUPPORT_PHYEVENTCNT 0x0400 #define ATA_SUPPORT_NCQ_UNLOAD 0x0800 #define ATA_SUPPORT_NCQ_PRIO 0x1000 #define ATA_SUPPORT_HAPST 0x2000 #define ATA_SUPPORT_DAPST 0x4000 #define ATA_SUPPORT_READLOGDMAEXT 0x8000 /*77*/ u_int16_t satacapabilities2; #define ATA_SATA_CURR_GEN_MASK 0x0006 #define ATA_SUPPORT_NCQ_STREAM 0x0010 #define ATA_SUPPORT_NCQ_QMANAGEMENT 0x0020 #define ATA_SUPPORT_RCVSND_FPDMA_QUEUED 0x0040 /*78*/ u_int16_t satasupport; #define ATA_SUPPORT_NONZERO 0x0002 #define ATA_SUPPORT_AUTOACTIVATE 0x0004 #define ATA_SUPPORT_IFPWRMNGT 0x0008 #define ATA_SUPPORT_INORDERDATA 0x0010 #define ATA_SUPPORT_ASYNCNOTIF 0x0020 #define ATA_SUPPORT_SOFTSETPRESERVE 0x0040 /*79*/ u_int16_t sataenabled; #define ATA_ENABLED_DAPST 0x0080 /*080*/ u_int16_t version_major; /*081*/ u_int16_t version_minor; struct { /*082/085*/ u_int16_t command1; #define ATA_SUPPORT_SMART 0x0001 #define ATA_SUPPORT_SECURITY 0x0002 #define ATA_SUPPORT_REMOVABLE 0x0004 #define ATA_SUPPORT_POWERMGT 0x0008 #define ATA_SUPPORT_PACKET 0x0010 #define ATA_SUPPORT_WRITECACHE 0x0020 #define ATA_SUPPORT_LOOKAHEAD 0x0040 #define ATA_SUPPORT_RELEASEIRQ 0x0080 #define ATA_SUPPORT_SERVICEIRQ 0x0100 #define ATA_SUPPORT_RESET 0x0200 #define ATA_SUPPORT_PROTECTED 0x0400 #define ATA_SUPPORT_WRITEBUFFER 0x1000 #define ATA_SUPPORT_READBUFFER 0x2000 #define ATA_SUPPORT_NOP 0x4000 /*083/086*/ u_int16_t command2; #define ATA_SUPPORT_MICROCODE 0x0001 #define ATA_SUPPORT_QUEUED 0x0002 #define ATA_SUPPORT_CFA 0x0004 #define ATA_SUPPORT_APM 0x0008 #define ATA_SUPPORT_NOTIFY 0x0010 #define ATA_SUPPORT_STANDBY 0x0020 #define ATA_SUPPORT_SPINUP 0x0040 #define ATA_SUPPORT_MAXSECURITY 0x0100 #define ATA_SUPPORT_AUTOACOUSTIC 0x0200 #define ATA_SUPPORT_ADDRESS48 0x0400 #define ATA_SUPPORT_OVERLAY 0x0800 #define ATA_SUPPORT_FLUSHCACHE 0x1000 #define ATA_SUPPORT_FLUSHCACHE48 0x2000 /*084/087*/ u_int16_t extension; #define ATA_SUPPORT_SMARTLOG 0x0001 #define ATA_SUPPORT_SMARTTEST 0x0002 #define ATA_SUPPORT_MEDIASN 0x0004 #define ATA_SUPPORT_MEDIAPASS 0x0008 #define ATA_SUPPORT_STREAMING 0x0010 #define ATA_SUPPORT_GENLOG 0x0020 #define ATA_SUPPORT_WRITEDMAFUAEXT 0x0040 #define ATA_SUPPORT_WRITEDMAQFUAEXT 0x0080 #define ATA_SUPPORT_64BITWWN 0x0100 #define ATA_SUPPORT_UNLOAD 0x2000 } __packed support, enabled; /*088*/ u_int16_t udmamodes; /* UltraDMA modes */ /*089*/ u_int16_t erase_time; /* time req'd in 2min units */ /*090*/ u_int16_t enhanced_erase_time; /* time req'd in 2min units */ /*091*/ u_int16_t apm_value; /*092*/ u_int16_t master_passwd_revision; /* password revision code */ /*093*/ u_int16_t hwres; #define ATA_CABLE_ID 0x2000 /*094*/ u_int16_t acoustic; #define ATA_ACOUSTIC_CURRENT(x) ((x) & 0x00ff) #define ATA_ACOUSTIC_VENDOR(x) (((x) & 0xff00) >> 8) /*095*/ u_int16_t stream_min_req_size; /*096*/ u_int16_t stream_transfer_time; /*097*/ u_int16_t stream_access_latency; /*098*/ u_int32_t stream_granularity; /*100*/ u_int16_t lba_size48_1; u_int16_t lba_size48_2; u_int16_t lba_size48_3; u_int16_t lba_size48_4; u_int16_t reserved104; /*105*/ u_int16_t max_dsm_blocks; /*106*/ u_int16_t pss; #define ATA_PSS_LSPPS 0x000F #define ATA_PSS_LSSABOVE512 0x1000 #define ATA_PSS_MULTLS 0x2000 #define ATA_PSS_VALID_MASK 0xC000 #define ATA_PSS_VALID_VALUE 0x4000 /*107*/ u_int16_t isd; /*108*/ u_int16_t wwn[4]; u_int16_t reserved112[5]; /*117*/ u_int16_t lss_1; /*118*/ u_int16_t lss_2; /*119*/ u_int16_t support2; #define ATA_SUPPORT_WRITEREADVERIFY 0x0002 #define ATA_SUPPORT_WRITEUNCORREXT 0x0004 #define ATA_SUPPORT_RWLOGDMAEXT 0x0008 #define ATA_SUPPORT_MICROCODE3 0x0010 #define ATA_SUPPORT_FREEFALL 0x0020 #define ATA_SUPPORT_SENSE_REPORT 0x0040 #define ATA_SUPPORT_EPC 0x0080 #define ATA_SUPPORT_AMAX_ADDR 0x0100 #define ATA_SUPPORT_DSN 0x0200 /*120*/ u_int16_t enabled2; #define ATA_ENABLED_WRITEREADVERIFY 0x0002 #define ATA_ENABLED_WRITEUNCORREXT 0x0004 #define ATA_ENABLED_FREEFALL 0x0020 #define ATA_ENABLED_SENSE_REPORT 0x0040 #define ATA_ENABLED_EPC 0x0080 #define ATA_ENABLED_DSN 0x0200 u_int16_t reserved121[6]; /*127*/ u_int16_t removable_status; /*128*/ u_int16_t security_status; #define ATA_SECURITY_LEVEL 0x0100 /* 0: high, 1: maximum */ #define ATA_SECURITY_ENH_SUPP 0x0020 /* enhanced erase supported */ #define ATA_SECURITY_COUNT_EXP 0x0010 /* count expired */ #define ATA_SECURITY_FROZEN 0x0008 /* security config is frozen */ #define ATA_SECURITY_LOCKED 0x0004 /* drive is locked */ #define ATA_SECURITY_ENABLED 0x0002 /* ATA Security is enabled */ #define ATA_SECURITY_SUPPORTED 0x0001 /* ATA Security is supported */ u_int16_t reserved129[31]; /*160*/ u_int16_t cfa_powermode1; u_int16_t reserved161; /*162*/ u_int16_t cfa_kms_support; /*163*/ u_int16_t cfa_trueide_modes; /*164*/ u_int16_t cfa_memory_modes; u_int16_t reserved165[3]; /*168*/ u_int16_t form_factor; #define ATA_FORM_FACTOR_MASK 0x000f #define ATA_FORM_FACTOR_NOT_REPORTED 0x0000 #define ATA_FORM_FACTOR_5_25 0x0001 #define ATA_FORM_FACTOR_3_5 0x0002 #define ATA_FORM_FACTOR_2_5 0x0003 #define ATA_FORM_FACTOR_1_8 0x0004 #define ATA_FORM_FACTOR_SUB_1_8 0x0005 #define ATA_FORM_FACTOR_MSATA 0x0006 #define ATA_FORM_FACTOR_M_2 0x0007 #define ATA_FORM_FACTOR_MICRO_SSD 0x0008 #define ATA_FORM_FACTOR_C_FAST 0x0009 /*169*/ u_int16_t support_dsm; #define ATA_SUPPORT_DSM_TRIM 0x0001 u_int16_t reserved170[6]; /*176*/ u_int8_t media_serial[60]; /*206*/ u_int16_t sct; u_int16_t reserved207[2]; /*209*/ u_int16_t lsalign; /*210*/ u_int16_t wrv_sectors_m3_1; u_int16_t wrv_sectors_m3_2; /*212*/ u_int16_t wrv_sectors_m2_1; u_int16_t wrv_sectors_m2_2; /*214*/ u_int16_t nv_cache_caps; /*215*/ u_int16_t nv_cache_size_1; u_int16_t nv_cache_size_2; /*217*/ u_int16_t media_rotation_rate; #define ATA_RATE_NOT_REPORTED 0x0000 #define ATA_RATE_NON_ROTATING 0x0001 u_int16_t reserved218; /*219*/ u_int16_t nv_cache_opt; /*220*/ u_int16_t wrv_mode; u_int16_t reserved221; /*222*/ u_int16_t transport_major; /*223*/ u_int16_t transport_minor; u_int16_t reserved224[31]; /*255*/ u_int16_t integrity; } __packed; /* ATA Dataset Management */ #define ATA_DSM_BLK_SIZE 512 #define ATA_DSM_BLK_RANGES 64 #define ATA_DSM_RANGE_SIZE 8 #define ATA_DSM_RANGE_MAX 65535 /* * ATA Device Register * * bit 7 Obsolete (was 1 in early ATA specs) * bit 6 Sets LBA/CHS mode. 1=LBA, 0=CHS * bit 5 Obsolete (was 1 in early ATA specs) * bit 4 1 = Slave Drive, 0 = Master Drive * bit 3-0 In LBA mode, 27-24 of address. In CHS mode, head number */ #define ATA_DEV_MASTER 0x00 #define ATA_DEV_SLAVE 0x10 #define ATA_DEV_LBA 0x40 /* ATA limits */ #define ATA_MAX_28BIT_LBA 268435455UL /* ATA Status Register */ #define ATA_STATUS_ERROR 0x01 #define ATA_STATUS_SENSE_AVAIL 0x02 #define ATA_STATUS_ALIGN_ERR 0x04 #define ATA_STATUS_DATA_REQ 0x08 #define ATA_STATUS_DEF_WRITE_ERR 0x10 #define ATA_STATUS_DEVICE_FAULT 0x20 #define ATA_STATUS_DEVICE_READY 0x40 #define ATA_STATUS_BUSY 0x80 /* ATA Error Register */ #define ATA_ERROR_ABORT 0x04 #define ATA_ERROR_ID_NOT_FOUND 0x10 /* ATA HPA Features */ #define ATA_HPA_FEAT_MAX_ADDR 0x00 #define ATA_HPA_FEAT_SET_PWD 0x01 #define ATA_HPA_FEAT_LOCK 0x02 #define ATA_HPA_FEAT_UNLOCK 0x03 #define ATA_HPA_FEAT_FREEZE 0x04 /* ATA transfer modes */ #define ATA_MODE_MASK 0x0f #define ATA_DMA_MASK 0xf0 #define ATA_PIO 0x00 #define ATA_PIO0 0x08 #define ATA_PIO1 0x09 #define ATA_PIO2 0x0a #define ATA_PIO3 0x0b #define ATA_PIO4 0x0c #define ATA_PIO_MAX 0x0f #define ATA_DMA 0x10 #define ATA_WDMA0 0x20 #define ATA_WDMA1 0x21 #define ATA_WDMA2 0x22 #define ATA_UDMA0 0x40 #define ATA_UDMA1 0x41 #define ATA_UDMA2 0x42 #define ATA_UDMA3 0x43 #define ATA_UDMA4 0x44 #define ATA_UDMA5 0x45 #define ATA_UDMA6 0x46 #define ATA_SA150 0x47 #define ATA_SA300 0x48 #define ATA_SA600 0x49 #define ATA_DMA_MAX 0x4f /* ATA commands */ #define ATA_NOP 0x00 /* NOP */ #define ATA_NF_FLUSHQUEUE 0x00 /* flush queued cmd's */ #define ATA_NF_AUTOPOLL 0x01 /* start autopoll function */ #define ATA_DATA_SET_MANAGEMENT 0x06 #define ATA_DSM_TRIM 0x01 #define ATA_DEVICE_RESET 0x08 /* reset device */ #define ATA_READ 0x20 /* read */ #define ATA_READ48 0x24 /* read 48bit LBA */ #define ATA_READ_DMA48 0x25 /* read DMA 48bit LBA */ #define ATA_READ_DMA_QUEUED48 0x26 /* read DMA QUEUED 48bit LBA */ #define ATA_READ_NATIVE_MAX_ADDRESS48 0x27 /* read native max addr 48bit */ #define ATA_READ_MUL48 0x29 /* read multi 48bit LBA */ #define ATA_READ_STREAM_DMA48 0x2a /* read DMA stream 48bit LBA */ #define ATA_READ_LOG_EXT 0x2f /* read log ext - PIO Data-In */ #define ATA_READ_STREAM48 0x2b /* read stream 48bit LBA */ #define ATA_WRITE 0x30 /* write */ #define ATA_WRITE48 0x34 /* write 48bit LBA */ #define ATA_WRITE_DMA48 0x35 /* write DMA 48bit LBA */ #define ATA_WRITE_DMA_QUEUED48 0x36 /* write DMA QUEUED 48bit LBA*/ #define ATA_SET_MAX_ADDRESS48 0x37 /* set max address 48bit */ #define ATA_WRITE_MUL48 0x39 /* write multi 48bit LBA */ #define ATA_WRITE_STREAM_DMA48 0x3a #define ATA_WRITE_STREAM48 0x3b #define ATA_WRITE_DMA_FUA48 0x3d #define ATA_WRITE_DMA_QUEUED_FUA48 0x3e #define ATA_WRITE_LOG_EXT 0x3f #define ATA_READ_VERIFY 0x40 #define ATA_READ_VERIFY48 0x42 #define ATA_WRITE_UNCORRECTABLE48 0x45 /* write uncorrectable 48bit LBA */ #define ATA_WU_PSEUDO 0x55 /* pseudo-uncorrectable error */ #define ATA_WU_FLAGGED 0xaa /* flagged-uncorrectable error */ #define ATA_READ_LOG_DMA_EXT 0x47 /* read log DMA ext - PIO Data-In */ #define ATA_ZAC_MANAGEMENT_IN 0x4a /* ZAC management in */ #define ATA_ZM_REPORT_ZONES 0x00 /* report zones */ #define ATA_WRITE_LOG_DMA_EXT 0x57 /* WRITE LOG DMA EXT */ #define ATA_TRUSTED_NON_DATA 0x5b /* TRUSTED NON-DATA */ #define ATA_TRUSTED_RECEIVE 0x5c /* TRUSTED RECEIVE */ #define ATA_TRUSTED_RECEIVE_DMA 0x5d /* TRUSTED RECEIVE DMA */ #define ATA_TRUSTED_SEND 0x5e /* TRUSTED SEND */ #define ATA_TRUSTED_SEND_DMA 0x5f /* TRUSTED SEND DMA */ #define ATA_READ_FPDMA_QUEUED 0x60 /* read DMA NCQ */ #define ATA_WRITE_FPDMA_QUEUED 0x61 /* write DMA NCQ */ #define ATA_NCQ_NON_DATA 0x63 /* NCQ non-data command */ #define ATA_ABORT_NCQ_QUEUE 0x00 /* abort NCQ queue */ #define ATA_DEADLINE_HANDLING 0x01 /* deadline handling */ #define ATA_SET_FEATURES 0x05 /* set features */ #define ATA_ZERO_EXT 0x06 /* zero ext */ #define ATA_NCQ_ZAC_MGMT_OUT 0x07 /* NCQ ZAC mgmt out no data */ #define ATA_SEND_FPDMA_QUEUED 0x64 /* send DMA NCQ */ #define ATA_SFPDMA_DSM 0x00 /* Data set management */ #define ATA_SFPDMA_DSM_TRIM 0x01 /* Set trim bit in auxiliary */ #define ATA_SFPDMA_HYBRID_EVICT 0x01 /* Hybrid Evict */ #define ATA_SFPDMA_WLDMA 0x02 /* Write Log DMA EXT */ #define ATA_SFPDMA_ZAC_MGMT_OUT 0x03 /* NCQ ZAC mgmt out w/data */ #define ATA_RECV_FPDMA_QUEUED 0x65 /* receive DMA NCQ */ #define ATA_RFPDMA_RL_DMA_EXT 0x00 /* Read Log DMA EXT */ #define ATA_RFPDMA_ZAC_MGMT_IN 0x02 /* NCQ ZAC mgmt in w/data */ #define ATA_SEP_ATTN 0x67 /* SEP request */ #define ATA_SEEK 0x70 /* seek */ #define ATA_AMAX_ADDR 0x78 /* Accessible Max Address */ #define ATA_AMAX_ADDR_GET 0x00 /* GET NATIVE MAX ADDRESS EXT */ #define ATA_AMAX_ADDR_SET 0x01 /* SET ACCESSIBLE MAX ADDRESS EXT */ #define ATA_AMAX_ADDR_FREEZE 0x02 /* FREEZE ACCESSIBLE MAX ADDRESS EXT */ #define ATA_ZAC_MANAGEMENT_OUT 0x9f /* ZAC management out */ #define ATA_ZM_CLOSE_ZONE 0x01 /* close zone */ #define ATA_ZM_FINISH_ZONE 0x02 /* finish zone */ #define ATA_ZM_OPEN_ZONE 0x03 /* open zone */ #define ATA_ZM_RWP 0x04 /* reset write pointer */ #define ATA_DOWNLOAD_MICROCODE 0x92 /* DOWNLOAD MICROCODE */ #define ATA_DOWNLOAD_MICROCODE_DMA 0x93 /* DOWNLOAD MICROCODE DMA */ #define ATA_PACKET_CMD 0xa0 /* packet command */ #define ATA_ATAPI_IDENTIFY 0xa1 /* get ATAPI params*/ #define ATA_SERVICE 0xa2 /* service command */ #define ATA_SMART_CMD 0xb0 /* SMART command */ +#define ATA_SANITIZE 0xb4 /* sanitize device */ #define ATA_CFA_ERASE 0xc0 /* CFA erase */ #define ATA_READ_MUL 0xc4 /* read multi */ #define ATA_WRITE_MUL 0xc5 /* write multi */ #define ATA_SET_MULTI 0xc6 /* set multi size */ #define ATA_READ_DMA_QUEUED 0xc7 /* read DMA QUEUED */ #define ATA_READ_DMA 0xc8 /* read DMA */ #define ATA_WRITE_DMA 0xca /* write DMA */ #define ATA_WRITE_DMA_QUEUED 0xcc /* write DMA QUEUED */ #define ATA_WRITE_MUL_FUA48 0xce #define ATA_STANDBY_IMMEDIATE 0xe0 /* standby immediate */ #define ATA_IDLE_IMMEDIATE 0xe1 /* idle immediate */ #define ATA_STANDBY_CMD 0xe2 /* standby */ #define ATA_IDLE_CMD 0xe3 /* idle */ #define ATA_READ_BUFFER 0xe4 /* read buffer */ #define ATA_READ_PM 0xe4 /* read portmultiplier */ #define ATA_CHECK_POWER_MODE 0xe5 /* device power mode */ #define ATA_SLEEP 0xe6 /* sleep */ #define ATA_FLUSHCACHE 0xe7 /* flush cache to disk */ #define ATA_WRITE_BUFFER 0xe8 /* write buffer */ #define ATA_WRITE_PM 0xe8 /* write portmultiplier */ #define ATA_READ_BUFFER_DMA 0xe9 /* read buffer DMA */ #define ATA_FLUSHCACHE48 0xea /* flush cache to disk */ #define ATA_WRITE_BUFFER_DMA 0xeb /* write buffer DMA */ #define ATA_ATA_IDENTIFY 0xec /* get ATA params */ #define ATA_SETFEATURES 0xef /* features command */ #define ATA_SF_ENAB_WCACHE 0x02 /* enable write cache */ #define ATA_SF_DIS_WCACHE 0x82 /* disable write cache */ #define ATA_SF_SETXFER 0x03 /* set transfer mode */ #define ATA_SF_APM 0x05 /* Enable APM feature set */ #define ATA_SF_ENAB_PUIS 0x06 /* enable PUIS */ #define ATA_SF_DIS_PUIS 0x86 /* disable PUIS */ #define ATA_SF_PUIS_SPINUP 0x07 /* PUIS spin-up */ #define ATA_SF_WRV 0x0b /* Enable Write-Read-Verify */ #define ATA_SF_DLC 0x0c /* Enable device life control */ #define ATA_SF_SATA 0x10 /* Enable use of SATA feature */ #define ATA_SF_FFC 0x41 /* Free-fall Control */ #define ATA_SF_MHIST 0x43 /* Set Max Host Sect. Times */ #define ATA_SF_RATE 0x45 /* Set Rate Basis */ #define ATA_SF_EPC 0x4A /* Extended Power Conditions */ #define ATA_SF_ENAB_RCACHE 0xaa /* enable readahead cache */ #define ATA_SF_DIS_RCACHE 0x55 /* disable readahead cache */ #define ATA_SF_ENAB_RELIRQ 0x5d /* enable release interrupt */ #define ATA_SF_DIS_RELIRQ 0xdd /* disable release interrupt */ #define ATA_SF_ENAB_SRVIRQ 0x5e /* enable service interrupt */ #define ATA_SF_DIS_SRVIRQ 0xde /* disable service interrupt */ #define ATA_SF_LPSAERC 0x62 /* Long Phys Sect Align ErrRep*/ #define ATA_SF_DSN 0x63 /* Device Stats Notification */ #define ATA_CHECK_POWER_MODE 0xe5 /* Check Power Mode */ #define ATA_SECURITY_SET_PASSWORD 0xf1 /* set drive password */ #define ATA_SECURITY_UNLOCK 0xf2 /* unlock drive using passwd */ #define ATA_SECURITY_ERASE_PREPARE 0xf3 /* prepare to erase drive */ #define ATA_SECURITY_ERASE_UNIT 0xf4 /* erase all blocks on drive */ #define ATA_SECURITY_FREEZE_LOCK 0xf5 /* freeze security config */ #define ATA_SECURITY_DISABLE_PASSWORD 0xf6 /* disable drive password */ #define ATA_READ_NATIVE_MAX_ADDRESS 0xf8 /* read native max address */ #define ATA_SET_MAX_ADDRESS 0xf9 /* set max address */ /* ATAPI commands */ #define ATAPI_TEST_UNIT_READY 0x00 /* check if device is ready */ #define ATAPI_REZERO 0x01 /* rewind */ #define ATAPI_REQUEST_SENSE 0x03 /* get sense data */ #define ATAPI_FORMAT 0x04 /* format unit */ #define ATAPI_READ 0x08 /* read data */ #define ATAPI_WRITE 0x0a /* write data */ #define ATAPI_WEOF 0x10 /* write filemark */ #define ATAPI_WF_WRITE 0x01 #define ATAPI_SPACE 0x11 /* space command */ #define ATAPI_SP_FM 0x01 #define ATAPI_SP_EOD 0x03 #define ATAPI_INQUIRY 0x12 /* get inquiry data */ #define ATAPI_MODE_SELECT 0x15 /* mode select */ #define ATAPI_ERASE 0x19 /* erase */ #define ATAPI_MODE_SENSE 0x1a /* mode sense */ #define ATAPI_START_STOP 0x1b /* start/stop unit */ #define ATAPI_SS_LOAD 0x01 #define ATAPI_SS_RETENSION 0x02 #define ATAPI_SS_EJECT 0x04 #define ATAPI_PREVENT_ALLOW 0x1e /* media removal */ #define ATAPI_READ_FORMAT_CAPACITIES 0x23 /* get format capacities */ #define ATAPI_READ_CAPACITY 0x25 /* get volume capacity */ #define ATAPI_READ_BIG 0x28 /* read data */ #define ATAPI_WRITE_BIG 0x2a /* write data */ #define ATAPI_LOCATE 0x2b /* locate to position */ #define ATAPI_READ_POSITION 0x34 /* read position */ #define ATAPI_SYNCHRONIZE_CACHE 0x35 /* flush buf, close channel */ #define ATAPI_WRITE_BUFFER 0x3b /* write device buffer */ #define ATAPI_READ_BUFFER 0x3c /* read device buffer */ #define ATAPI_READ_SUBCHANNEL 0x42 /* get subchannel info */ #define ATAPI_READ_TOC 0x43 /* get table of contents */ #define ATAPI_PLAY_10 0x45 /* play by lba */ #define ATAPI_PLAY_MSF 0x47 /* play by MSF address */ #define ATAPI_PLAY_TRACK 0x48 /* play by track number */ #define ATAPI_PAUSE 0x4b /* pause audio operation */ #define ATAPI_READ_DISK_INFO 0x51 /* get disk info structure */ #define ATAPI_READ_TRACK_INFO 0x52 /* get track info structure */ #define ATAPI_RESERVE_TRACK 0x53 /* reserve track */ #define ATAPI_SEND_OPC_INFO 0x54 /* send OPC structurek */ #define ATAPI_MODE_SELECT_BIG 0x55 /* set device parameters */ #define ATAPI_REPAIR_TRACK 0x58 /* repair track */ #define ATAPI_READ_MASTER_CUE 0x59 /* read master CUE info */ #define ATAPI_MODE_SENSE_BIG 0x5a /* get device parameters */ #define ATAPI_CLOSE_TRACK 0x5b /* close track/session */ #define ATAPI_READ_BUFFER_CAPACITY 0x5c /* get buffer capicity */ #define ATAPI_SEND_CUE_SHEET 0x5d /* send CUE sheet */ #define ATAPI_SERVICE_ACTION_IN 0x96 /* get service data */ #define ATAPI_BLANK 0xa1 /* blank the media */ #define ATAPI_SEND_KEY 0xa3 /* send DVD key structure */ #define ATAPI_REPORT_KEY 0xa4 /* get DVD key structure */ #define ATAPI_PLAY_12 0xa5 /* play by lba */ #define ATAPI_LOAD_UNLOAD 0xa6 /* changer control command */ #define ATAPI_READ_STRUCTURE 0xad /* get DVD structure */ #define ATAPI_PLAY_CD 0xb4 /* universal play command */ #define ATAPI_SET_SPEED 0xbb /* set drive speed */ #define ATAPI_MECH_STATUS 0xbd /* get changer status */ #define ATAPI_READ_CD 0xbe /* read data */ #define ATAPI_POLL_DSC 0xff /* poll DSC status bit */ struct ata_ioc_devices { int channel; char name[2][32]; struct ata_params params[2]; }; /* pr channel ATA ioctl calls */ #define IOCATAGMAXCHANNEL _IOR('a', 1, int) #define IOCATAREINIT _IOW('a', 2, int) #define IOCATAATTACH _IOW('a', 3, int) #define IOCATADETACH _IOW('a', 4, int) #define IOCATADEVICES _IOWR('a', 5, struct ata_ioc_devices) /* ATAPI request sense structure */ struct atapi_sense { u_int8_t error; /* current or deferred errors */ #define ATA_SENSE_VALID 0x80 u_int8_t segment; /* segment number */ u_int8_t key; /* sense key */ #define ATA_SENSE_KEY_MASK 0x0f /* sense key mask */ #define ATA_SENSE_NO_SENSE 0x00 /* no specific sense key info */ #define ATA_SENSE_RECOVERED_ERROR 0x01 /* command OK, data recovered */ #define ATA_SENSE_NOT_READY 0x02 /* no access to drive */ #define ATA_SENSE_MEDIUM_ERROR 0x03 /* non-recovered data error */ #define ATA_SENSE_HARDWARE_ERROR 0x04 /* non-recoverable HW failure */ #define ATA_SENSE_ILLEGAL_REQUEST 0x05 /* invalid command param(s) */ #define ATA_SENSE_UNIT_ATTENTION 0x06 /* media changed */ #define ATA_SENSE_DATA_PROTECT 0x07 /* write protect */ #define ATA_SENSE_BLANK_CHECK 0x08 /* blank check */ #define ATA_SENSE_VENDOR_SPECIFIC 0x09 /* vendor specific skey */ #define ATA_SENSE_COPY_ABORTED 0x0a /* copy aborted */ #define ATA_SENSE_ABORTED_COMMAND 0x0b /* command aborted, try again */ #define ATA_SENSE_EQUAL 0x0c /* equal */ #define ATA_SENSE_VOLUME_OVERFLOW 0x0d /* volume overflow */ #define ATA_SENSE_MISCOMPARE 0x0e /* data dont match the medium */ #define ATA_SENSE_RESERVED 0x0f #define ATA_SENSE_ILI 0x20; #define ATA_SENSE_EOM 0x40; #define ATA_SENSE_FILEMARK 0x80; u_int32_t cmd_info; /* cmd information */ u_int8_t sense_length; /* additional sense len (n-7) */ u_int32_t cmd_specific_info; /* additional cmd spec info */ u_int8_t asc; /* additional sense code */ u_int8_t ascq; /* additional sense code qual */ u_int8_t replaceable_unit_code; /* replaceable unit code */ u_int8_t specific; /* sense key specific */ #define ATA_SENSE_SPEC_VALID 0x80 #define ATA_SENSE_SPEC_MASK 0x7f u_int8_t specific1; /* sense key specific */ u_int8_t specific2; /* sense key specific */ } __packed; /* * SET FEATURES subcommands */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * These values go in the LBA 3:0. */ #define ATA_SF_EPC_RESTORE 0x00 /* Restore Power Condition Settings */ #define ATA_SF_EPC_GOTO 0x01 /* Go To Power Condition */ #define ATA_SF_EPC_SET_TIMER 0x02 /* Set Power Condition Timer */ #define ATA_SF_EPC_SET_STATE 0x03 /* Set Power Condition State */ #define ATA_SF_EPC_ENABLE 0x04 /* Enable the EPC feature set */ #define ATA_SF_EPC_DISABLE 0x05 /* Disable the EPC feature set */ #define ATA_SF_EPC_SET_SOURCE 0x06 /* Set EPC Power Source */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * Power Condition ID field * These values go in the count register. */ #define ATA_EPC_STANDBY_Z 0x00 /* Substate of PM2:Standby */ #define ATA_EPC_STANDBY_Y 0x01 /* Substate of PM2:Standby */ #define ATA_EPC_IDLE_A 0x81 /* Substate of PM1:Idle */ #define ATA_EPC_IDLE_B 0x82 /* Substate of PM1:Idle */ #define ATA_EPC_IDLE_C 0x83 /* Substate of PM1:Idle */ #define ATA_EPC_ALL 0xff /* All supported power conditions */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * Restore Power Conditions Settings subcommand * These values go in the LBA register. */ #define ATA_SF_EPC_RST_DFLT 0x40 /* 1=Rst from Default, 0= from Saved */ #define ATA_SF_EPC_RST_SAVE 0x10 /* 1=Save on completion */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * Got To Power Condition subcommand * These values go in the LBA register. */ #define ATA_SF_EPC_GOTO_DELAY 0x02000000 /* Delayed entry bit */ #define ATA_SF_EPC_GOTO_HOLD 0x01000000 /* Hold Power Cond bit */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * Set Power Condition Timer subcommand * These values go in the LBA register. */ #define ATA_SF_EPC_TIMER_MASK 0x00ffff00 /* Timer field */ #define ATA_SF_EPC_TIMER_SHIFT 8 #define ATA_SF_EPC_TIMER_SEC 0x00000080 /* Timer units, 1=sec, 0=.1s */ #define ATA_SF_EPC_TIMER_EN 0x00000020 /* Enable/disable cond. */ #define ATA_SF_EPC_TIMER_SAVE 0x00000010 /* Save settings on comp. */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * Set Power Condition State subcommand * These values go in the LBA register. */ #define ATA_SF_EPC_SETCON_EN 0x00000020 /* Enable power cond. */ #define ATA_SF_EPC_SETCON_SAVE 0x00000010 /* Save settings on comp */ /* * SET FEATURES command * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A) * Set EPC Power Source subcommand * These values go in the count register. */ #define ATA_SF_EPC_SRC_UNKNOWN 0x0000 /* Unknown source */ #define ATA_SF_EPC_SRC_BAT 0x0001 /* battery source */ #define ATA_SF_EPC_SRC_NOT_BAT 0x0002 /* not battery source */ #define ATA_LOG_DIRECTORY 0x00 /* Directory of all logs */ #define ATA_POWER_COND_LOG 0x08 /* Power Conditions Log */ #define ATA_PCL_IDLE 0x00 /* Idle Power Conditions Page */ #define ATA_PCL_STANDBY 0x01 /* Standby Power Conditions Page */ #define ATA_IDENTIFY_DATA_LOG 0x30 /* Identify Device Data Log */ #define ATA_IDL_PAGE_LIST 0x00 /* List of supported pages */ #define ATA_IDL_IDENTIFY_DATA 0x01 /* Copy of Identify Device data */ #define ATA_IDL_CAPACITY 0x02 /* Capacity */ #define ATA_IDL_SUP_CAP 0x03 /* Supported Capabilities */ #define ATA_IDL_CUR_SETTINGS 0x04 /* Current Settings */ #define ATA_IDL_ATA_STRINGS 0x05 /* ATA Strings */ #define ATA_IDL_SECURITY 0x06 /* Security */ #define ATA_IDL_PARALLEL_ATA 0x07 /* Parallel ATA */ #define ATA_IDL_SERIAL_ATA 0x08 /* Serial ATA */ #define ATA_IDL_ZDI 0x09 /* Zoned Device Information */ struct ata_gp_log_dir { uint8_t header[2]; #define ATA_GP_LOG_DIR_VERSION 0x0001 uint8_t num_pages[255*2]; /* Number of log pages at address */ }; /* * ATA Power Conditions log descriptor */ struct ata_power_cond_log_desc { uint8_t reserved1; uint8_t flags; #define ATA_PCL_COND_SUPPORTED 0x80 #define ATA_PCL_COND_SAVEABLE 0x40 #define ATA_PCL_COND_CHANGEABLE 0x20 #define ATA_PCL_DEFAULT_TIMER_EN 0x10 #define ATA_PCL_SAVED_TIMER_EN 0x08 #define ATA_PCL_CURRENT_TIMER_EN 0x04 #define ATA_PCL_HOLD_PC_NOT_SUP 0x02 uint8_t reserved2[2]; uint8_t default_timer[4]; uint8_t saved_timer[4]; uint8_t current_timer[4]; uint8_t nom_time_to_active[4]; uint8_t min_timer[4]; uint8_t max_timer[4]; uint8_t num_transitions_to_pc[4]; uint8_t hours_in_pc[4]; uint8_t reserved3[28]; }; /* * ATA Power Conditions Log (0x08), Idle power conditions page (0x00) */ struct ata_power_cond_log_idle { struct ata_power_cond_log_desc idle_a_desc; struct ata_power_cond_log_desc idle_b_desc; struct ata_power_cond_log_desc idle_c_desc; uint8_t reserved[320]; }; /* * ATA Power Conditions Log (0x08), Standby power conditions page (0x01) */ struct ata_power_cond_log_standby { uint8_t reserved[384]; struct ata_power_cond_log_desc standby_y_desc; struct ata_power_cond_log_desc standby_z_desc; }; /* * ATA IDENTIFY DEVICE data log (0x30) page 0x00 * List of Supported IDENTIFY DEVICE data pages. */ struct ata_identify_log_pages { uint8_t header[8]; #define ATA_IDLOG_REVISION 0x0000000000000001 uint8_t entry_count; uint8_t entries[503]; }; /* * ATA IDENTIFY DEVICE data log (0x30) * Capacity (Page 0x02). */ struct ata_identify_log_capacity { uint8_t header[8]; #define ATA_CAP_HEADER_VALID 0x8000000000000000 #define ATA_CAP_PAGE_NUM_MASK 0x0000000000ff0000 #define ATA_CAP_PAGE_NUM_SHIFT 16 #define ATA_CAP_REV_MASK 0x00000000000000ff uint8_t capacity[8]; #define ATA_CAP_CAPACITY_VALID 0x8000000000000000 #define ATA_CAP_ACCESSIBLE_CAP 0x0000ffffffffffff uint8_t phys_logical_sect_size[8]; #define ATA_CAP_PL_VALID 0x8000000000000000 #define ATA_CAP_LTOP_REL_SUP 0x4000000000000000 #define ATA_CAP_LOG_SECT_SUP 0x2000000000000000 #define ATA_CAP_ALIGN_ERR_MASK 0x0000000000300000 #define ATA_CAP_LTOP_MASK 0x00000000000f0000 #define ATA_CAP_LOG_SECT_OFF 0x000000000000ffff uint8_t logical_sect_size[8]; #define ATA_CAP_LOG_SECT_VALID 0x8000000000000000 #define ATA_CAP_LOG_SECT_SIZE 0x00000000ffffffff uint8_t nominal_buffer_size[8]; #define ATA_CAP_NOM_BUF_VALID 0x8000000000000000 #define ATA_CAP_NOM_BUF_SIZE 0x7fffffffffffffff uint8_t reserved[472]; }; /* * ATA IDENTIFY DEVICE data log (0x30) * Supported Capabilities (Page 0x03). */ struct ata_identify_log_sup_cap { uint8_t header[8]; #define ATA_SUP_CAP_HEADER_VALID 0x8000000000000000 #define ATA_SUP_CAP_PAGE_NUM_MASK 0x0000000000ff0000 #define ATA_SUP_CAP_PAGE_NUM_SHIFT 16 #define ATA_SUP_CAP_REV_MASK 0x00000000000000ff uint8_t sup_cap[8]; #define ATA_SUP_CAP_VALID 0x8000000000000000 #define ATA_SC_SET_SECT_CONFIG_SUP 0x0002000000000000 /* Set Sect Conf*/ #define ATA_SC_ZERO_EXT_SUP 0x0001000000000000 /* Zero EXT */ #define ATA_SC_SUCC_NCQ_SENSE_SUP 0x0000800000000000 /* Succ. NCQ Sns */ #define ATA_SC_DLC_SUP 0x0000400000000000 /* DLC */ #define ATA_SC_RQSN_DEV_FAULT_SUP 0x0000200000000000 /* Req Sns Dev Flt*/ #define ATA_SC_DSN_SUP 0x0000100000000000 /* DSN */ #define ATA_SC_LP_STANDBY_SUP 0x0000080000000000 /* LP Standby */ #define ATA_SC_SET_EPC_PS_SUP 0x0000040000000000 /* Set EPC PS */ #define ATA_SC_AMAX_ADDR_SUP 0x0000020000000000 /* AMAX Addr */ #define ATA_SC_DRAT_SUP 0x0000008000000000 /* DRAT */ #define ATA_SC_LPS_MISALGN_SUP 0x0000004000000000 /* LPS Misalign */ #define ATA_SC_RB_DMA_SUP 0x0000001000000000 /* Read Buf DMA */ #define ATA_SC_WB_DMA_SUP 0x0000000800000000 /* Write Buf DMA */ #define ATA_SC_DNLD_MC_DMA_SUP 0x0000000200000000 /* DL MCode DMA */ #define ATA_SC_28BIT_SUP 0x0000000100000000 /* 28-bit */ #define ATA_SC_RZAT_SUP 0x0000000080000000 /* RZAT */ #define ATA_SC_NOP_SUP 0x0000000020000000 /* NOP */ #define ATA_SC_READ_BUFFER_SUP 0x0000000010000000 /* Read Buffer */ #define ATA_SC_WRITE_BUFFER_SUP 0x0000000008000000 /* Write Buffer */ #define ATA_SC_READ_LOOK_AHEAD_SUP 0x0000000002000000 /* Read Look-Ahead*/ #define ATA_SC_VOLATILE_WC_SUP 0x0000000001000000 /* Volatile WC */ #define ATA_SC_SMART_SUP 0x0000000000800000 /* SMART */ #define ATA_SC_FLUSH_CACHE_EXT_SUP 0x0000000000400000 /* Flush Cache Ext */ #define ATA_SC_48BIT_SUP 0x0000000000100000 /* 48-Bit */ #define ATA_SC_SPINUP_SUP 0x0000000000040000 /* Spin-Up */ #define ATA_SC_PUIS_SUP 0x0000000000020000 /* PUIS */ #define ATA_SC_APM_SUP 0x0000000000010000 /* APM */ #define ATA_SC_DL_MICROCODE_SUP 0x0000000000004000 /* DL Microcode */ #define ATA_SC_UNLOAD_SUP 0x0000000000002000 /* Unload */ #define ATA_SC_WRITE_FUA_EXT_SUP 0x0000000000001000 /* Write FUA EXT */ #define ATA_SC_GPL_SUP 0x0000000000000800 /* GPL */ #define ATA_SC_STREAMING_SUP 0x0000000000000400 /* Streaming */ #define ATA_SC_SMART_SELFTEST_SUP 0x0000000000000100 /* SMART self-test */ #define ATA_SC_SMART_ERR_LOG_SUP 0x0000000000000080 /* SMART Err Log */ #define ATA_SC_EPC_SUP 0x0000000000000040 /* EPC */ #define ATA_SC_SENSE_SUP 0x0000000000000020 /* Sense data */ #define ATA_SC_FREEFALL_SUP 0x0000000000000010 /* Free-Fall */ #define ATA_SC_DM_MODE3_SUP 0x0000000000000008 /* DM Mode 3 */ #define ATA_SC_GPL_DMA_SUP 0x0000000000000004 /* GPL DMA */ #define ATA_SC_WRITE_UNCOR_SUP 0x0000000000000002 /* Write uncorr. */ #define ATA_SC_WRV_SUP 0x0000000000000001 /* WRV */ uint8_t download_code_cap[8]; #define ATA_DL_CODE_VALID 0x8000000000000000 #define ATA_DLC_DM_OFFSETS_DEFER_SUP 0x0000000400000000 #define ATA_DLC_DM_IMMED_SUP 0x0000000200000000 #define ATA_DLC_DM_OFF_IMMED_SUP 0x0000000100000000 #define ATA_DLC_DM_MAX_XFER_SIZE_MASK 0x00000000ffff0000 #define ATA_DLC_DM_MAX_XFER_SIZE_SHIFT 16 #define ATA_DLC_DM_MIN_XFER_SIZE_MASK 0x000000000000ffff uint8_t nom_media_rotation_rate[8]; #define ATA_NOM_MEDIA_ROTATION_VALID 0x8000000000000000 #define ATA_ROTATION_MASK 0x000000000000ffff uint8_t form_factor[8]; #define ATA_FORM_FACTOR_VALID 0x8000000000000000 #define ATA_FF_MASK 0x000000000000000f #define ATA_FF_NOT_REPORTED 0x0000000000000000 /* Not reported */ #define ATA_FF_525_IN 0x0000000000000001 /* 5.25 inch */ #define ATA_FF_35_IN 0x0000000000000002 /* 3.5 inch */ #define ATA_FF_25_IN 0x0000000000000003 /* 2.5 inch */ #define ATA_FF_18_IN 0x0000000000000004 /* 1.8 inch */ #define ATA_FF_LT_18_IN 0x0000000000000005 /* < 1.8 inch */ #define ATA_FF_MSATA 0x0000000000000006 /* mSATA */ #define ATA_FF_M2 0x0000000000000007 /* M.2 */ #define ATA_FF_MICROSSD 0x0000000000000008 /* MicroSSD */ #define ATA_FF_CFAST 0x0000000000000009 /* CFast */ uint8_t wrv_sec_cnt_mode3[8]; #define ATA_WRV_MODE3_VALID 0x8000000000000000 #define ATA_WRV_MODE3_COUNT 0x00000000ffffffff uint8_t wrv_sec_cnt_mode2[8]; #define ATA_WRV_MODE2_VALID 0x8000000000000000 #define ATA_WRV_MODE2_COUNT 0x00000000ffffffff uint8_t wwn[16]; /* XXX KDM need to figure out how to handle 128-bit fields */ uint8_t dsm[8]; #define ATA_DSM_VALID 0x8000000000000000 #define ATA_LB_MARKUP_SUP 0x000000000000ff00 #define ATA_TRIM_SUP 0x0000000000000001 uint8_t util_per_unit_time[16]; /* XXX KDM need to figure out how to handle 128-bit fields */ uint8_t util_usage_rate_sup[8]; #define ATA_UTIL_USAGE_RATE_VALID 0x8000000000000000 #define ATA_SETTING_RATE_SUP 0x0000000000800000 #define ATA_SINCE_POWERON_SUP 0x0000000000000100 #define ATA_POH_RATE_SUP 0x0000000000000010 #define ATA_DATE_TIME_RATE_SUP 0x0000000000000001 uint8_t zoned_cap[8]; #define ATA_ZONED_VALID 0x8000000000000000 #define ATA_ZONED_MASK 0x0000000000000003 uint8_t sup_zac_cap[8]; #define ATA_SUP_ZAC_CAP_VALID 0x8000000000000000 #define ATA_ND_RWP_SUP 0x0000000000000010 /* Reset Write Ptr*/ #define ATA_ND_FINISH_ZONE_SUP 0x0000000000000008 /* Finish Zone */ #define ATA_ND_CLOSE_ZONE_SUP 0x0000000000000004 /* Close Zone */ #define ATA_ND_OPEN_ZONE_SUP 0x0000000000000002 /* Open Zone */ #define ATA_REPORT_ZONES_SUP 0x0000000000000001 /* Report Zones */ uint8_t reserved[392]; }; /* * ATA Identify Device Data Log Zoned Device Information Page (0x09). * Current as of ZAC r04a, August 25, 2015. */ struct ata_zoned_info_log { uint8_t header[8]; #define ATA_ZDI_HEADER_VALID 0x8000000000000000 #define ATA_ZDI_PAGE_NUM_MASK 0x0000000000ff0000 #define ATA_ZDI_PAGE_NUM_SHIFT 16 #define ATA_ZDI_REV_MASK 0x00000000000000ff uint8_t zoned_cap[8]; #define ATA_ZDI_CAP_VALID 0x8000000000000000 #define ATA_ZDI_CAP_URSWRZ 0x0000000000000001 uint8_t zoned_settings[8]; #define ATA_ZDI_SETTINGS_VALID 0x8000000000000000 uint8_t optimal_seq_zones[8]; #define ATA_ZDI_OPT_SEQ_VALID 0x8000000000000000 #define ATA_ZDI_OPT_SEQ_MASK 0x00000000ffffffff uint8_t optimal_nonseq_zones[8]; #define ATA_ZDI_OPT_NS_VALID 0x8000000000000000 #define ATA_ZDI_OPT_NS_MASK 0x00000000ffffffff uint8_t max_seq_req_zones[8]; #define ATA_ZDI_MAX_SEQ_VALID 0x8000000000000000 #define ATA_ZDI_MAX_SEQ_MASK 0x00000000ffffffff uint8_t version_info[8]; #define ATA_ZDI_VER_VALID 0x8000000000000000 #define ATA_ZDI_VER_ZAC_SUP 0x0100000000000000 #define ATA_ZDI_VER_ZAC_MASK 0x00000000000000ff uint8_t reserved[456]; }; struct ata_ioc_request { union { struct { u_int8_t command; u_int8_t feature; u_int64_t lba; u_int16_t count; } ata; struct { char ccb[16]; struct atapi_sense sense; } atapi; } u; caddr_t data; int count; int flags; #define ATA_CMD_CONTROL 0x01 #define ATA_CMD_READ 0x02 #define ATA_CMD_WRITE 0x04 #define ATA_CMD_ATAPI 0x08 int timeout; int error; }; struct ata_security_password { u_int16_t ctrl; #define ATA_SECURITY_PASSWORD_USER 0x0000 #define ATA_SECURITY_PASSWORD_MASTER 0x0001 #define ATA_SECURITY_ERASE_NORMAL 0x0000 #define ATA_SECURITY_ERASE_ENHANCED 0x0002 #define ATA_SECURITY_LEVEL_HIGH 0x0000 #define ATA_SECURITY_LEVEL_MAXIMUM 0x0100 u_int8_t password[32]; u_int16_t revision; u_int16_t reserved[238]; }; /* pr device ATA ioctl calls */ #define IOCATAREQUEST _IOWR('a', 100, struct ata_ioc_request) #define IOCATAGPARM _IOR('a', 101, struct ata_params) #define IOCATAGMODE _IOR('a', 102, int) #define IOCATASMODE _IOW('a', 103, int) #define IOCATAGSPINDOWN _IOR('a', 104, int) #define IOCATASSPINDOWN _IOW('a', 105, int) struct ata_ioc_raid_config { int lun; int type; #define AR_JBOD 0x0001 #define AR_SPAN 0x0002 #define AR_RAID0 0x0004 #define AR_RAID1 0x0008 #define AR_RAID01 0x0010 #define AR_RAID3 0x0020 #define AR_RAID4 0x0040 #define AR_RAID5 0x0080 int interleave; int status; #define AR_READY 1 #define AR_DEGRADED 2 #define AR_REBUILDING 4 int progress; int total_disks; int disks[16]; }; struct ata_ioc_raid_status { int lun; int type; int interleave; int status; int progress; int total_disks; struct { int state; #define AR_DISK_ONLINE 0x01 #define AR_DISK_PRESENT 0x02 #define AR_DISK_SPARE 0x04 int lun; } disks[16]; }; /* ATA RAID ioctl calls */ #define IOCATARAIDCREATE _IOWR('a', 200, struct ata_ioc_raid_config) #define IOCATARAIDDELETE _IOW('a', 201, int) #define IOCATARAIDSTATUS _IOWR('a', 202, struct ata_ioc_raid_status) #define IOCATARAIDADDSPARE _IOW('a', 203, struct ata_ioc_raid_config) #define IOCATARAIDREBUILD _IOW('a', 204, int) #endif /* _SYS_ATA_H_ */ Index: projects/nfsv42/sys/ufs/ufs/ufs_vnops.c =================================================================== --- projects/nfsv42/sys/ufs/ufs/ufs_vnops.c (revision 350367) +++ projects/nfsv42/sys/ufs/ufs/ufs_vnops.c (revision 350368) @@ -1,2795 +1,2802 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include "opt_suiddir.h" #include "opt_ufs.h" #include "opt_ffs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #include #include #include #include #include #include #include #include #include #ifdef UFS_DIRHASH #include #endif #ifdef UFS_GJOURNAL #include FEATURE(ufs_gjournal, "Journaling support through GEOM for UFS"); #endif #ifdef QUOTA FEATURE(ufs_quota, "UFS disk quotas support"); FEATURE(ufs_quota64, "64bit UFS disk quotas support"); #endif #ifdef SUIDDIR FEATURE(suiddir, "Give all new files in directory the same ownership as the directory"); #endif #include static vop_accessx_t ufs_accessx; static int ufs_chmod(struct vnode *, int, struct ucred *, struct thread *); static int ufs_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *); static vop_close_t ufs_close; static vop_create_t ufs_create; static vop_getattr_t ufs_getattr; static vop_ioctl_t ufs_ioctl; static vop_link_t ufs_link; static int ufs_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *, const char *); static vop_markatime_t ufs_markatime; static vop_mkdir_t ufs_mkdir; static vop_mknod_t ufs_mknod; static vop_open_t ufs_open; static vop_pathconf_t ufs_pathconf; static vop_print_t ufs_print; static vop_readlink_t ufs_readlink; static vop_remove_t ufs_remove; static vop_rename_t ufs_rename; static vop_rmdir_t ufs_rmdir; static vop_setattr_t ufs_setattr; static vop_strategy_t ufs_strategy; static vop_symlink_t ufs_symlink; static vop_whiteout_t ufs_whiteout; static vop_close_t ufsfifo_close; static vop_kqfilter_t ufsfifo_kqfilter; SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem"); /* * A virgin directory (no blushing please). */ static struct dirtemplate mastertemplate = { 0, 12, DT_DIR, 1, ".", 0, DIRBLKSIZ - 12, DT_DIR, 2, ".." }; static struct odirtemplate omastertemplate = { 0, 12, 1, ".", 0, DIRBLKSIZ - 12, 2, ".." }; static void ufs_itimes_locked(struct vnode *vp) { struct inode *ip; struct timespec ts; ASSERT_VI_LOCKED(vp, __func__); ip = VTOI(vp); if (UFS_RDONLY(ip)) goto out; if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0) return; if ((vp->v_type == VBLK || vp->v_type == VCHR) && !DOINGSOFTDEP(vp)) ip->i_flag |= IN_LAZYMOD; else if (((vp->v_mount->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND)) == 0) || (ip->i_flag & (IN_CHANGE | IN_UPDATE))) ip->i_flag |= IN_MODIFIED; else if (ip->i_flag & IN_ACCESS) ip->i_flag |= IN_LAZYACCESS; vfs_timestamp(&ts); if (ip->i_flag & IN_ACCESS) { DIP_SET(ip, i_atime, ts.tv_sec); DIP_SET(ip, i_atimensec, ts.tv_nsec); } if (ip->i_flag & IN_UPDATE) { DIP_SET(ip, i_mtime, ts.tv_sec); DIP_SET(ip, i_mtimensec, ts.tv_nsec); } if (ip->i_flag & IN_CHANGE) { DIP_SET(ip, i_ctime, ts.tv_sec); DIP_SET(ip, i_ctimensec, ts.tv_nsec); DIP_SET(ip, i_modrev, DIP(ip, i_modrev) + 1); } out: ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); } void ufs_itimes(struct vnode *vp) { VI_LOCK(vp); ufs_itimes_locked(vp); VI_UNLOCK(vp); } /* * Create a regular file */ static int ufs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { int error; error = ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), ap->a_dvp, ap->a_vpp, ap->a_cnp, "ufs_create"); if (error != 0) return (error); if ((ap->a_cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, ap->a_cnp); return (0); } /* * Mknod vnode call */ /* ARGSUSED */ static int ufs_mknod(ap) struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct inode *ip; ino_t ino; int error; error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), ap->a_dvp, vpp, ap->a_cnp, "ufs_mknod"); if (error) return (error); ip = VTOI(*vpp); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; if (vap->va_rdev != VNOVAL) { /* * Want to be able to use this to make badblock * inodes, so don't truncate the dev number. */ DIP_SET(ip, i_rdev, vap->va_rdev); } /* * Remove inode, then reload it through VFS_VGET so it is * checked to see if it is an alias of an existing entry in * the inode cache. XXX I don't believe this is necessary now. */ (*vpp)->v_type = VNON; ino = ip->i_number; /* Save this before vgone() invalidates ip. */ vgone(*vpp); vput(*vpp); error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp); if (error) { *vpp = NULL; return (error); } return (0); } /* * Open called. */ /* ARGSUSED */ static int ufs_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; struct inode *ip; if (vp->v_type == VCHR || vp->v_type == VBLK) return (EOPNOTSUPP); ip = VTOI(vp); /* * Files marked append-only must be opened for appending. */ if ((ip->i_flags & APPEND) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); vnode_create_vobject(vp, DIP(ip, i_size), ap->a_td); return (0); } /* * Close called. * * Update the times on the inode. */ /* ARGSUSED */ static int ufs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; int usecount; VI_LOCK(vp); usecount = vp->v_usecount; if (usecount > 1) ufs_itimes_locked(vp); VI_UNLOCK(vp); return (0); } static int ufs_accessx(ap) struct vop_accessx_args /* { struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); accmode_t accmode = ap->a_accmode; int error; #ifdef UFS_ACL struct acl *acl; acl_type_t type; #endif /* * Disallow write attempts on read-only filesystems; * unless the file is a socket, fifo, or a block or * character device resident on the filesystem. */ if (accmode & VMODIFY_PERMS) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); #ifdef QUOTA /* * Inode is accounted in the quotas only if struct * dquot is attached to it. VOP_ACCESS() is called * from vn_open_cred() and provides a convenient * point to call getinoquota(). The lock mode is * exclusive when the file is opening for write. */ if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) { error = getinoquota(ip); if (error != 0) return (error); } #endif break; default: break; } } /* * If immutable bit set, nobody gets to write it. "& ~VADMIN_PERMS" * permits the owner of the file to remove the IMMUTABLE flag. */ if ((accmode & (VMODIFY_PERMS & ~VADMIN_PERMS)) && (ip->i_flags & (IMMUTABLE | SF_SNAPSHOT))) return (EPERM); #ifdef UFS_ACL if ((vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS)) != 0) { if (vp->v_mount->mnt_flag & MNT_NFS4ACLS) type = ACL_TYPE_NFS4; else type = ACL_TYPE_ACCESS; acl = acl_alloc(M_WAITOK); if (type == ACL_TYPE_NFS4) error = ufs_getacl_nfs4_internal(vp, acl, ap->a_td); else error = VOP_GETACL(vp, type, acl, ap->a_cred, ap->a_td); switch (error) { case 0: if (type == ACL_TYPE_NFS4) { error = vaccess_acl_nfs4(vp->v_type, ip->i_uid, ip->i_gid, acl, accmode, ap->a_cred, NULL); } else { error = vfs_unixify_accmode(&accmode); if (error == 0) error = vaccess_acl_posix1e(vp->v_type, ip->i_uid, ip->i_gid, acl, accmode, ap->a_cred, NULL); } break; default: if (error != EOPNOTSUPP) printf( "ufs_accessx(): Error retrieving ACL on object (%d).\n", error); /* * XXX: Fall back until debugged. Should * eventually possibly log an error, and return * EPERM for safety. */ error = vfs_unixify_accmode(&accmode); if (error == 0) error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, accmode, ap->a_cred, NULL); } acl_free(acl); return (error); } #endif /* !UFS_ACL */ error = vfs_unixify_accmode(&accmode); if (error == 0) error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, accmode, ap->a_cred, NULL); return (error); } /* ARGSUSED */ static int ufs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct vattr *vap = ap->a_vap; VI_LOCK(vp); ufs_itimes_locked(vp); if (I_IS_UFS1(ip)) { vap->va_atime.tv_sec = ip->i_din1->di_atime; vap->va_atime.tv_nsec = ip->i_din1->di_atimensec; } else { vap->va_atime.tv_sec = ip->i_din2->di_atime; vap->va_atime.tv_nsec = ip->i_din2->di_atimensec; } VI_UNLOCK(vp); /* * Copy from inode table */ vap->va_fsid = dev2udev(ITOUMP(ip)->um_dev); vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode & ~IFMT; vap->va_nlink = ip->i_effnlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; if (I_IS_UFS1(ip)) { vap->va_rdev = ip->i_din1->di_rdev; vap->va_size = ip->i_din1->di_size; vap->va_mtime.tv_sec = ip->i_din1->di_mtime; vap->va_mtime.tv_nsec = ip->i_din1->di_mtimensec; vap->va_ctime.tv_sec = ip->i_din1->di_ctime; vap->va_ctime.tv_nsec = ip->i_din1->di_ctimensec; vap->va_bytes = dbtob((u_quad_t)ip->i_din1->di_blocks); vap->va_filerev = ip->i_din1->di_modrev; } else { vap->va_rdev = ip->i_din2->di_rdev; vap->va_size = ip->i_din2->di_size; vap->va_mtime.tv_sec = ip->i_din2->di_mtime; vap->va_mtime.tv_nsec = ip->i_din2->di_mtimensec; vap->va_ctime.tv_sec = ip->i_din2->di_ctime; vap->va_ctime.tv_nsec = ip->i_din2->di_ctimensec; vap->va_birthtime.tv_sec = ip->i_din2->di_birthtime; vap->va_birthtime.tv_nsec = ip->i_din2->di_birthnsec; vap->va_bytes = dbtob((u_quad_t)ip->i_din2->di_blocks); vap->va_filerev = ip->i_din2->di_modrev; } vap->va_flags = ip->i_flags; vap->va_gen = ip->i_gen; vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; vap->va_type = IFTOVT(ip->i_mode); return (0); } /* * Set attribute vnode op. called from several syscalls */ static int ufs_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ucred *cred = ap->a_cred; struct thread *td = curthread; int error; /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } if (vap->va_flags != VNOVAL) { if ((vap->va_flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE | SF_NOUNLINK | SF_SNAPSHOT | UF_APPEND | UF_ARCHIVE | UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP | UF_NOUNLINK | UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE | UF_SPARSE | UF_SYSTEM)) != 0) return (EOPNOTSUPP); if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) return (error); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. * Privileged jail processes behave like privileged non-jail * processes if the PR_ALLOW_CHFLAGS permission bit is set; * otherwise, they behave like unprivileged processes. */ if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) { if (ip->i_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { error = securelevel_gt(cred, 0); if (error) return (error); } /* The snapshot flag cannot be toggled. */ if ((vap->va_flags ^ ip->i_flags) & SF_SNAPSHOT) return (EPERM); } else { if (ip->i_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || ((vap->va_flags ^ ip->i_flags) & SF_SETTABLE)) return (EPERM); } ip->i_flags = vap->va_flags; DIP_SET(ip, i_flags, vap->va_flags); ip->i_flag |= IN_CHANGE; error = UFS_UPDATE(vp, 0); if (ip->i_flags & (IMMUTABLE | APPEND)) return (error); } /* * If immutable or append, no one can change any of its attributes * except the ones already handled (in some cases, file flags * including the immutability flags themselves for the superuser). */ if (ip->i_flags & (IMMUTABLE | APPEND)) return (EPERM); /* * Go through the fields and update iff not VNOVAL. */ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, td)) != 0) return (error); } if (vap->va_size != VNOVAL) { /* * XXX most of the following special cases should be in * callers instead of in N filesystems. The VDIR check * mostly already is. */ switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: /* * Truncation should have an effect in these cases. * Disallow it if the filesystem is read-only or * the file is being snapshotted. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((ip->i_flags & SF_SNAPSHOT) != 0) return (EPERM); break; default: /* * According to POSIX, the result is unspecified * for file types other than regular files, * directories and shared memory objects. We * don't support shared memory objects in the file * system, and have dubious support for truncating * symlinks. Just ignore the request in other cases. */ return (0); } if ((error = UFS_TRUNCATE(vp, vap->va_size, IO_NORMAL | ((vap->va_vaflags & VA_SYNC) != 0 ? IO_SYNC : 0), cred)) != 0) return (error); } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL || vap->va_birthtime.tv_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((ip->i_flags & SF_SNAPSHOT) != 0) return (EPERM); error = vn_utimes_perm(vp, vap, cred, td); if (error != 0) return (error); ip->i_flag |= IN_CHANGE | IN_MODIFIED; if (vap->va_atime.tv_sec != VNOVAL) { ip->i_flag &= ~IN_ACCESS; DIP_SET(ip, i_atime, vap->va_atime.tv_sec); DIP_SET(ip, i_atimensec, vap->va_atime.tv_nsec); } if (vap->va_mtime.tv_sec != VNOVAL) { ip->i_flag &= ~IN_UPDATE; DIP_SET(ip, i_mtime, vap->va_mtime.tv_sec); DIP_SET(ip, i_mtimensec, vap->va_mtime.tv_nsec); } if (vap->va_birthtime.tv_sec != VNOVAL && I_IS_UFS2(ip)) { ip->i_din2->di_birthtime = vap->va_birthtime.tv_sec; ip->i_din2->di_birthnsec = vap->va_birthtime.tv_nsec; } error = UFS_UPDATE(vp, 0); if (error) return (error); } error = 0; if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if ((ip->i_flags & SF_SNAPSHOT) != 0 && (vap->va_mode & (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP | S_IXOTH | S_IWOTH))) return (EPERM); error = ufs_chmod(vp, (int)vap->va_mode, cred, td); } return (error); } #ifdef UFS_ACL static int ufs_update_nfs4_acl_after_mode_change(struct vnode *vp, int mode, int file_owner_id, struct ucred *cred, struct thread *td) { int error; struct acl *aclp; aclp = acl_alloc(M_WAITOK); error = ufs_getacl_nfs4_internal(vp, aclp, td); /* * We don't have to handle EOPNOTSUPP here, as the filesystem claims * it supports ACLs. */ if (error) goto out; acl_nfs4_sync_acl_from_mode(aclp, mode, file_owner_id); error = ufs_setacl_nfs4_internal(vp, aclp, td); out: acl_free(aclp); return (error); } #endif /* UFS_ACL */ /* * Mark this file's access time for update for vfs_mark_atime(). This * is called from execve() and mmap(). */ static int ufs_markatime(ap) struct vop_markatime_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); VI_LOCK(vp); ip->i_flag |= IN_ACCESS; VI_UNLOCK(vp); /* * XXXKIB No UFS_UPDATE(ap->a_vp, 0) there. */ return (0); } /* * Change the mode on a file. * Inode must be locked before calling. */ static int ufs_chmod(vp, mode, cred, td) struct vnode *vp; int mode; struct ucred *cred; struct thread *td; { struct inode *ip = VTOI(vp); int error; /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESSX(vp, VWRITE_ACL, cred, td))) return (error); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the * process is not a member of. Both of these are allowed in * jail(8). */ if (vp->v_type != VDIR && (mode & S_ISTXT)) { if (priv_check_cred(cred, PRIV_VFS_STICKYFILE)) return (EFTYPE); } if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) { error = priv_check_cred(cred, PRIV_VFS_SETGID); if (error) return (error); } /* * Deny setting setuid if we are not the file owner. */ if ((mode & ISUID) && ip->i_uid != cred->cr_uid) { error = priv_check_cred(cred, PRIV_VFS_ADMIN); if (error) return (error); } ip->i_mode &= ~ALLPERMS; ip->i_mode |= (mode & ALLPERMS); DIP_SET(ip, i_mode, ip->i_mode); ip->i_flag |= IN_CHANGE; #ifdef UFS_ACL if ((vp->v_mount->mnt_flag & MNT_NFS4ACLS) != 0) error = ufs_update_nfs4_acl_after_mode_change(vp, mode, ip->i_uid, cred, td); #endif if (error == 0 && (ip->i_flag & IN_CHANGE) != 0) error = UFS_UPDATE(vp, 0); return (error); } /* * Perform chown operation on inode ip; * inode must be locked prior to call. */ static int ufs_chown(vp, uid, gid, cred, td) struct vnode *vp; uid_t uid; gid_t gid; struct ucred *cred; struct thread *td; { struct inode *ip = VTOI(vp); uid_t ouid; gid_t ogid; int error = 0; #ifdef QUOTA int i; ufs2_daddr_t change; #endif if (uid == (uid_t)VNOVAL) uid = ip->i_uid; if (gid == (gid_t)VNOVAL) gid = ip->i_gid; /* * To modify the ownership of a file, must possess VADMIN for that * file. */ if ((error = VOP_ACCESSX(vp, VWRITE_OWNER, cred, td))) return (error); /* * To change the owner of a file, or change the group of a file to a * group of which we are not a member, the caller must have * privilege. */ if (((uid != ip->i_uid && uid != cred->cr_uid) || (gid != ip->i_gid && !groupmember(gid, cred))) && (error = priv_check_cred(cred, PRIV_VFS_CHOWN))) return (error); ogid = ip->i_gid; ouid = ip->i_uid; #ifdef QUOTA if ((error = getinoquota(ip)) != 0) return (error); if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } change = DIP(ip, i_blocks); (void) chkdq(ip, -change, cred, CHOWN); (void) chkiq(ip, -1, cred, CHOWN); for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } #endif ip->i_gid = gid; DIP_SET(ip, i_gid, gid); ip->i_uid = uid; DIP_SET(ip, i_uid, uid); #ifdef QUOTA if ((error = getinoquota(ip)) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } if ((error = chkdq(ip, change, cred, CHOWN)) == 0) { if ((error = chkiq(ip, 1, cred, CHOWN)) == 0) goto good; else (void) chkdq(ip, -change, cred, CHOWN|FORCE); } for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } } ip->i_gid = ogid; DIP_SET(ip, i_gid, ogid); ip->i_uid = ouid; DIP_SET(ip, i_uid, ouid); if (getinoquota(ip) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } (void) chkdq(ip, change, cred, FORCE|CHOWN); (void) chkiq(ip, 1, cred, FORCE|CHOWN); (void) getinoquota(ip); } return (error); good: if (getinoquota(ip)) panic("ufs_chown: lost quota"); #endif /* QUOTA */ ip->i_flag |= IN_CHANGE; if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) { ip->i_mode &= ~(ISUID | ISGID); DIP_SET(ip, i_mode, ip->i_mode); } } error = UFS_UPDATE(vp, 0); return (error); } static int ufs_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct inode *ip; struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; int error; struct thread *td; td = curthread; ip = VTOI(vp); if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(dvp)->i_flags & APPEND)) { error = EPERM; goto out; } #ifdef UFS_GJOURNAL ufs_gjournal_orphan(vp); #endif error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0); if (ip->i_nlink <= 0) vp->v_vflag |= VV_NOSYNC; if ((ip->i_flags & SF_SNAPSHOT) != 0) { /* * Avoid deadlock where another thread is trying to * update the inodeblock for dvp and is waiting on * snaplk. Temporary unlock the vnode lock for the * unlinked file and sync the directory. This should * allow vput() of the directory to not block later on * while holding the snapshot vnode locked, assuming * that the directory hasn't been unlinked too. */ VOP_UNLOCK(vp, 0); (void) VOP_FSYNC(dvp, MNT_WAIT, td); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } out: return (error); } static void print_bad_link_count(const char *funcname, struct vnode *dvp) { struct inode *dip; dip = VTOI(dvp); uprintf("%s: Bad link count %d on parent inode %jd in file system %s\n", funcname, dip->i_effnlink, (intmax_t)dip->i_number, dvp->v_mount->mnt_stat.f_mntonname); } /* * link vnode call */ static int ufs_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct inode *ip; struct direct newdir; int error; #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_link: no name"); #endif if (VTOI(tdvp)->i_effnlink < 2) { print_bad_link_count("ufs_link", tdvp); error = EINVAL; goto out; } ip = VTOI(vp); if (ip->i_nlink >= UFS_LINK_MAX) { error = EMLINK; goto out; } /* * The file may have been removed after namei droped the original * lock. */ if (ip->i_effnlink == 0) { error = ENOENT; goto out; } if (ip->i_flags & (IMMUTABLE | APPEND)) { error = EPERM; goto out; } ip->i_effnlink++; ip->i_nlink++; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) softdep_setup_link(VTOI(tdvp), ip); error = UFS_UPDATE(vp, !DOINGSOFTDEP(vp) && !DOINGASYNC(vp)); if (!error) { ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL, 0); } if (error) { ip->i_effnlink--; ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) softdep_revert_link(VTOI(tdvp), ip); } out: return (error); } /* * whiteout vnode call */ static int ufs_whiteout(ap) struct vop_whiteout_args /* { struct vnode *a_dvp; struct componentname *a_cnp; int a_flags; } */ *ap; { struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct direct newdir; int error = 0; switch (ap->a_flags) { case LOOKUP: /* 4.4 format directories support whiteout operations */ if (dvp->v_mount->mnt_maxsymlinklen > 0) return (0); return (EOPNOTSUPP); case CREATE: /* create a new directory whiteout */ #ifdef INVARIANTS if ((cnp->cn_flags & SAVENAME) == 0) panic("ufs_whiteout: missing name"); if (dvp->v_mount->mnt_maxsymlinklen <= 0) panic("ufs_whiteout: old format filesystem"); #endif newdir.d_ino = UFS_WINO; newdir.d_namlen = cnp->cn_namelen; bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1); newdir.d_type = DT_WHT; error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL, 0); break; case DELETE: /* remove an existing directory whiteout */ #ifdef INVARIANTS if (dvp->v_mount->mnt_maxsymlinklen <= 0) panic("ufs_whiteout: old format filesystem"); #endif cnp->cn_flags &= ~DOWHITEOUT; error = ufs_dirremove(dvp, NULL, cnp->cn_flags, 0); break; default: panic("ufs_whiteout: unknown op"); } return (error); } static volatile int rename_restarts; SYSCTL_INT(_vfs_ufs, OID_AUTO, rename_restarts, CTLFLAG_RD, __DEVOLATILE(int *, &rename_restarts), 0, "Times rename had to restart due to lock contention"); /* * Rename system call. * rename("foo", "bar"); * is essentially * unlink("bar"); * link("foo", "bar"); * unlink("foo"); * but ``atomically''. Can't do full commit without saving state in the * inode on disk which isn't feasible at this time. Best we can do is * always guarantee the target exists. * * Basic algorithm is: * * 1) Bump link count on source while we're linking it to the * target. This also ensure the inode won't be deleted out * from underneath us while we work (it may be truncated by * a concurrent `trunc' or `open' for creation). * 2) Link source to destination. If destination already exists, * delete it first. * 3) Unlink source reference to inode if still around. If a * directory was moved and the parent of the destination * is different from the source, patch the ".." entry in the * directory. */ static int ufs_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { struct vnode *tvp = ap->a_tvp; struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct vnode *nvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct thread *td = fcnp->cn_thread; struct inode *fip, *tip, *tdp, *fdp; struct direct newdir; off_t endoff; int doingdirectory, newparent; int error = 0; struct mount *mp; ino_t ino; #ifdef INVARIANTS if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("ufs_rename: no name"); #endif endoff = 0; mp = tdvp->v_mount; VOP_UNLOCK(tdvp, 0); if (tvp && tvp != tdvp) VOP_UNLOCK(tvp, 0); /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; mp = NULL; goto releout; } relock: /* * We need to acquire 2 to 4 locks depending on whether tvp is NULL * and fdvp and tdvp are the same directory. Subsequently we need * to double-check all paths and in the directory rename case we * need to verify that we are not creating a directory loop. To * handle this we acquire all but fdvp using non-blocking * acquisitions. If we fail to acquire any lock in the path we will * drop all held locks, acquire the new lock in a blocking fashion, * and then release it and restart the rename. This acquire/release * step ensures that we do not spin on a lock waiting for release. */ error = vn_lock(fdvp, LK_EXCLUSIVE); if (error) goto releout; if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { VOP_UNLOCK(fdvp, 0); error = vn_lock(tdvp, LK_EXCLUSIVE); if (error) goto releout; VOP_UNLOCK(tdvp, 0); atomic_add_int(&rename_restarts, 1); goto relock; } /* * Re-resolve fvp to be certain it still exists and fetch the * correct vnode. */ error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); if (error) { VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(tdvp, 0); goto releout; } error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp); if (error) { VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(tdvp, 0); if (error != EBUSY) goto releout; error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp); if (error != 0) goto releout; VOP_UNLOCK(nvp, 0); vrele(fvp); fvp = nvp; atomic_add_int(&rename_restarts, 1); goto relock; } vrele(fvp); fvp = nvp; /* * Re-resolve tvp and acquire the vnode lock if present. */ error = ufs_lookup_ino(tdvp, NULL, tcnp, &ino); if (error != 0 && error != EJUSTRETURN) { VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(tdvp, 0); VOP_UNLOCK(fvp, 0); goto releout; } /* * If tvp disappeared we just carry on. */ if (error == EJUSTRETURN && tvp != NULL) { vrele(tvp); tvp = NULL; } /* * Get the tvp ino if the lookup succeeded. We may have to restart * if the non-blocking acquire fails. */ if (error == 0) { nvp = NULL; error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp); if (tvp) vrele(tvp); tvp = nvp; if (error) { VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(tdvp, 0); VOP_UNLOCK(fvp, 0); if (error != EBUSY) goto releout; error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp); if (error != 0) goto releout; vput(nvp); atomic_add_int(&rename_restarts, 1); goto relock; } } fdp = VTOI(fdvp); fip = VTOI(fvp); tdp = VTOI(tdvp); tip = NULL; if (tvp) tip = VTOI(tvp); if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; goto unlockout; } /* * Renaming a file to itself has no effect. The upper layers should * not call us in that case. However, things could change after * we drop the locks above. */ if (fvp == tvp) { error = 0; goto unlockout; } doingdirectory = 0; newparent = 0; ino = fip->i_number; if (fip->i_nlink >= UFS_LINK_MAX) { error = EMLINK; goto unlockout; } if ((fip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || (fdp->i_flags & APPEND)) { error = EPERM; goto unlockout; } if ((fip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || fdp == fip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = EINVAL; goto unlockout; } if (fdp->i_number != tdp->i_number) newparent = tdp->i_number; doingdirectory = 1; } if ((fvp->v_type == VDIR && fvp->v_mountedhere != NULL) || (tvp != NULL && tvp->v_type == VDIR && tvp->v_mountedhere != NULL)) { error = EXDEV; goto unlockout; } /* * If ".." must be changed (ie the directory gets a new * parent) then the source directory must not be in the * directory hierarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". */ if (doingdirectory && newparent) { error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); if (error) goto unlockout; error = ufs_checkpath(ino, fdp->i_number, tdp, tcnp->cn_cred, &ino); /* * We encountered a lock that we have to wait for. Unlock * everything else and VGET before restarting. */ if (ino) { VOP_UNLOCK(fdvp, 0); VOP_UNLOCK(fvp, 0); VOP_UNLOCK(tdvp, 0); if (tvp) VOP_UNLOCK(tvp, 0); error = VFS_VGET(mp, ino, LK_SHARED, &nvp); if (error == 0) vput(nvp); atomic_add_int(&rename_restarts, 1); goto relock; } if (error) goto unlockout; if ((tcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost to startdir"); } if (fip->i_effnlink == 0 || fdp->i_effnlink == 0 || tdp->i_effnlink == 0) panic("Bad effnlink fip %p, fdp %p, tdp %p", fip, fdp, tdp); /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before * completing our work, the link count * may be wrong, but correctable. */ fip->i_effnlink++; fip->i_nlink++; DIP_SET(fip, i_nlink, fip->i_nlink); fip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(fvp)) softdep_setup_link(tdp, fip); error = UFS_UPDATE(fvp, !DOINGSOFTDEP(fvp) && !DOINGASYNC(fvp)); if (error) goto bad; /* * 2) If target doesn't exist, link the target * to the source and unlink the source. * Otherwise, rewrite the target directory * entry to reference the source inode and * expunge the original entry's existence. */ if (tip == NULL) { if (ITODEV(tdp) != ITODEV(fip)) panic("ufs_rename: EXDEV"); if (doingdirectory && newparent) { /* * Account for ".." in new directory. * When source and destination have the same * parent we don't adjust the link count. The * actual link modification is completed when * .. is rewritten below. */ if (tdp->i_nlink >= UFS_LINK_MAX) { error = EMLINK; goto bad; } } ufs_makedirentry(fip, tcnp, &newdir); error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL, 1); if (error) goto bad; /* Setup tdvp for directory compaction if needed. */ if (tdp->i_count && tdp->i_endoff && tdp->i_endoff < tdp->i_size) endoff = tdp->i_endoff; } else { if (ITODEV(tip) != ITODEV(tdp) || ITODEV(tip) != ITODEV(fip)) panic("ufs_rename: EXDEV"); /* * Short circuit rename(foo, foo). */ if (tip->i_number == fip->i_number) panic("ufs_rename: same file"); /* * If the parent directory is "sticky", then the caller * must possess VADMIN for the parent directory, or the * destination of the rename. This implements append-only * directories. */ if ((tdp->i_mode & S_ISTXT) && VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) && VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) { error = EPERM; goto bad; } /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if ((tip->i_mode & IFMT) == IFDIR) { if ((tip->i_effnlink > 2) || !ufs_dirempty(tip, tdp->i_number, tcnp->cn_cred)) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } if (doingdirectory) { if (!newparent) { tdp->i_effnlink--; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(tdp); } tip->i_effnlink--; if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(tip); } error = ufs_dirrewrite(tdp, tip, fip->i_number, IFTODT(fip->i_mode), (doingdirectory && newparent) ? newparent : doingdirectory); if (error) { if (doingdirectory) { if (!newparent) { tdp->i_effnlink++; if (DOINGSOFTDEP(tdvp)) softdep_change_linkcnt(tdp); } tip->i_effnlink++; if (DOINGSOFTDEP(tvp)) softdep_change_linkcnt(tip); } } if (doingdirectory && !DOINGSOFTDEP(tvp)) { /* * The only stuff left in the directory is "." * and "..". The "." reference is inconsequential * since we are quashing it. We have removed the "." * reference and the reference in the parent directory, * but there may be other hard links. The soft * dependency code will arrange to do these operations * after the parent directory entry has been deleted on * disk, so when running with that code we avoid doing * them now. */ if (!newparent) { tdp->i_nlink--; DIP_SET(tdp, i_nlink, tdp->i_nlink); tdp->i_flag |= IN_CHANGE; } tip->i_nlink--; DIP_SET(tip, i_nlink, tip->i_nlink); tip->i_flag |= IN_CHANGE; } } /* * 3) Unlink the source. We have to resolve the path again to * fixup the directory offset and count for ufs_dirremove. */ if (fdvp == tdvp) { error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); if (error) panic("ufs_rename: from entry went away!"); if (ino != fip->i_number) panic("ufs_rename: ino mismatch %ju != %ju\n", (uintmax_t)ino, (uintmax_t)fip->i_number); } /* * If the source is a directory with a * new parent, the link count of the old * parent directory must be decremented * and ".." set to point to the new parent. */ if (doingdirectory && newparent) { /* * If tip exists we simply use its link, otherwise we must * add a new one. */ if (tip == NULL) { tdp->i_effnlink++; tdp->i_nlink++; DIP_SET(tdp, i_nlink, tdp->i_nlink); tdp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tdvp)) softdep_setup_dotdot_link(tdp, fip); error = UFS_UPDATE(tdvp, !DOINGSOFTDEP(tdvp) && !DOINGASYNC(tdvp)); /* Don't go to bad here as the new link exists. */ if (error) goto unlockout; } else if (DOINGSUJ(tdvp)) /* Journal must account for each new link. */ softdep_setup_dotdot_link(tdp, fip); fip->i_offset = mastertemplate.dot_reclen; ufs_dirrewrite(fip, fdp, newparent, DT_DIR, 0); cache_purge(fdvp); } error = ufs_dirremove(fdvp, fip, fcnp->cn_flags, 0); /* * The kern_renameat() looks up the fvp using the DELETE flag, which * causes the removal of the name cache entry for fvp. * As the relookup of the fvp is done in two steps: * ufs_lookup_ino() and then VFS_VGET(), another thread might do a * normal lookup of the from name just before the VFS_VGET() call, * causing the cache entry to be re-instantiated. * * The same issue also applies to tvp if it exists as * otherwise we may have a stale name cache entry for the new * name that references the old i-node if it has other links * or open file descriptors. */ cache_purge(fvp); if (tvp) cache_purge(tvp); cache_purge_negative(tdvp); unlockout: vput(fdvp); vput(fvp); if (tvp) vput(tvp); /* * If compaction or fsync was requested do it now that other locks * are no longer needed. */ if (error == 0 && endoff != 0) { error = UFS_TRUNCATE(tdvp, endoff, IO_NORMAL | (DOINGASYNC(tdvp) ? 0 : IO_SYNC), tcnp->cn_cred); if (error != 0) vn_printf(tdvp, "ufs_rename: failed to truncate, error %d\n", error); #ifdef UFS_DIRHASH else if (tdp->i_dirhash != NULL) ufsdirhash_dirtrunc(tdp, endoff); #endif /* * Even if the directory compaction failed, rename was * succesful. Do not propagate a UFS_TRUNCATE() error * to the caller. */ error = 0; } if (error == 0 && tdp->i_flag & IN_NEEDSYNC) error = VOP_FSYNC(tdvp, MNT_WAIT, td); vput(tdvp); return (error); bad: fip->i_effnlink--; fip->i_nlink--; DIP_SET(fip, i_nlink, fip->i_nlink); fip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(fvp)) softdep_revert_link(tdp, fip); goto unlockout; releout: vrele(fdvp); vrele(fvp); vrele(tdvp); if (tvp) vrele(tvp); return (error); } #ifdef UFS_ACL static int ufs_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp, mode_t dmode, struct ucred *cred, struct thread *td) { int error; struct inode *ip = VTOI(tvp); struct acl *dacl, *acl; acl = acl_alloc(M_WAITOK); dacl = acl_alloc(M_WAITOK); /* * Retrieve default ACL from parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. If the ACL is empty, fall through to * the "not defined or available" case. */ if (acl->acl_cnt != 0) { dmode = acl_posix1e_newfilemode(dmode, acl); ip->i_mode = dmode; DIP_SET(ip, i_mode, dmode); *dacl = *acl; ufs_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ ip->i_mode = dmode; DIP_SET(ip, i_mode, dmode); error = 0; goto out; default: goto out; } /* * XXX: If we abort now, will Soft Updates notify the extattr * code that the EAs for the file need to be released? */ error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td); if (error == 0) error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, dacl, cred, td); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above * was supposed to free acl. */ printf("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()\n"); /* panic("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()"); */ break; default: goto out; } out: acl_free(acl); acl_free(dacl); return (error); } static int ufs_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp, mode_t mode, struct ucred *cred, struct thread *td) { int error; struct inode *ip = VTOI(tvp); struct acl *acl; acl = acl_alloc(M_WAITOK); /* * Retrieve default ACL for parent, if any. */ error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td); switch (error) { case 0: /* * Retrieved a default ACL, so merge mode and ACL if * necessary. */ if (acl->acl_cnt != 0) { /* * Two possible ways for default ACL to not * be present. First, the EA can be * undefined, or second, the default ACL can * be blank. If it's blank, fall through to * the it's not defined case. */ mode = acl_posix1e_newfilemode(mode, acl); ip->i_mode = mode; DIP_SET(ip, i_mode, mode); ufs_sync_acl_from_inode(ip, acl); break; } /* FALLTHROUGH */ case EOPNOTSUPP: /* * Just use the mode as-is. */ ip->i_mode = mode; DIP_SET(ip, i_mode, mode); error = 0; goto out; default: goto out; } /* * XXX: If we abort now, will Soft Updates notify the extattr * code that the EAs for the file need to be released? */ error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td); switch (error) { case 0: break; case EOPNOTSUPP: /* * XXX: This should not happen, as EOPNOTSUPP above was * supposed to free acl. */ printf("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() " "but no VOP_SETACL()\n"); /* panic("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() " "but no VOP_SETACL()"); */ break; default: goto out; } out: acl_free(acl); return (error); } static int ufs_do_nfs4_acl_inheritance(struct vnode *dvp, struct vnode *tvp, mode_t child_mode, struct ucred *cred, struct thread *td) { int error; struct acl *parent_aclp, *child_aclp; parent_aclp = acl_alloc(M_WAITOK); child_aclp = acl_alloc(M_WAITOK | M_ZERO); error = ufs_getacl_nfs4_internal(dvp, parent_aclp, td); if (error) goto out; acl_nfs4_compute_inherited_acl(parent_aclp, child_aclp, child_mode, VTOI(tvp)->i_uid, tvp->v_type == VDIR); error = ufs_setacl_nfs4_internal(tvp, child_aclp, td); if (error) goto out; out: acl_free(parent_aclp); acl_free(child_aclp); return (error); } #endif /* * Mkdir system call */ static int ufs_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vnode *dvp = ap->a_dvp; struct vattr *vap = ap->a_vap; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; struct vnode *tvp; struct buf *bp; struct dirtemplate dirtemplate, *dtp; struct direct newdir; int error, dmode; long blkoff; #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_mkdir: no name"); #endif dp = VTOI(dvp); if (dp->i_nlink >= UFS_LINK_MAX) { error = EMLINK; goto out; } dmode = vap->va_mode & 0777; dmode |= IFDIR; /* * Must simulate part of ufs_makeinode here to acquire the inode, * but not have it entered in the parent directory. The entry is * made later after writing "." and ".." entries. */ if (dp->i_effnlink < 2) { print_bad_link_count("ufs_mkdir", dvp); error = EINVAL; goto out; } error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp); if (error) goto out; ip = VTOI(tvp); ip->i_gid = dp->i_gid; DIP_SET(ip, i_gid, dp->i_gid); #ifdef SUIDDIR { #ifdef QUOTA struct ucred ucred, *ucp; gid_t ucred_group; ucp = cnp->cn_cred; #endif /* * If we are hacking owners here, (only do this where told to) * and we are not giving it TO root, (would subvert quotas) * then go ahead and give it to the other user. * The new directory also inherits the SUID bit. * If user's UID and dir UID are the same, * 'give it away' so that the SUID is still forced on. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (dp->i_mode & ISUID) && dp->i_uid) { dmode |= ISUID; ip->i_uid = dp->i_uid; DIP_SET(ip, i_uid, dp->i_uid); #ifdef QUOTA if (dp->i_uid != cnp->cn_cred->cr_uid) { /* * Make sure the correct user gets charged * for the space. * Make a dummy credential for the victim. * XXX This seems to never be accessed out of * our context so a stack variable is ok. */ refcount_init(&ucred.cr_ref, 1); ucred.cr_uid = ip->i_uid; ucred.cr_ngroups = 1; ucred.cr_groups = &ucred_group; ucred.cr_groups[0] = dp->i_gid; ucp = &ucred; } #endif } else { ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); } #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, ucp, 0))) { if (DOINGSOFTDEP(tvp)) softdep_revert_link(dp, ip); UFS_VFREE(tvp, ip->i_number, dmode); vput(tvp); return (error); } #endif } #else /* !SUIDDIR */ ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { if (DOINGSOFTDEP(tvp)) softdep_revert_link(dp, ip); UFS_VFREE(tvp, ip->i_number, dmode); vput(tvp); return (error); } #endif #endif /* !SUIDDIR */ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = dmode; DIP_SET(ip, i_mode, dmode); tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ ip->i_effnlink = 2; ip->i_nlink = 2; DIP_SET(ip, i_nlink, 2); if (cnp->cn_flags & ISWHITEOUT) { ip->i_flags |= UF_OPAQUE; DIP_SET(ip, i_flags, ip->i_flags); } /* * Bump link count in parent directory to reflect work done below. * Should be done before reference is created so cleanup is * possible if we crash. */ dp->i_effnlink++; dp->i_nlink++; DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(dvp)) softdep_setup_mkdir(dp, ip); error = UFS_UPDATE(dvp, !DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp)); if (error) goto bad; #ifdef MAC if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) { error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount, dvp, tvp, cnp); if (error) goto bad; } #endif #ifdef UFS_ACL if (dvp->v_mount->mnt_flag & MNT_ACLS) { error = ufs_do_posix1e_acl_inheritance_dir(dvp, tvp, dmode, cnp->cn_cred, cnp->cn_thread); if (error) goto bad; } else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) { error = ufs_do_nfs4_acl_inheritance(dvp, tvp, dmode, cnp->cn_cred, cnp->cn_thread); if (error) goto bad; } #endif /* !UFS_ACL */ /* * Initialize directory with "." and ".." from static template. */ if (dvp->v_mount->mnt_maxsymlinklen > 0) dtp = &mastertemplate; else dtp = (struct dirtemplate *)&omastertemplate; dirtemplate = *dtp; dirtemplate.dot_ino = ip->i_number; dirtemplate.dotdot_ino = dp->i_number; vnode_pager_setsize(tvp, DIRBLKSIZ); if ((error = UFS_BALLOC(tvp, (off_t)0, DIRBLKSIZ, cnp->cn_cred, BA_CLRBUF, &bp)) != 0) goto bad; ip->i_size = DIRBLKSIZ; DIP_SET(ip, i_size, DIRBLKSIZ); ip->i_flag |= IN_CHANGE | IN_UPDATE; bcopy((caddr_t)&dirtemplate, (caddr_t)bp->b_data, sizeof dirtemplate); if (DOINGSOFTDEP(tvp)) { /* * Ensure that the entire newly allocated block is a * valid directory so that future growth within the * block does not have to ensure that the block is * written before the inode. */ blkoff = DIRBLKSIZ; while (blkoff < bp->b_bcount) { ((struct direct *) (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ; blkoff += DIRBLKSIZ; } } if ((error = UFS_UPDATE(tvp, !DOINGSOFTDEP(tvp) && !DOINGASYNC(tvp))) != 0) { (void)bwrite(bp); goto bad; } /* * Directory set up, now install its entry in the parent directory. * * If we are not doing soft dependencies, then we must write out the * buffer containing the new directory body before entering the new * name in the parent. If we are doing soft dependencies, then the * buffer containing the new directory body will be passed to and * released in the soft dependency code after the code has attached * an appropriate ordering dependency to the buffer which ensures that * the buffer is written before the new name is written in the parent. */ if (DOINGASYNC(dvp)) bdwrite(bp); else if (!DOINGSOFTDEP(dvp) && ((error = bwrite(bp)))) goto bad; ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(dvp, tvp, &newdir, cnp, bp, 0); bad: if (error == 0) { *ap->a_vpp = tvp; } else { dp->i_effnlink--; dp->i_nlink--; DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; /* * No need to do an explicit VOP_TRUNCATE here, vrele will * do this for us because we set the link count to 0. */ ip->i_effnlink = 0; ip->i_nlink = 0; DIP_SET(ip, i_nlink, 0); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tvp)) softdep_revert_mkdir(dp, ip); vput(tvp); } out: return (error); } /* * Rmdir system call. */ static int ufs_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; int error; ip = VTOI(vp); dp = VTOI(dvp); /* * Do not remove a directory that is in the process of being renamed. * Verify the directory is empty (and valid). Rmdir ".." will not be * valid since ".." will contain a reference to the current directory * and thus be non-empty. Do not allow the removal of mounted on * directories (this can happen when an NFS exported filesystem * tries to remove a locally mounted on directory). */ error = 0; if (dp->i_effnlink <= 2) { if (dp->i_effnlink == 2) print_bad_link_count("ufs_rmdir", dvp); error = EINVAL; goto out; } if (!ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { error = ENOTEMPTY; goto out; } if ((dp->i_flags & APPEND) || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) { error = EPERM; goto out; } if (vp->v_mountedhere != 0) { error = EINVAL; goto out; } #ifdef UFS_GJOURNAL ufs_gjournal_orphan(vp); #endif /* * Delete reference to directory before purging * inode. If we crash in between, the directory * will be reattached to lost+found, */ dp->i_effnlink--; ip->i_effnlink--; if (DOINGSOFTDEP(vp)) softdep_setup_rmdir(dp, ip); error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1); if (error) { dp->i_effnlink++; ip->i_effnlink++; if (DOINGSOFTDEP(vp)) softdep_revert_rmdir(dp, ip); goto out; } cache_purge(dvp); /* * The only stuff left in the directory is "." and "..". The "." * reference is inconsequential since we are quashing it. The soft * dependency code will arrange to do these operations after * the parent directory entry has been deleted on disk, so * when running with that code we avoid doing them now. */ if (!DOINGSOFTDEP(vp)) { dp->i_nlink--; DIP_SET(dp, i_nlink, dp->i_nlink); dp->i_flag |= IN_CHANGE; error = UFS_UPDATE(dvp, 0); ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; } cache_purge(vp); #ifdef UFS_DIRHASH /* Kill any active hash; i_effnlink == 0, so it will not come back. */ if (ip->i_dirhash != NULL) ufsdirhash_free(ip); #endif out: return (error); } /* * symlink -- make a symbolic link */ static int ufs_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; const char *a_target; } */ *ap; { struct vnode *vp, **vpp = ap->a_vpp; struct inode *ip; int len, error; error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, vpp, ap->a_cnp, "ufs_symlink"); if (error) return (error); vp = *vpp; len = strlen(ap->a_target); if (len < vp->v_mount->mnt_maxsymlinklen) { ip = VTOI(vp); bcopy(ap->a_target, SHORTLINK(ip), len); ip->i_size = len; DIP_SET(ip, i_size, len); ip->i_flag |= IN_CHANGE | IN_UPDATE; error = UFS_UPDATE(vp, 0); } else error = vn_rdwr(UIO_WRITE, vp, __DECONST(void *, ap->a_target), len, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, ap->a_cnp->cn_cred, NOCRED, NULL, NULL); if (error) vput(vp); return (error); } /* * Vnode op for reading directories. */ int ufs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; } */ *ap; { struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct buf *bp; struct inode *ip; struct direct *dp, *edp; u_long *cookies; struct dirent dstdp; off_t offset, startoffset; size_t readcnt, skipcnt; ssize_t startresid; u_int ncookies; int error; if (uio->uio_offset < 0) return (EINVAL); ip = VTOI(vp); if (ip->i_effnlink == 0) return (0); if (ap->a_ncookies != NULL) { if (uio->uio_resid < 0) ncookies = 0; else ncookies = uio->uio_resid; if (uio->uio_offset >= ip->i_size) ncookies = 0; else if (ip->i_size - uio->uio_offset < ncookies) ncookies = ip->i_size - uio->uio_offset; ncookies = ncookies / (offsetof(struct direct, d_name) + 4) + 1; cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK); *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } else { ncookies = 0; cookies = NULL; } offset = startoffset = uio->uio_offset; startresid = uio->uio_resid; error = 0; while (error == 0 && uio->uio_resid > 0 && uio->uio_offset < ip->i_size) { error = ffs_blkatoff(vp, uio->uio_offset, NULL, &bp); if (error) break; if (bp->b_offset + bp->b_bcount > ip->i_size) readcnt = ip->i_size - bp->b_offset; else readcnt = bp->b_bcount; skipcnt = (size_t)(uio->uio_offset - bp->b_offset) & ~(size_t)(DIRBLKSIZ - 1); offset = bp->b_offset + skipcnt; dp = (struct direct *)&bp->b_data[skipcnt]; edp = (struct direct *)&bp->b_data[readcnt]; while (error == 0 && uio->uio_resid > 0 && dp < edp) { if (dp->d_reclen <= offsetof(struct direct, d_name) || (caddr_t)dp + dp->d_reclen > (caddr_t)edp) { error = EIO; break; } #if BYTE_ORDER == LITTLE_ENDIAN /* Old filesystem format. */ if (vp->v_mount->mnt_maxsymlinklen <= 0) { dstdp.d_namlen = dp->d_type; dstdp.d_type = dp->d_namlen; } else #endif { dstdp.d_namlen = dp->d_namlen; dstdp.d_type = dp->d_type; } if (offsetof(struct direct, d_name) + dstdp.d_namlen > dp->d_reclen) { error = EIO; break; } if (offset < startoffset || dp->d_ino == 0) goto nextentry; dstdp.d_fileno = dp->d_ino; dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp); bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen); /* NOTE: d_off is the offset of the *next* entry. */ dstdp.d_off = offset + dp->d_reclen; dirent_terminate(&dstdp); if (dstdp.d_reclen > uio->uio_resid) { if (uio->uio_resid == startresid) error = EINVAL; else error = EJUSTRETURN; break; } /* Advance dp. */ error = uiomove((caddr_t)&dstdp, dstdp.d_reclen, uio); if (error) break; if (cookies != NULL) { KASSERT(ncookies > 0, ("ufs_readdir: cookies buffer too small")); *cookies = offset + dp->d_reclen; cookies++; ncookies--; } nextentry: offset += dp->d_reclen; dp = (struct direct *)((caddr_t)dp + dp->d_reclen); } bqrelse(bp); uio->uio_offset = offset; } /* We need to correct uio_offset. */ uio->uio_offset = offset; if (error == EJUSTRETURN) error = 0; if (ap->a_ncookies != NULL) { if (error == 0) { ap->a_ncookies -= ncookies; } else { free(*ap->a_cookies, M_TEMP); *ap->a_ncookies = 0; *ap->a_cookies = NULL; } } if (error == 0 && ap->a_eofflag) *ap->a_eofflag = ip->i_size <= uio->uio_offset; return (error); } /* * Return target name of a symbolic link */ static int ufs_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); doff_t isize; isize = ip->i_size; if ((isize < vp->v_mount->mnt_maxsymlinklen) || DIP(ip, i_blocks) == 0) { /* XXX - for old fastlink support */ return (uiomove(SHORTLINK(ip), isize, ap->a_uio)); } return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. * * In order to be able to swap to a file, the ufs_bmaparray() operation may not * deadlock on memory. See ufs_bmap() for details. */ static int ufs_strategy(ap) struct vop_strategy_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap; { struct buf *bp = ap->a_bp; struct vnode *vp = ap->a_vp; ufs2_daddr_t blkno; int error; if (bp->b_blkno == bp->b_lblkno) { error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, bp, NULL, NULL); bp->b_blkno = blkno; if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return (0); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if ((long)bp->b_blkno == -1) { bufdone(bp); return (0); } bp->b_iooffset = dbtob(bp->b_blkno); BO_STRATEGY(VFSTOUFS(vp->v_mount)->um_bo, bp); return (0); } /* * Print out the contents of an inode. */ static int ufs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); printf("\tnlink=%d, effnlink=%d, size=%jd", ip->i_nlink, ip->i_effnlink, (intmax_t)ip->i_size); if (I_IS_UFS2(ip)) printf(", extsize %d", ip->i_din2->di_extsize); printf("\n\tgeneration=%jx, uid=%d, gid=%d, flags=0x%b\n", (uintmax_t)ip->i_gen, ip->i_uid, ip->i_gid, (u_int)ip->i_flags, PRINT_INODE_FLAGS); printf("\tino %lu, on dev %s", (u_long)ip->i_number, devtoname(ITODEV(ip))); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("\n"); return (0); } /* * Close wrapper for fifos. * * Update the times on the inode then do device close. */ static int ufsfifo_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; int usecount; VI_LOCK(vp); usecount = vp->v_usecount; if (usecount > 1) ufs_itimes_locked(vp); VI_UNLOCK(vp); return (fifo_specops.vop_close(ap)); } /* * Kqfilter wrapper for fifos. * * Fall through to ufs kqfilter routines if needed */ static int ufsfifo_kqfilter(ap) struct vop_kqfilter_args *ap; { int error; error = fifo_specops.vop_kqfilter(ap); if (error) error = vfs_kqfilter(ap); return (error); } /* * Return POSIX pathconf information applicable to ufs filesystems. */ static int ufs_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { int error; error = 0; switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = UFS_LINK_MAX; break; case _PC_NAME_MAX: *ap->a_retval = UFS_MAXNAMLEN; break; case _PC_PIPE_BUF: if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) *ap->a_retval = PIPE_BUF; else error = EINVAL; break; case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; break; case _PC_NO_TRUNC: *ap->a_retval = 1; break; #ifdef UFS_ACL case _PC_ACL_EXTENDED: if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS) *ap->a_retval = 1; else *ap->a_retval = 0; break; case _PC_ACL_NFS4: if (ap->a_vp->v_mount->mnt_flag & MNT_NFS4ACLS) *ap->a_retval = 1; else *ap->a_retval = 0; break; #endif case _PC_ACL_PATH_MAX: #ifdef UFS_ACL if (ap->a_vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS)) *ap->a_retval = ACL_MAX_ENTRIES; else *ap->a_retval = 3; #else *ap->a_retval = 3; #endif break; #ifdef MAC case _PC_MAC_PRESENT: if (ap->a_vp->v_mount->mnt_flag & MNT_MULTILABEL) *ap->a_retval = 1; else *ap->a_retval = 0; break; #endif case _PC_MIN_HOLE_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_PRIO_IO: *ap->a_retval = 0; break; case _PC_SYNC_IO: *ap->a_retval = 0; break; case _PC_ALLOC_SIZE_MIN: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize; break; case _PC_FILESIZEBITS: *ap->a_retval = 64; break; case _PC_REC_INCR_XFER_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_MAX_XFER_SIZE: *ap->a_retval = -1; /* means ``unlimited'' */ break; case _PC_REC_MIN_XFER_SIZE: *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; break; case _PC_REC_XFER_ALIGN: *ap->a_retval = PAGE_SIZE; break; case _PC_SYMLINK_MAX: *ap->a_retval = MAXPATHLEN; break; default: error = vop_stdpathconf(ap); break; } return (error); } /* * Initialize the vnode associated with a new inode, handle aliased * vnodes. */ int ufs_vinit(mntp, fifoops, vpp) struct mount *mntp; struct vop_vector *fifoops; struct vnode **vpp; { struct inode *ip; struct vnode *vp; vp = *vpp; ASSERT_VOP_LOCKED(vp, "ufs_vinit"); ip = VTOI(vp); vp->v_type = IFTOVT(ip->i_mode); /* * Only unallocated inodes should be of type VNON. */ if (ip->i_mode != 0 && vp->v_type == VNON) return (EINVAL); if (vp->v_type == VFIFO) vp->v_op = fifoops; if (ip->i_number == UFS_ROOTINO) vp->v_vflag |= VV_ROOT; *vpp = vp; return (0); } /* * Allocate a new inode. * Vnode dvp must be locked. */ static int ufs_makeinode(mode, dvp, vpp, cnp, callfunc) int mode; struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; const char *callfunc; { struct inode *ip, *pdir; struct direct newdir; struct vnode *tvp; int error; pdir = VTOI(dvp); #ifdef INVARIANTS if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", callfunc); #endif *vpp = NULL; if ((mode & IFMT) == 0) mode |= IFREG; if (pdir->i_effnlink < 2) { print_bad_link_count(callfunc, dvp); return (EINVAL); } error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp); if (error) return (error); ip = VTOI(tvp); ip->i_gid = pdir->i_gid; DIP_SET(ip, i_gid, pdir->i_gid); #ifdef SUIDDIR { #ifdef QUOTA struct ucred ucred, *ucp; gid_t ucred_group; ucp = cnp->cn_cred; #endif /* * If we are not the owner of the directory, * and we are hacking owners here, (only do this where told to) * and we are not giving it TO root, (would subvert quotas) * then go ahead and give it to the other user. * Note that this drops off the execute bits for security. */ if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && (pdir->i_mode & ISUID) && (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) { ip->i_uid = pdir->i_uid; DIP_SET(ip, i_uid, ip->i_uid); mode &= ~07111; #ifdef QUOTA /* * Make sure the correct user gets charged * for the space. * Quickly knock up a dummy credential for the victim. * XXX This seems to never be accessed out of our * context so a stack variable is ok. */ refcount_init(&ucred.cr_ref, 1); ucred.cr_uid = ip->i_uid; ucred.cr_ngroups = 1; ucred.cr_groups = &ucred_group; ucred.cr_groups[0] = pdir->i_gid; ucp = &ucred; #endif } else { ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); } #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, ucp, 0))) { if (DOINGSOFTDEP(tvp)) softdep_revert_link(pdir, ip); UFS_VFREE(tvp, ip->i_number, mode); vput(tvp); return (error); } #endif } #else /* !SUIDDIR */ ip->i_uid = cnp->cn_cred->cr_uid; DIP_SET(ip, i_uid, ip->i_uid); #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { if (DOINGSOFTDEP(tvp)) softdep_revert_link(pdir, ip); UFS_VFREE(tvp, ip->i_number, mode); vput(tvp); return (error); } #endif #endif /* !SUIDDIR */ ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = mode; DIP_SET(ip, i_mode, mode); tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ ip->i_effnlink = 1; ip->i_nlink = 1; DIP_SET(ip, i_nlink, 1); if (DOINGSOFTDEP(tvp)) softdep_setup_create(VTOI(dvp), ip); if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) && priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID)) { ip->i_mode &= ~ISGID; DIP_SET(ip, i_mode, ip->i_mode); } if (cnp->cn_flags & ISWHITEOUT) { ip->i_flags |= UF_OPAQUE; DIP_SET(ip, i_flags, ip->i_flags); } /* * Make sure inode goes to disk before directory entry. */ error = UFS_UPDATE(tvp, !DOINGSOFTDEP(tvp) && !DOINGASYNC(tvp)); if (error) goto bad; #ifdef MAC if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) { error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount, dvp, tvp, cnp); if (error) goto bad; } #endif #ifdef UFS_ACL if (dvp->v_mount->mnt_flag & MNT_ACLS) { error = ufs_do_posix1e_acl_inheritance_file(dvp, tvp, mode, cnp->cn_cred, cnp->cn_thread); if (error) goto bad; } else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) { error = ufs_do_nfs4_acl_inheritance(dvp, tvp, mode, cnp->cn_cred, cnp->cn_thread); if (error) goto bad; } #endif /* !UFS_ACL */ ufs_makedirentry(ip, cnp, &newdir); error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL, 0); if (error) goto bad; *vpp = tvp; return (0); bad: /* * Write error occurred trying to update the inode * or the directory so must deallocate the inode. */ ip->i_effnlink = 0; ip->i_nlink = 0; DIP_SET(ip, i_nlink, 0); ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(tvp)) softdep_revert_create(VTOI(dvp), ip); vput(tvp); return (error); } static int ufs_ioctl(struct vop_ioctl_args *ap) { struct vnode *vp; + int error; vp = ap->a_vp; switch (ap->a_command) { case FIOSEEKDATA: - return (ufs_bmap_seekdata(vp, (off_t *)ap->a_data)); + error = vn_lock(vp, LK_SHARED); + if (error == 0) { + error = ufs_bmap_seekdata(vp, (off_t *)ap->a_data); + VOP_UNLOCK(vp, 0); + } else + error = EBADF; + return (error); case FIOSEEKHOLE: return (vn_bmap_seekhole(vp, ap->a_command, (off_t *)ap->a_data, ap->a_cred)); default: return (ENOTTY); } } /* Global vfs data structures for ufs. */ struct vop_vector ufs_vnodeops = { .vop_default = &default_vnodeops, .vop_fsync = VOP_PANIC, .vop_read = VOP_PANIC, .vop_reallocblks = VOP_PANIC, .vop_write = VOP_PANIC, .vop_accessx = ufs_accessx, .vop_bmap = ufs_bmap, .vop_cachedlookup = ufs_lookup, .vop_close = ufs_close, .vop_create = ufs_create, .vop_getattr = ufs_getattr, .vop_inactive = ufs_inactive, .vop_ioctl = ufs_ioctl, .vop_link = ufs_link, .vop_lookup = vfs_cache_lookup, .vop_markatime = ufs_markatime, .vop_mkdir = ufs_mkdir, .vop_mknod = ufs_mknod, .vop_open = ufs_open, .vop_pathconf = ufs_pathconf, .vop_poll = vop_stdpoll, .vop_print = ufs_print, .vop_readdir = ufs_readdir, .vop_readlink = ufs_readlink, .vop_reclaim = ufs_reclaim, .vop_remove = ufs_remove, .vop_rename = ufs_rename, .vop_rmdir = ufs_rmdir, .vop_setattr = ufs_setattr, #ifdef MAC .vop_setlabel = vop_stdsetlabel_ea, #endif .vop_strategy = ufs_strategy, .vop_symlink = ufs_symlink, .vop_whiteout = ufs_whiteout, #ifdef UFS_EXTATTR .vop_getextattr = ufs_getextattr, .vop_deleteextattr = ufs_deleteextattr, .vop_setextattr = ufs_setextattr, #endif #ifdef UFS_ACL .vop_getacl = ufs_getacl, .vop_setacl = ufs_setacl, .vop_aclcheck = ufs_aclcheck, #endif }; struct vop_vector ufs_fifoops = { .vop_default = &fifo_specops, .vop_fsync = VOP_PANIC, .vop_accessx = ufs_accessx, .vop_close = ufsfifo_close, .vop_getattr = ufs_getattr, .vop_inactive = ufs_inactive, .vop_kqfilter = ufsfifo_kqfilter, .vop_markatime = ufs_markatime, .vop_pathconf = ufs_pathconf, .vop_print = ufs_print, .vop_read = VOP_PANIC, .vop_reclaim = ufs_reclaim, .vop_setattr = ufs_setattr, #ifdef MAC .vop_setlabel = vop_stdsetlabel_ea, #endif .vop_write = VOP_PANIC, #ifdef UFS_EXTATTR .vop_getextattr = ufs_getextattr, .vop_deleteextattr = ufs_deleteextattr, .vop_setextattr = ufs_setextattr, #endif #ifdef UFS_ACL .vop_getacl = ufs_getacl, .vop_setacl = ufs_setacl, .vop_aclcheck = ufs_aclcheck, #endif }; Index: projects/nfsv42/sys =================================================================== --- projects/nfsv42/sys (revision 350367) +++ projects/nfsv42/sys (revision 350368) Property changes on: projects/nfsv42/sys ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys:r350326-350367