Index: head/sys/conf/files.i386 =================================================================== --- head/sys/conf/files.i386 (revision 313153) +++ head/sys/conf/files.i386 (revision 313154) @@ -1,648 +1,648 @@ # This file tells config what files go into building a kernel, # files marked standard are always included. # # $FreeBSD$ # # The long compile-with and dependency lines are required because of # limitations in config: backslash-newline doesn't work in strings, and # dependency lines other than the first are silently ignored. # cloudabi32_vdso.o optional compat_cloudabi32 \ dependency "$S/contrib/cloudabi/cloudabi_vdso_i686.S" \ compile-with "${CC} -x assembler-with-cpp -shared -nostdinc -nostdlib -Wl,-T$S/compat/cloudabi/cloudabi_vdso.lds $S/contrib/cloudabi/cloudabi_vdso_i686.S -o ${.TARGET}" \ no-obj no-implicit-rule \ clean "cloudabi32_vdso.o" # cloudabi32_vdso_blob.o optional compat_cloudabi32 \ dependency "cloudabi32_vdso.o" \ compile-with "${OBJCOPY} --input-target binary --output-target elf32-i386-freebsd --binary-architecture i386 cloudabi32_vdso.o ${.TARGET}" \ no-implicit-rule \ clean "cloudabi32_vdso_blob.o" # linux_genassym.o optional compat_linux \ dependency "$S/i386/linux/linux_genassym.c" \ compile-with "${CC} ${CFLAGS:N-fno-common} -c ${.IMPSRC}" \ no-obj no-implicit-rule \ clean "linux_genassym.o" # linux_assym.h optional compat_linux \ dependency "$S/kern/genassym.sh linux_genassym.o" \ compile-with "sh $S/kern/genassym.sh linux_genassym.o > ${.TARGET}" \ no-obj no-implicit-rule before-depend \ clean "linux_assym.h" # linux_locore.o optional compat_linux \ dependency "linux_assym.h $S/i386/linux/linux_locore.s" \ compile-with "${CC} -x assembler-with-cpp -DLOCORE -shared -s -pipe -I. -I$S -Werror -Wall -fno-common -nostdinc -nostdlib -Wl,-T$S/i386/linux/linux_vdso.lds.s -Wl,-soname=linux_vdso.so,--eh-frame-hdr,-fPIC,-warn-common ${.IMPSRC} -o ${.TARGET}" \ no-obj no-implicit-rule \ clean "linux_locore.o" # linux_vdso.so optional compat_linux \ dependency "linux_locore.o" \ compile-with "${OBJCOPY} --input-target binary --output-target elf32-i386-freebsd --binary-architecture i386 linux_locore.o ${.TARGET}" \ no-implicit-rule \ clean "linux_vdso.so" # svr4_genassym.o optional compat_svr4 \ dependency "$S/i386/svr4/svr4_genassym.c" \ compile-with "${CC} ${CFLAGS:N-fno-common} -c ${.IMPSRC}" \ no-obj no-implicit-rule \ clean "svr4_genassym.o" # svr4_assym.h optional compat_svr4 \ dependency "$S/kern/genassym.sh svr4_genassym.o" \ compile-with "sh $S/kern/genassym.sh svr4_genassym.o > ${.TARGET}" \ no-obj no-implicit-rule before-depend \ clean "svr4_assym.h" # font.h optional sc_dflt_font \ compile-with "uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x16.fnt && file2c 'static u_char dflt_font_16[16*256] = {' '};' < ${SC_DFLT_FONT}-8x16 > font.h && uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x14.fnt && file2c 'static u_char dflt_font_14[14*256] = {' '};' < ${SC_DFLT_FONT}-8x14 >> font.h && uudecode < /usr/share/syscons/fonts/${SC_DFLT_FONT}-8x8.fnt && file2c 'static u_char dflt_font_8[8*256] = {' '};' < ${SC_DFLT_FONT}-8x8 >> font.h" \ no-obj no-implicit-rule before-depend \ clean "font.h ${SC_DFLT_FONT}-8x14 ${SC_DFLT_FONT}-8x16 ${SC_DFLT_FONT}-8x8" # atkbdmap.h optional atkbd_dflt_keymap \ compile-with "kbdcontrol -P ${S:S/sys$/share/}/vt/keymaps -P ${S:S/sys$/share/}/syscons/keymaps -L ${ATKBD_DFLT_KEYMAP} | sed -e 's/^static keymap_t.* = /static keymap_t key_map = /' -e 's/^static accentmap_t.* = /static accentmap_t accent_map = /' > atkbdmap.h" \ no-obj no-implicit-rule before-depend \ clean "atkbdmap.h" # ukbdmap.h optional ukbd_dflt_keymap \ compile-with "kbdcontrol -P ${S:S/sys$/share/}/vt/keymaps -P ${S:S/sys$/share/}/syscons/keymaps -L ${UKBD_DFLT_KEYMAP} | sed -e 's/^static keymap_t.* = /static keymap_t key_map = /' -e 's/^static accentmap_t.* = /static accentmap_t accent_map = /' > ukbdmap.h" \ no-obj no-implicit-rule before-depend \ clean "ukbdmap.h" # hpt27xx_lib.o optional hpt27xx \ dependency "$S/dev/hpt27xx/i386-elf.hpt27xx_lib.o.uu" \ compile-with "uudecode < $S/dev/hpt27xx/i386-elf.hpt27xx_lib.o.uu" \ no-implicit-rule # hptmvraid.o optional hptmv \ dependency "$S/dev/hptmv/i386-elf.raid.o.uu" \ compile-with "uudecode < $S/dev/hptmv/i386-elf.raid.o.uu" \ no-implicit-rule # hptnr_lib.o optional hptnr \ dependency "$S/dev/hptnr/i386-elf.hptnr_lib.o.uu" \ compile-with "uudecode < $S/dev/hptnr/i386-elf.hptnr_lib.o.uu" \ no-implicit-rule # hptrr_lib.o optional hptrr \ dependency "$S/dev/hptrr/i386-elf.hptrr_lib.o.uu" \ compile-with "uudecode < $S/dev/hptrr/i386-elf.hptrr_lib.o.uu" \ no-implicit-rule # cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S optional zfs | dtrace compile-with "${ZFS_S}" cddl/dev/dtrace/i386/dtrace_asm.S optional dtrace compile-with "${DTRACE_S}" cddl/dev/dtrace/i386/dtrace_subr.c optional dtrace compile-with "${DTRACE_C}" cddl/dev/fbt/x86/fbt_isa.c optional dtrace_fbt | dtraceall compile-with "${FBT_C}" cddl/dev/dtrace/x86/dis_tables.c optional dtrace_fbt | dtraceall compile-with "${DTRACE_C}" cddl/dev/dtrace/x86/instr_size.c optional dtrace_fbt | dtraceall compile-with "${DTRACE_C}" compat/linprocfs/linprocfs.c optional linprocfs compat/linsysfs/linsysfs.c optional linsysfs compat/linux/linux_event.c optional compat_linux compat/linux/linux_emul.c optional compat_linux compat/linux/linux_file.c optional compat_linux compat/linux/linux_fork.c optional compat_linux compat/linux/linux_futex.c optional compat_linux compat/linux/linux_getcwd.c optional compat_linux compat/linux/linux_ioctl.c optional compat_linux compat/linux/linux_ipc.c optional compat_linux compat/linux/linux_mib.c optional compat_linux compat/linux/linux_misc.c optional compat_linux compat/linux/linux_mmap.c optional compat_linux compat/linux/linux_signal.c optional compat_linux compat/linux/linux_socket.c optional compat_linux compat/linux/linux_stats.c optional compat_linux compat/linux/linux_sysctl.c optional compat_linux compat/linux/linux_time.c optional compat_linux compat/linux/linux_timer.c optional compat_linux compat/linux/linux_uid16.c optional compat_linux compat/linux/linux_util.c optional compat_linux compat/linux/linux_vdso.c optional compat_linux compat/linux/linux.c optional compat_linux compat/ndis/kern_ndis.c optional ndisapi pci compat/ndis/kern_windrv.c optional ndisapi pci compat/ndis/subr_hal.c optional ndisapi pci compat/ndis/subr_ndis.c optional ndisapi pci compat/ndis/subr_ntoskrnl.c optional ndisapi pci compat/ndis/subr_pe.c optional ndisapi pci compat/ndis/subr_usbd.c optional ndisapi pci compat/ndis/winx32_wrap.S optional ndisapi pci compat/svr4/imgact_svr4.c optional compat_svr4 compat/svr4/svr4_fcntl.c optional compat_svr4 compat/svr4/svr4_filio.c optional compat_svr4 compat/svr4/svr4_ioctl.c optional compat_svr4 compat/svr4/svr4_ipc.c optional compat_svr4 compat/svr4/svr4_misc.c optional compat_svr4 compat/svr4/svr4_resource.c optional compat_svr4 compat/svr4/svr4_signal.c optional compat_svr4 compat/svr4/svr4_socket.c optional compat_svr4 compat/svr4/svr4_sockio.c optional compat_svr4 compat/svr4/svr4_stat.c optional compat_svr4 compat/svr4/svr4_stream.c optional compat_svr4 compat/svr4/svr4_syscallnames.c optional compat_svr4 compat/svr4/svr4_sysent.c optional compat_svr4 compat/svr4/svr4_sysvec.c optional compat_svr4 compat/svr4/svr4_termios.c optional compat_svr4 bf_enc.o optional crypto | ipsec \ dependency "$S/crypto/blowfish/arch/i386/bf_enc.S $S/crypto/blowfish/arch/i386/bf_enc_586.S $S/crypto/blowfish/arch/i386/bf_enc_686.S" \ compile-with "${CC} -c -I$S/crypto/blowfish/arch/i386 ${ASM_CFLAGS} ${WERROR} ${.IMPSRC}" \ no-implicit-rule crypto/aesni/aeskeys_i386.S optional aesni crypto/aesni/aesni.c optional aesni aesni_ghash.o optional aesni \ dependency "$S/crypto/aesni/aesni_ghash.c" \ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${NO_WCAST_QUAL} ${PROF} -mmmx -msse -msse4 -maes -mpclmul ${.IMPSRC}" \ no-implicit-rule \ clean "aesni_ghash.o" aesni_wrap.o optional aesni \ dependency "$S/crypto/aesni/aesni_wrap.c" \ compile-with "${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc} ${WERROR} ${NO_WCAST_QUAL} ${PROF} -mmmx -msse -msse4 -maes ${.IMPSRC}" \ no-implicit-rule \ clean "aesni_wrap.o" crypto/des/arch/i386/des_enc.S optional crypto | ipsec | netsmb crypto/via/padlock.c optional padlock crypto/via/padlock_cipher.c optional padlock crypto/via/padlock_hash.c optional padlock dev/advansys/adv_isa.c optional adv isa dev/agp/agp_ali.c optional agp dev/agp/agp_amd.c optional agp dev/agp/agp_amd64.c optional agp dev/agp/agp_ati.c optional agp dev/agp/agp_i810.c optional agp dev/agp/agp_intel.c optional agp dev/agp/agp_nvidia.c optional agp dev/agp/agp_sis.c optional agp dev/agp/agp_via.c optional agp dev/aic/aic_isa.c optional aic isa dev/amdsbwd/amdsbwd.c optional amdsbwd dev/amdtemp/amdtemp.c optional amdtemp dev/arcmsr/arcmsr.c optional arcmsr pci dev/asmc/asmc.c optional asmc isa dev/atkbdc/atkbd.c optional atkbd atkbdc dev/atkbdc/atkbd_atkbdc.c optional atkbd atkbdc dev/atkbdc/atkbdc.c optional atkbdc dev/atkbdc/atkbdc_isa.c optional atkbdc isa dev/atkbdc/atkbdc_subr.c optional atkbdc dev/atkbdc/psm.c optional psm atkbdc dev/bxe/bxe.c optional bxe pci dev/bxe/bxe_stats.c optional bxe pci dev/bxe/bxe_debug.c optional bxe pci dev/bxe/ecore_sp.c optional bxe pci dev/bxe/bxe_elink.c optional bxe pci dev/bxe/57710_init_values.c optional bxe pci dev/bxe/57711_init_values.c optional bxe pci dev/bxe/57712_init_values.c optional bxe pci dev/ce/ceddk.c optional ce dev/ce/if_ce.c optional ce dev/ce/tau32-ddk.c optional ce \ compile-with "${NORMAL_C} ${NO_WCONSTANT_CONVERSION}" dev/cm/if_cm_isa.c optional cm isa dev/coretemp/coretemp.c optional coretemp dev/cp/cpddk.c optional cp dev/cp/if_cp.c optional cp dev/cpuctl/cpuctl.c optional cpuctl dev/ctau/ctau.c optional ctau dev/ctau/ctddk.c optional ctau dev/ctau/if_ct.c optional ctau dev/cx/csigma.c optional cx dev/cx/cxddk.c optional cx dev/cx/if_cx.c optional cx dev/dpms/dpms.c optional dpms dev/ed/if_ed_3c503.c optional ed isa ed_3c503 dev/ed/if_ed_isa.c optional ed isa dev/ed/if_ed_wd80x3.c optional ed isa dev/ed/if_ed_hpp.c optional ed isa ed_hpp dev/ed/if_ed_sic.c optional ed isa ed_sic dev/fb/fb.c optional fb | vga dev/fb/s3_pci.c optional s3pci dev/fb/vesa.c optional vga vesa dev/fb/vga.c optional vga dev/fdc/fdc.c optional fdc dev/fdc/fdc_acpi.c optional fdc dev/fdc/fdc_isa.c optional fdc isa dev/fdc/fdc_pccard.c optional fdc pccard dev/fe/if_fe_isa.c optional fe isa dev/glxiic/glxiic.c optional glxiic dev/glxsb/glxsb.c optional glxsb dev/glxsb/glxsb_hash.c optional glxsb dev/gpio/bytgpio.c optional bytgpio dev/hpt27xx/hpt27xx_os_bsd.c optional hpt27xx dev/hpt27xx/hpt27xx_osm_bsd.c optional hpt27xx dev/hpt27xx/hpt27xx_config.c optional hpt27xx dev/hptmv/entry.c optional hptmv dev/hptmv/mv.c optional hptmv dev/hptmv/gui_lib.c optional hptmv dev/hptmv/hptproc.c optional hptmv dev/hptmv/ioctl.c optional hptmv dev/hptnr/hptnr_os_bsd.c optional hptnr dev/hptnr/hptnr_osm_bsd.c optional hptnr dev/hptnr/hptnr_config.c optional hptnr dev/hptrr/hptrr_os_bsd.c optional hptrr dev/hptrr/hptrr_osm_bsd.c optional hptrr dev/hptrr/hptrr_config.c optional hptrr dev/hwpmc/hwpmc_amd.c optional hwpmc dev/hwpmc/hwpmc_intel.c optional hwpmc dev/hwpmc/hwpmc_core.c optional hwpmc dev/hwpmc/hwpmc_uncore.c optional hwpmc dev/hwpmc/hwpmc_pentium.c optional hwpmc dev/hwpmc/hwpmc_piv.c optional hwpmc dev/hwpmc/hwpmc_ppro.c optional hwpmc dev/hwpmc/hwpmc_tsc.c optional hwpmc dev/hwpmc/hwpmc_x86.c optional hwpmc dev/hyperv/pcib/vmbus_pcib.c optional hyperv pci dev/hyperv/netvsc/hn_nvs.c optional hyperv dev/hyperv/netvsc/hn_rndis.c optional hyperv dev/hyperv/netvsc/if_hn.c optional hyperv dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c optional hyperv dev/hyperv/utilities/hv_kvp.c optional hyperv dev/hyperv/utilities/hv_snapshot.c optional hyperv dev/hyperv/utilities/vmbus_heartbeat.c optional hyperv dev/hyperv/utilities/vmbus_ic.c optional hyperv dev/hyperv/utilities/vmbus_shutdown.c optional hyperv dev/hyperv/utilities/vmbus_timesync.c optional hyperv dev/hyperv/vmbus/hyperv.c optional hyperv dev/hyperv/vmbus/hyperv_busdma.c optional hyperv dev/hyperv/vmbus/vmbus.c optional hyperv pci dev/hyperv/vmbus/vmbus_br.c optional hyperv dev/hyperv/vmbus/vmbus_chan.c optional hyperv dev/hyperv/vmbus/vmbus_et.c optional hyperv dev/hyperv/vmbus/vmbus_if.m optional hyperv dev/hyperv/vmbus/vmbus_xact.c optional hyperv dev/hyperv/vmbus/i386/hyperv_machdep.c optional hyperv dev/hyperv/vmbus/i386/vmbus_vector.S optional hyperv dev/ichwd/ichwd.c optional ichwd dev/if_ndis/if_ndis.c optional ndis dev/if_ndis/if_ndis_pccard.c optional ndis pccard dev/if_ndis/if_ndis_pci.c optional ndis cardbus | ndis pci dev/if_ndis/if_ndis_usb.c optional ndis usb dev/intel/spi.c optional intelspi dev/io/iodev.c optional io dev/ipmi/ipmi.c optional ipmi dev/ipmi/ipmi_acpi.c optional ipmi acpi dev/ipmi/ipmi_isa.c optional ipmi isa dev/ipmi/ipmi_kcs.c optional ipmi dev/ipmi/ipmi_smic.c optional ipmi dev/ipmi/ipmi_smbus.c optional ipmi smbus dev/ipmi/ipmi_smbios.c optional ipmi dev/ipmi/ipmi_ssif.c optional ipmi smbus dev/ipmi/ipmi_pci.c optional ipmi pci dev/ipmi/ipmi_linux.c optional ipmi compat_linux dev/le/if_le_isa.c optional le isa dev/mse/mse.c optional mse dev/mse/mse_isa.c optional mse isa dev/nfe/if_nfe.c optional nfe pci dev/ntb/if_ntb/if_ntb.c optional if_ntb dev/ntb/ntb_transport.c optional if_ntb dev/ntb/ntb.c optional if_ntb | ntb_hw dev/ntb/ntb_if.m optional if_ntb | ntb_hw dev/ntb/ntb_hw/ntb_hw.c optional ntb_hw dev/nvd/nvd.c optional nvd nvme dev/nvme/nvme.c optional nvme dev/nvme/nvme_ctrlr.c optional nvme dev/nvme/nvme_ctrlr_cmd.c optional nvme dev/nvme/nvme_ns.c optional nvme dev/nvme/nvme_ns_cmd.c optional nvme dev/nvme/nvme_qpair.c optional nvme dev/nvme/nvme_sysctl.c optional nvme dev/nvme/nvme_test.c optional nvme dev/nvme/nvme_util.c optional nvme dev/nvram/nvram.c optional nvram isa dev/ofw/ofwpci.c optional fdt pci dev/pcf/pcf_isa.c optional pcf dev/random/ivy.c optional rdrand_rng dev/random/nehemiah.c optional padlock_rng dev/sbni/if_sbni.c optional sbni dev/sbni/if_sbni_isa.c optional sbni isa dev/sbni/if_sbni_pci.c optional sbni pci dev/sio/sio.c optional sio dev/sio/sio_isa.c optional sio isa dev/sio/sio_pccard.c optional sio pccard dev/sio/sio_pci.c optional sio pci dev/sio/sio_puc.c optional sio puc dev/speaker/spkr.c optional speaker dev/syscons/apm/apm_saver.c optional apm_saver apm dev/syscons/scterm-teken.c optional sc dev/syscons/scvesactl.c optional sc vga vesa dev/syscons/scvgarndr.c optional sc vga dev/syscons/scvtb.c optional sc dev/tpm/tpm.c optional tpm dev/tpm/tpm_acpi.c optional tpm acpi dev/tpm/tpm_isa.c optional tpm isa dev/uart/uart_cpu_x86.c optional uart dev/viawd/viawd.c optional viawd dev/vmware/vmxnet3/if_vmx.c optional vmx dev/acpica/acpi_if.m standard dev/acpica/acpi_hpet.c optional acpi dev/acpica/acpi_timer.c optional acpi dev/acpi_support/acpi_wmi_if.m standard dev/wbwd/wbwd.c optional wbwd dev/wpi/if_wpi.c optional wpi dev/isci/isci.c optional isci dev/isci/isci_controller.c optional isci dev/isci/isci_domain.c optional isci dev/isci/isci_interrupt.c optional isci dev/isci/isci_io_request.c optional isci dev/isci/isci_logger.c optional isci dev/isci/isci_oem_parameters.c optional isci dev/isci/isci_remote_device.c optional isci dev/isci/isci_sysctl.c optional isci dev/isci/isci_task_request.c optional isci dev/isci/isci_timer.c optional isci dev/isci/scil/sati.c optional isci dev/isci/scil/sati_abort_task_set.c optional isci dev/isci/scil/sati_atapi.c optional isci dev/isci/scil/sati_device.c optional isci dev/isci/scil/sati_inquiry.c optional isci dev/isci/scil/sati_log_sense.c optional isci dev/isci/scil/sati_lun_reset.c optional isci dev/isci/scil/sati_mode_pages.c optional isci dev/isci/scil/sati_mode_select.c optional isci dev/isci/scil/sati_mode_sense.c optional isci dev/isci/scil/sati_mode_sense_10.c optional isci dev/isci/scil/sati_mode_sense_6.c optional isci dev/isci/scil/sati_move.c optional isci dev/isci/scil/sati_passthrough.c optional isci dev/isci/scil/sati_read.c optional isci dev/isci/scil/sati_read_buffer.c optional isci dev/isci/scil/sati_read_capacity.c optional isci dev/isci/scil/sati_reassign_blocks.c optional isci dev/isci/scil/sati_report_luns.c optional isci dev/isci/scil/sati_request_sense.c optional isci dev/isci/scil/sati_start_stop_unit.c optional isci dev/isci/scil/sati_synchronize_cache.c optional isci dev/isci/scil/sati_test_unit_ready.c optional isci dev/isci/scil/sati_unmap.c optional isci dev/isci/scil/sati_util.c optional isci dev/isci/scil/sati_verify.c optional isci dev/isci/scil/sati_write.c optional isci dev/isci/scil/sati_write_and_verify.c optional isci dev/isci/scil/sati_write_buffer.c optional isci dev/isci/scil/sati_write_long.c optional isci dev/isci/scil/sci_abstract_list.c optional isci dev/isci/scil/sci_base_controller.c optional isci dev/isci/scil/sci_base_domain.c optional isci dev/isci/scil/sci_base_iterator.c optional isci dev/isci/scil/sci_base_library.c optional isci dev/isci/scil/sci_base_logger.c optional isci dev/isci/scil/sci_base_memory_descriptor_list.c optional isci dev/isci/scil/sci_base_memory_descriptor_list_decorator.c optional isci dev/isci/scil/sci_base_object.c optional isci dev/isci/scil/sci_base_observer.c optional isci dev/isci/scil/sci_base_phy.c optional isci dev/isci/scil/sci_base_port.c optional isci dev/isci/scil/sci_base_remote_device.c optional isci dev/isci/scil/sci_base_request.c optional isci dev/isci/scil/sci_base_state_machine.c optional isci dev/isci/scil/sci_base_state_machine_logger.c optional isci dev/isci/scil/sci_base_state_machine_observer.c optional isci dev/isci/scil/sci_base_subject.c optional isci dev/isci/scil/sci_util.c optional isci dev/isci/scil/scic_sds_controller.c optional isci dev/isci/scil/scic_sds_library.c optional isci dev/isci/scil/scic_sds_pci.c optional isci dev/isci/scil/scic_sds_phy.c optional isci dev/isci/scil/scic_sds_port.c optional isci dev/isci/scil/scic_sds_port_configuration_agent.c optional isci dev/isci/scil/scic_sds_remote_device.c optional isci dev/isci/scil/scic_sds_remote_node_context.c optional isci dev/isci/scil/scic_sds_remote_node_table.c optional isci dev/isci/scil/scic_sds_request.c optional isci dev/isci/scil/scic_sds_sgpio.c optional isci dev/isci/scil/scic_sds_smp_remote_device.c optional isci dev/isci/scil/scic_sds_smp_request.c optional isci dev/isci/scil/scic_sds_ssp_request.c optional isci dev/isci/scil/scic_sds_stp_packet_request.c optional isci dev/isci/scil/scic_sds_stp_remote_device.c optional isci dev/isci/scil/scic_sds_stp_request.c optional isci dev/isci/scil/scic_sds_unsolicited_frame_control.c optional isci dev/isci/scil/scif_sas_controller.c optional isci dev/isci/scil/scif_sas_controller_state_handlers.c optional isci dev/isci/scil/scif_sas_controller_states.c optional isci dev/isci/scil/scif_sas_domain.c optional isci dev/isci/scil/scif_sas_domain_state_handlers.c optional isci dev/isci/scil/scif_sas_domain_states.c optional isci dev/isci/scil/scif_sas_high_priority_request_queue.c optional isci dev/isci/scil/scif_sas_internal_io_request.c optional isci dev/isci/scil/scif_sas_io_request.c optional isci dev/isci/scil/scif_sas_io_request_state_handlers.c optional isci dev/isci/scil/scif_sas_io_request_states.c optional isci dev/isci/scil/scif_sas_library.c optional isci dev/isci/scil/scif_sas_remote_device.c optional isci dev/isci/scil/scif_sas_remote_device_ready_substate_handlers.c optional isci dev/isci/scil/scif_sas_remote_device_ready_substates.c optional isci dev/isci/scil/scif_sas_remote_device_starting_substate_handlers.c optional isci dev/isci/scil/scif_sas_remote_device_starting_substates.c optional isci dev/isci/scil/scif_sas_remote_device_state_handlers.c optional isci dev/isci/scil/scif_sas_remote_device_states.c optional isci dev/isci/scil/scif_sas_request.c optional isci dev/isci/scil/scif_sas_smp_activity_clear_affiliation.c optional isci dev/isci/scil/scif_sas_smp_io_request.c optional isci dev/isci/scil/scif_sas_smp_phy.c optional isci dev/isci/scil/scif_sas_smp_remote_device.c optional isci dev/isci/scil/scif_sas_stp_io_request.c optional isci dev/isci/scil/scif_sas_stp_remote_device.c optional isci dev/isci/scil/scif_sas_stp_task_request.c optional isci dev/isci/scil/scif_sas_task_request.c optional isci dev/isci/scil/scif_sas_task_request_state_handlers.c optional isci dev/isci/scil/scif_sas_task_request_states.c optional isci dev/isci/scil/scif_sas_timer.c optional isci i386/acpica/acpi_machdep.c optional acpi acpi_wakecode.o optional acpi \ dependency "$S/i386/acpica/acpi_wakecode.S assym.s" \ compile-with "${NORMAL_S}" \ no-obj no-implicit-rule before-depend \ clean "acpi_wakecode.o" acpi_wakecode.bin optional acpi \ dependency "acpi_wakecode.o" \ compile-with "${OBJCOPY} -S -O binary acpi_wakecode.o ${.TARGET}" \ no-obj no-implicit-rule before-depend \ clean "acpi_wakecode.bin" acpi_wakecode.h optional acpi \ dependency "acpi_wakecode.bin" \ compile-with "file2c -sx 'static char wakecode[] = {' '};' < acpi_wakecode.bin > ${.TARGET}" \ no-obj no-implicit-rule before-depend \ clean "acpi_wakecode.h" acpi_wakedata.h optional acpi \ dependency "acpi_wakecode.o" \ compile-with '${NM} -n --defined-only acpi_wakecode.o | while read offset dummy what; do echo "#define $${what} 0x$${offset}"; done > ${.TARGET}' \ no-obj no-implicit-rule before-depend \ clean "acpi_wakedata.h" # i386/bios/apm.c optional apm i386/bios/mca_machdep.c optional mca i386/bios/smapi.c optional smapi i386/bios/smapi_bios.S optional smapi i386/cloudabi32/cloudabi32_sysvec.c optional compat_cloudabi32 #i386/i386/apic_vector.s optional apic i386/i386/atomic.c standard \ compile-with "${CC} -c ${CFLAGS} ${DEFINED_PROF:S/^$/-fomit-frame-pointer/} ${.IMPSRC}" i386/i386/bios.c standard i386/i386/bioscall.s standard i386/i386/bpf_jit_machdep.c optional bpf_jitter i386/i386/db_disasm.c optional ddb i386/i386/db_interface.c optional ddb i386/i386/db_trace.c optional ddb i386/i386/elan-mmcr.c optional cpu_elan | cpu_soekris i386/i386/elf_machdep.c standard i386/i386/exception.s standard i386/i386/gdb_machdep.c optional gdb i386/i386/geode.c optional cpu_geode i386/i386/i686_mem.c optional mem i386/i386/in_cksum.c optional inet | inet6 i386/i386/initcpu.c standard i386/i386/io.c optional io i386/i386/k6_mem.c optional mem i386/i386/locore.s standard no-obj i386/i386/longrun.c optional cpu_enable_longrun i386/i386/machdep.c standard i386/i386/mem.c optional mem i386/i386/minidump_machdep.c standard i386/i386/mp_clock.c optional smp i386/i386/mp_machdep.c optional smp i386/i386/mpboot.s optional smp i386/i386/perfmon.c optional perfmon i386/i386/pmap.c standard i386/i386/ptrace_machdep.c standard i386/i386/support.s standard i386/i386/swtch.s standard i386/i386/sys_machdep.c standard i386/i386/trap.c standard i386/i386/uio_machdep.c standard i386/i386/vm86.c standard i386/i386/vm_machdep.c standard i386/ibcs2/ibcs2_errno.c optional ibcs2 i386/ibcs2/ibcs2_fcntl.c optional ibcs2 i386/ibcs2/ibcs2_ioctl.c optional ibcs2 i386/ibcs2/ibcs2_ipc.c optional ibcs2 i386/ibcs2/ibcs2_isc.c optional ibcs2 i386/ibcs2/ibcs2_isc_sysent.c optional ibcs2 i386/ibcs2/ibcs2_misc.c optional ibcs2 i386/ibcs2/ibcs2_msg.c optional ibcs2 i386/ibcs2/ibcs2_other.c optional ibcs2 i386/ibcs2/ibcs2_signal.c optional ibcs2 i386/ibcs2/ibcs2_socksys.c optional ibcs2 i386/ibcs2/ibcs2_stat.c optional ibcs2 i386/ibcs2/ibcs2_sysent.c optional ibcs2 i386/ibcs2/ibcs2_sysi86.c optional ibcs2 i386/ibcs2/ibcs2_sysvec.c optional ibcs2 i386/ibcs2/ibcs2_util.c optional ibcs2 i386/ibcs2/ibcs2_xenix.c optional ibcs2 i386/ibcs2/ibcs2_xenix_sysent.c optional ibcs2 i386/ibcs2/imgact_coff.c optional ibcs2 i386/isa/elink.c optional ep | ie -i386/isa/npx.c optional npx +i386/isa/npx.c standard i386/isa/pmtimer.c optional pmtimer i386/isa/prof_machdep.c optional profiling-routine i386/linux/imgact_linux.c optional compat_linux i386/linux/linux_dummy.c optional compat_linux i386/linux/linux_machdep.c optional compat_linux i386/linux/linux_ptrace.c optional compat_linux i386/linux/linux_support.s optional compat_linux \ dependency "linux_assym.h" i386/linux/linux_sysent.c optional compat_linux i386/linux/linux_sysvec.c optional compat_linux i386/pci/pci_cfgreg.c optional pci i386/pci/pci_pir.c optional pci i386/svr4/svr4_locore.s optional compat_svr4 \ dependency "svr4_assym.h" \ warning "COMPAT_SVR4 is broken and should be avoided" i386/svr4/svr4_machdep.c optional compat_svr4 # isa/syscons_isa.c optional sc isa/vga_isa.c optional vga kern/kern_clocksource.c standard kern/imgact_aout.c optional compat_aout kern/imgact_gzip.c optional gzip kern/subr_sfbuf.c standard crc32_sse42.o standard \ dependency "$S/libkern/x86/crc32_sse42.c" \ compile-with "${CC} -c ${CFLAGS:N-nostdinc} ${WERROR} ${PROF} -msse4 ${.IMPSRC}" \ no-implicit-rule \ clean "crc32_sse42.o" libkern/divdi3.c standard libkern/ffsll.c standard libkern/flsll.c standard libkern/memmove.c standard libkern/memset.c standard libkern/moddi3.c standard libkern/qdivrem.c standard libkern/ucmpdi2.c standard libkern/udivdi3.c standard libkern/umoddi3.c standard i386/xbox/xbox.c optional xbox i386/xbox/xboxfb.c optional xboxfb dev/fb/boot_font.c optional xboxfb i386/xbox/pic16l.s optional xbox # # x86 real mode BIOS support, required by dpms/pci/vesa # compat/x86bios/x86bios.c optional x86bios | dpms | pci | vesa # # bvm console # dev/bvm/bvm_console.c optional bvmconsole dev/bvm/bvm_dbg.c optional bvmdebug # # x86 shared code between IA32 and AMD64 architectures # x86/acpica/OsdEnvironment.c optional acpi x86/acpica/acpi_apm.c optional acpi x86/acpica/acpi_wakeup.c optional acpi x86/acpica/madt.c optional acpi apic x86/acpica/srat.c optional acpi x86/bios/smbios.c optional smbios x86/bios/vpd.c optional vpd x86/cpufreq/est.c optional cpufreq x86/cpufreq/hwpstate.c optional cpufreq x86/cpufreq/p4tcc.c optional cpufreq x86/cpufreq/powernow.c optional cpufreq x86/cpufreq/smist.c optional cpufreq x86/iommu/busdma_dmar.c optional acpi acpi_dmar pci x86/iommu/intel_ctx.c optional acpi acpi_dmar pci x86/iommu/intel_drv.c optional acpi acpi_dmar pci x86/iommu/intel_fault.c optional acpi acpi_dmar pci x86/iommu/intel_gas.c optional acpi acpi_dmar pci x86/iommu/intel_idpgtbl.c optional acpi acpi_dmar pci x86/iommu/intel_intrmap.c optional acpi acpi_dmar pci x86/iommu/intel_qi.c optional acpi acpi_dmar pci x86/iommu/intel_quirks.c optional acpi acpi_dmar pci x86/iommu/intel_utils.c optional acpi acpi_dmar pci x86/isa/atpic.c optional atpic x86/isa/atrtc.c standard x86/isa/clock.c standard x86/isa/elcr.c optional atpic | apic x86/isa/isa.c optional isa x86/isa/isa_dma.c optional isa x86/isa/nmi.c standard x86/isa/orm.c optional isa x86/pci/pci_bus.c optional pci x86/pci/qpi.c optional pci x86/x86/autoconf.c standard x86/x86/bus_machdep.c standard x86/x86/busdma_bounce.c standard x86/x86/busdma_machdep.c standard x86/x86/cpu_machdep.c standard x86/x86/dump_machdep.c standard x86/x86/fdt_machdep.c optional fdt x86/x86/identcpu.c standard x86/x86/intr_machdep.c standard x86/x86/io_apic.c optional apic x86/x86/legacy.c standard x86/x86/local_apic.c optional apic x86/x86/mca.c standard x86/x86/mptable.c optional apic x86/x86/mptable_pci.c optional apic pci x86/x86/mp_x86.c optional smp x86/x86/mp_watchdog.c optional mp_watchdog smp x86/x86/msi.c optional apic pci x86/x86/nexus.c standard x86/x86/stack_machdep.c optional ddb | stack x86/x86/tsc.c standard x86/x86/pvclock.c standard x86/x86/delay.c standard x86/xen/hvm.c optional xenhvm x86/xen/xen_intr.c optional xenhvm x86/xen/xen_apic.c optional xenhvm x86/xen/xenpv.c optional xenhvm x86/xen/xen_nexus.c optional xenhvm x86/xen/xen_msi.c optional xenhvm Index: head/sys/conf/options.i386 =================================================================== --- head/sys/conf/options.i386 (revision 313153) +++ head/sys/conf/options.i386 (revision 313154) @@ -1,127 +1,124 @@ # $FreeBSD$ # Options specific to the i386 platform kernels AUTO_EOI_1 opt_auto_eoi.h AUTO_EOI_2 opt_auto_eoi.h BROKEN_KEYBOARD_RESET opt_reset.h COUNT_XINVLTLB_HITS opt_smp.h COUNT_IPIS opt_smp.h DISABLE_PG_G opt_pmap.h DISABLE_PSE opt_pmap.h I586_PMC_GUPROF opt_i586_guprof.h MAXMEM MPTABLE_FORCE_HTT MP_WATCHDOG NKPT opt_pmap.h PERFMON PMAP_SHPGPERPROC opt_pmap.h POWERFAIL_NMI opt_trap.h PV_STATS opt_pmap.h # Options for emulators. These should only be used at config time, so # they are handled like options for static filesystems # (see src/sys/conf/options), except for broken debugging options. COMPAT_AOUT opt_dontuse.h IBCS2 opt_dontuse.h COMPAT_LINUX opt_dontuse.h COMPAT_SVR4 opt_dontuse.h DEBUG_SVR4 opt_svr4.h LINPROCFS opt_dontuse.h LINSYSFS opt_dontuse.h NDISAPI opt_dontuse.h # Change KVM size. Changes things all over the kernel. KVA_PAGES opt_global.h # Physical address extensions and support for >4G ram. As above. PAE opt_global.h # Use PAE page tables, but limit memory support to 4GB. # This keeps the i386 non-PAE KBI, in particular, drivers see # 32bit vm_paddr_t. PAE_TABLES opt_global.h TIMER_FREQ opt_clock.h CPU_ATHLON_SSE_HACK opt_cpu.h CPU_BLUELIGHTNING_3X opt_cpu.h CPU_BLUELIGHTNING_FPU_OP_CACHE opt_cpu.h CPU_BTB_EN opt_cpu.h CPU_CYRIX_NO_LOCK opt_cpu.h CPU_DIRECT_MAPPED_CACHE opt_cpu.h CPU_DISABLE_5X86_LSSER opt_cpu.h -CPU_DISABLE_CMPXCHG opt_global.h # XXX global, unlike other CPU_* -CPU_DISABLE_SSE opt_cpu.h CPU_ELAN opt_cpu.h CPU_ELAN_PPS opt_cpu.h CPU_ELAN_XTAL opt_cpu.h CPU_ENABLE_LONGRUN opt_cpu.h CPU_FASTER_5X86_FPU opt_cpu.h CPU_GEODE opt_cpu.h CPU_I486_ON_386 opt_cpu.h CPU_IORT opt_cpu.h CPU_L2_LATENCY opt_cpu.h CPU_LOOP_EN opt_cpu.h CPU_PPRO2CELERON opt_cpu.h CPU_RSTK_EN opt_cpu.h CPU_SOEKRIS opt_cpu.h CPU_SUSP_HLT opt_cpu.h CPU_UPGRADE_HW_CACHE opt_cpu.h CPU_WT_ALLOC opt_cpu.h CYRIX_CACHE_REALLY_WORKS opt_cpu.h CYRIX_CACHE_WORKS opt_cpu.h NO_F00F_HACK opt_cpu.h NO_MEMORY_HOLE opt_cpu.h # The CPU type affects the endian conversion functions all over the kernel. I486_CPU opt_global.h I586_CPU opt_global.h I686_CPU opt_global.h # options for serial support COM_ESP opt_sio.h COM_MULTIPORT opt_sio.h CONSPEED opt_sio.h GDBSPEED opt_sio.h COM_NO_ACPI opt_sio.h VGA_ALT_SEQACCESS opt_vga.h VGA_DEBUG opt_vga.h VGA_NO_FONT_LOADING opt_vga.h VGA_NO_MODE_CHANGE opt_vga.h VGA_SLOW_IOACCESS opt_vga.h VGA_WIDTH90 opt_vga.h VESA VESA_DEBUG opt_vesa.h # AGP debugging support AGP_DEBUG opt_agp.h PSM_DEBUG opt_psm.h PSM_HOOKRESUME opt_psm.h PSM_RESETAFTERSUSPEND opt_psm.h ATKBD_DFLT_KEYMAP opt_atkbd.h # Video spigot SPIGOT_UNSECURE opt_spigot.h # Enables NETGRAPH support for Cronyx adapters NETGRAPH_CRONYX opt_ng_cronyx.h # Device options DEV_APIC opt_apic.h DEV_ATPIC opt_atpic.h -DEV_NPX opt_npx.h # Debugging NPX_DEBUG opt_npx.h # BPF just-in-time compiler BPF_JITTER opt_bpf.h XENHVM opt_global.h # options for the Intel C600 SAS driver (isci) ISCI_LOGGING opt_isci.h Index: head/sys/i386/conf/DEFAULTS =================================================================== --- head/sys/i386/conf/DEFAULTS (revision 313153) +++ head/sys/i386/conf/DEFAULTS (revision 313154) @@ -1,31 +1,28 @@ # # DEFAULTS -- Default kernel configuration file for FreeBSD/i386 # # $FreeBSD$ machine i386 # Bus support. device isa options ISAPNP -# Floating point support. -device npx - # Pseudo devices. device mem # Memory and kernel memory devices device io # I/O device # UART chips on this platform device uart_ns8250 # Default partitioning schemes options GEOM_PART_BSD options GEOM_PART_EBR options GEOM_PART_EBR_COMPAT options GEOM_PART_MBR # enable support for native hardware device atpic options NEW_PCIB Index: head/sys/i386/conf/NOTES =================================================================== --- head/sys/i386/conf/NOTES (revision 313153) +++ head/sys/i386/conf/NOTES (revision 313154) @@ -1,1068 +1,1056 @@ # # NOTES -- Lines that can be cut/pasted into kernel and hints configs. # # This file contains machine dependent kernel configuration notes. For # machine independent notes, look in /sys/conf/NOTES. # # $FreeBSD$ # # # We want LINT to cover profiling as well. profile 2 # # Enable the kernel DTrace hooks which are required to load the DTrace # kernel modules. # options KDTRACE_HOOKS # DTrace core # NOTE: introduces CDDL-licensed components into the kernel #device dtrace # DTrace modules #device dtrace_profile #device dtrace_sdt #device dtrace_fbt #device dtrace_systrace #device dtrace_prototype #device dtnfscl #device dtmalloc # Alternatively include all the DTrace modules #device dtraceall ##################################################################### # SMP OPTIONS: # # The apic device enables the use of the I/O APIC for interrupt delivery. # The apic device can be used in both UP and SMP kernels, but is required # for SMP kernels. Thus, the apic device is not strictly an SMP option, # but it is a prerequisite for SMP. # # Notes: # # HTT CPUs should only be used if they are enabled in the BIOS. For # the ACPI case, ACPI only correctly tells us about any HTT CPUs if # they are enabled. However, most HTT systems do not list HTT CPUs # in the MP Table if they are enabled, thus we guess at the HTT CPUs # for the MP Table case. However, we shouldn't try to guess and use # these CPUs if HTT is disabled. Thus, HTT guessing is only enabled # for the MP Table if the user explicitly asks for it via the # MPTABLE_FORCE_HTT option. Do NOT use this option if you have HTT # disabled in your BIOS. # # IPI_PREEMPTION instructs the kernel to preempt threads running on other # CPUS if needed. Relies on the PREEMPTION option # Mandatory: device apic # I/O apic # Optional: options MPTABLE_FORCE_HTT # Enable HTT CPUs with the MP Table options IPI_PREEMPTION # # Watchdog routines. # options MP_WATCHDOG # Debugging options. # options COUNT_XINVLTLB_HITS # Counters for TLB events options COUNT_IPIS # Per-CPU IPI interrupt counters ##################################################################### # CPU OPTIONS # # You must specify at least one CPU (the one you intend to run on); # deleting the specification for CPUs you don't need to use may make # parts of the system run faster. # cpu I486_CPU cpu I586_CPU # aka Pentium(tm) cpu I686_CPU # aka Pentium Pro(tm) # # Options for CPU features. # # CPU_ATHLON_SSE_HACK tries to enable SSE instructions when the BIOS has # forgotten to enable them. # # CPU_BLUELIGHTNING_3X enables triple-clock mode on IBM Blue Lightning # CPU if CPU supports it. The default is double-clock mode on # BlueLightning CPU box. # # CPU_BLUELIGHTNING_FPU_OP_CACHE enables FPU operand cache on IBM # BlueLightning CPU. It works only with Cyrix FPU, and this option # should not be used with Intel FPU. # # CPU_BTB_EN enables branch target buffer on Cyrix 5x86 (NOTE 1). # # CPU_CYRIX_NO_LOCK enables weak locking for the entire address space # of Cyrix 6x86 and 6x86MX CPUs by setting the NO_LOCK bit of CCR1. # Otherwise, the NO_LOCK bit of CCR1 is cleared. (NOTE 3) # # CPU_DIRECT_MAPPED_CACHE sets L1 cache of Cyrix 486DLC CPU in direct # mapped mode. Default is 2-way set associative mode. # # CPU_DISABLE_5X86_LSSER disables load store serialize (i.e., enables # reorder). This option should not be used if you use memory mapped # I/O device(s). # -# CPU_DISABLE_CMPXCHG disables the CMPXCHG instruction on > i386 IA32 -# machines. VmWare 3.x seems to emulate this instruction poorly, causing -# the guest OS to run very slowly. This problem appears to be fixed in -# VmWare 4.x, at least in version 4.5.2, so that enabling this option with -# VmWare 4.x will result in locking operations to be 20-30 times slower. -# Enabling this with an SMP kernel will cause the kernel to be unusable. -# -# CPU_DISABLE_SSE explicitly prevents I686_CPU from turning on SSE. -# # CPU_ELAN enables support for AMDs ElanSC520 CPU. # CPU_ELAN_PPS enables precision timestamp code. # CPU_ELAN_XTAL sets the clock crystal frequency in Hz. # # CPU_ENABLE_LONGRUN enables support for Transmeta Crusoe LongRun # technology which allows to restrict power consumption of the CPU by # using group of hw.crusoe.* sysctls. # # CPU_FASTER_5X86_FPU enables faster FPU exception handler. # # CPU_GEODE is for the SC1100 Geode embedded processor. This option # is necessary because the i8254 timecounter is toast. # # CPU_I486_ON_386 enables CPU cache on i486 based CPU upgrade products # for i386 machines. # # CPU_IORT defines I/O clock delay time (NOTE 1). Default values of # I/O clock delay time on Cyrix 5x86 and 6x86 are 0 and 7,respectively # (no clock delay). # # CPU_L2_LATENCY specifies the L2 cache latency value. This option is used # only when CPU_PPRO2CELERON is defined and Mendocino Celeron is detected. # The default value is 5. # # CPU_LOOP_EN prevents flushing the prefetch buffer if the destination # of a jump is already present in the prefetch buffer on Cyrix 5x86(NOTE # 1). # # CPU_PPRO2CELERON enables L2 cache of Mendocino Celeron CPUs. This option # is useful when you use Socket 8 to Socket 370 converter, because most Pentium # Pro BIOSs do not enable L2 cache of Mendocino Celeron CPUs. # # CPU_RSTK_EN enables return stack on Cyrix 5x86 (NOTE 1). # # CPU_SOEKRIS enables support www.soekris.com hardware. # # CPU_SUSP_HLT enables suspend on HALT. If this option is set, CPU # enters suspend mode following execution of HALT instruction. # # CPU_UPGRADE_HW_CACHE eliminates unneeded cache flush instruction(s). # # CPU_WT_ALLOC enables write allocation on Cyrix 6x86/6x86MX and AMD # K5/K6/K6-2 CPUs. # # CYRIX_CACHE_WORKS enables CPU cache on Cyrix 486 CPUs with cache # flush at hold state. # # CYRIX_CACHE_REALLY_WORKS enables (1) CPU cache on Cyrix 486 CPUs # without cache flush at hold state, and (2) write-back CPU cache on # Cyrix 6x86 whose revision < 2.7 (NOTE 2). # # NO_F00F_HACK disables the hack that prevents Pentiums (and ONLY # Pentiums) from locking up when a LOCK CMPXCHG8B instruction is # executed. This option is only needed if I586_CPU is also defined, # and should be included for any non-Pentium CPU that defines it. # # NO_MEMORY_HOLE is an optimisation for systems with AMD K6 processors # which indicates that the 15-16MB range is *definitely* not being # occupied by an ISA memory hole. # # NOTE 1: The options, CPU_BTB_EN, CPU_LOOP_EN, CPU_IORT, # CPU_LOOP_EN and CPU_RSTK_EN should not be used because of CPU bugs. # These options may crash your system. # # NOTE 2: If CYRIX_CACHE_REALLY_WORKS is not set, CPU cache is enabled # in write-through mode when revision < 2.7. If revision of Cyrix # 6x86 >= 2.7, CPU cache is always enabled in write-back mode. # # NOTE 3: This option may cause failures for software that requires # locked cycles in order to operate correctly. # options CPU_ATHLON_SSE_HACK options CPU_BLUELIGHTNING_3X options CPU_BLUELIGHTNING_FPU_OP_CACHE options CPU_BTB_EN options CPU_DIRECT_MAPPED_CACHE options CPU_DISABLE_5X86_LSSER -options CPU_DISABLE_CMPXCHG -#options CPU_DISABLE_SSE options CPU_ELAN options CPU_ELAN_PPS options CPU_ELAN_XTAL=32768000 options CPU_ENABLE_LONGRUN options CPU_FASTER_5X86_FPU options CPU_GEODE options CPU_I486_ON_386 options CPU_IORT options CPU_L2_LATENCY=5 options CPU_LOOP_EN options CPU_PPRO2CELERON options CPU_RSTK_EN options CPU_SOEKRIS options CPU_SUSP_HLT options CPU_UPGRADE_HW_CACHE options CPU_WT_ALLOC options CYRIX_CACHE_WORKS options CYRIX_CACHE_REALLY_WORKS #options NO_F00F_HACK # Debug options options NPX_DEBUG # enable npx debugging # # PERFMON causes the driver for Pentium/Pentium Pro performance counters # to be compiled. See perfmon(4) for more information. # options PERFMON # # XBOX causes the kernel to be bootable on the Microsoft XBox console system. # The resulting kernel will auto-detect whether it is being booted on a XBox, # so kernels compiled with this option will also work on an ordinary PC. # This option require I686_CPU. # # xboxfb includes support for the XBox frame buffer device. It is fully USB- # keyboard aware, and will only be used if an xbox is detected. This option # (obviously) requires XBOX support in your kernel. # # NOTE: xboxfb currently conflicts with syscons(4); if you have an XBOX and # include both in your kernel; you will not get any video output. Ordinary # PC's do not suffer from this. # options XBOX device xboxfb ##################################################################### # NETWORKING OPTIONS # # DEVICE_POLLING adds support for mixed interrupt-polling handling # of network device drivers, which has significant benefits in terms # of robustness to overloads and responsivity, as well as permitting # accurate scheduling of the CPU time between kernel network processing # and other activities. The drawback is a moderate (up to 1/HZ seconds) # potential increase in response times. # It is strongly recommended to use HZ=1000 or 2000 with DEVICE_POLLING # to achieve smoother behaviour. # Additionally, you can enable/disable polling at runtime with help of # the ifconfig(8) utility, and select the CPU fraction reserved to # userland with the sysctl variable kern.polling.user_frac # (default 50, range 0..100). # # Not all device drivers support this mode of operation at the time of # this writing. See polling(4) for more details. options DEVICE_POLLING # BPF_JITTER adds support for BPF just-in-time compiler. options BPF_JITTER # OpenFabrics Enterprise Distribution (Infiniband). options OFED options OFED_DEBUG_INIT # Sockets Direct Protocol options SDP options SDP_DEBUG # IP over Infiniband options IPOIB options IPOIB_DEBUG options IPOIB_CM ##################################################################### # CLOCK OPTIONS # Provide read/write access to the memory in the clock chip. device nvram # Access to rtc cmos via /dev/nvram ##################################################################### # MISCELLANEOUS DEVICES AND OPTIONS device speaker #Play IBM BASIC-style noises out your speaker hint.speaker.0.at="isa" hint.speaker.0.port="0x61" device gzip #Exec gzipped a.out's. REQUIRES COMPAT_AOUT! device apm_saver # Requires APM ##################################################################### # HARDWARE BUS CONFIGURATION # # ISA bus # -device isa # Required by npx(4) +device isa # # Options for `isa': # # AUTO_EOI_1 enables the `automatic EOI' feature for the master 8259A # interrupt controller. This saves about 0.7-1.25 usec for each interrupt. # This option breaks suspend/resume on some portables. # # AUTO_EOI_2 enables the `automatic EOI' feature for the slave 8259A # interrupt controller. This saves about 0.7-1.25 usec for each interrupt. # Automatic EOI is documented not to work for for the slave with the # original i8259A, but it works for some clones and some integrated # versions. # # MAXMEM specifies the amount of RAM on the machine; if this is not # specified, FreeBSD will first read the amount of memory from the CMOS # RAM, so the amount of memory will initially be limited to 64MB or 16MB # depending on the BIOS. If the BIOS reports 64MB, a memory probe will # then attempt to detect the installed amount of RAM. If this probe # fails to detect >64MB RAM you will have to use the MAXMEM option. # The amount is in kilobytes, so for a machine with 128MB of RAM, it would # be 131072 (128 * 1024). # # BROKEN_KEYBOARD_RESET disables the use of the keyboard controller to # reset the CPU for reboot. This is needed on some systems with broken # keyboard controllers. options AUTO_EOI_1 #options AUTO_EOI_2 options MAXMEM=(128*1024) #options BROKEN_KEYBOARD_RESET # # EISA bus # # The EISA bus device is `eisa'. It provides auto-detection and # configuration support for all devices on the EISA bus. device eisa # By default, only 10 EISA slots are probed, since the slot numbers # above clash with the configuration address space of the PCI subsystem, # and the EISA probe is not very smart about this. This is sufficient # for most machines, but in particular the HP NetServer LC series comes # with an onboard AIC7770 dual-channel SCSI controller on EISA slot #11, # thus you need to bump this figure to 12 for them. options EISA_SLOTS=12 # # MCA bus: # # The MCA bus device is `mca'. It provides auto-detection and # configuration support for all devices on the MCA bus. # No hints are required for MCA. device mca # # AGP GART support device agp # AGP debugging. options AGP_DEBUG ##################################################################### # HARDWARE DEVICE CONFIGURATION # To include support for VGA VESA video modes options VESA # Turn on extra debugging checks and output for VESA support. options VESA_DEBUG device dpms # DPMS suspend & resume via VESA BIOS # x86 real mode BIOS emulator, required by atkbdc/dpms/vesa options X86BIOS # -# The Numeric Processing eXtension driver. This is non-optional. -device npx +# Hints for the non-optional Numeric Processing eXtension driver. hint.npx.0.flags="0x0" hint.npx.0.irq="13" # # `flags' for npx0: # 0x01 don't use the npx registers to optimize bcopy. # 0x02 don't use the npx registers to optimize bzero. # 0x04 don't use the npx registers to optimize copyin or copyout. # The npx registers are normally used to optimize copying and zeroing when # all of the following conditions are satisfied: # I586_CPU is an option # the cpu is an i586 (perhaps not a Pentium) # the probe for npx0 succeeds # INT 16 exception handling works. # Then copying and zeroing using the npx registers is normally 30-100% faster. # The flags can be used to control cases where it doesn't work or is slower. # Setting them at boot time using hints works right (the optimizations # are not used until later in the bootstrap when npx0 is attached). # Flag 0x08 automatically disables the i586 optimized routines. # # # Optional devices: # # PS/2 mouse device psm hint.psm.0.at="atkbdc" hint.psm.0.irq="12" # Options for psm: options PSM_HOOKRESUME #hook the system resume event, useful #for some laptops options PSM_RESETAFTERSUSPEND #reset the device at the resume event # The keyboard controller; it controls the keyboard and the PS/2 mouse. device atkbdc hint.atkbdc.0.at="isa" hint.atkbdc.0.port="0x060" # The AT keyboard device atkbd hint.atkbd.0.at="atkbdc" hint.atkbd.0.irq="1" # Options for atkbd: options ATKBD_DFLT_KEYMAP # specify the built-in keymap makeoptions ATKBD_DFLT_KEYMAP=fr.dvorak # `flags' for atkbd: # 0x01 Force detection of keyboard, else we always assume a keyboard # 0x02 Don't reset keyboard, useful for some newer ThinkPads # 0x03 Force detection and avoid reset, might help with certain # dockingstations # 0x04 Old-style (XT) keyboard support, useful for older ThinkPads # Video card driver for VGA adapters. device vga hint.vga.0.at="isa" # Options for vga: # Try the following option if the mouse pointer is not drawn correctly # or font does not seem to be loaded properly. May cause flicker on # some systems. options VGA_ALT_SEQACCESS # If you can dispense with some vga driver features, you may want to # use the following options to save some memory. #options VGA_NO_FONT_LOADING # don't save/load font #options VGA_NO_MODE_CHANGE # don't change video modes # Older video cards may require this option for proper operation. options VGA_SLOW_IOACCESS # do byte-wide i/o's to TS and GDC regs # The following option probably won't work with the LCD displays. options VGA_WIDTH90 # support 90 column modes # Debugging. options VGA_DEBUG # vt(4) drivers. device vt_vga # Linear framebuffer driver for S3 VESA 1.2 cards. Works on top of VESA. device s3pci # 3Dfx Voodoo Graphics, Voodoo II /dev/3dfx CDEV support. This will create # the /dev/3dfx0 device to work with glide implementations. This should get # linked to /dev/3dfx and /dev/voodoo. Note that this is not the same as # the tdfx DRI module from XFree86 and is completely unrelated. # # To enable Linuxulator support, one must also include COMPAT_LINUX in the # config as well. The other option is to load both as modules. device tdfx # Enable 3Dfx Voodoo support device tdfx_linux # Enable Linuxulator support # # ACPI support using the Intel ACPI Component Architecture reference # implementation. # # ACPI_DEBUG enables the use of the debug.acpi.level and debug.acpi.layer # kernel environment variables to select initial debugging levels for the # Intel ACPICA code. (Note that the Intel code must also have USE_DEBUGGER # defined when it is built). device acpi options ACPI_DEBUG options ACPI_DMAR # ACPI WMI Mapping driver device acpi_wmi # ACPI Asus Extras (LCD backlight/brightness, video output, etc.) device acpi_asus # ACPI Fujitsu Extras (Buttons) device acpi_fujitsu # ACPI extras driver for HP laptops device acpi_hp # ACPI extras driver for IBM laptops device acpi_ibm # ACPI Panasonic Extras (LCD backlight/brightness, video output, etc.) device acpi_panasonic # ACPI Sony extra (LCD brightness) device acpi_sony # ACPI Toshiba Extras (LCD backlight/brightness, video output, etc.) device acpi_toshiba # ACPI Video Extensions (LCD backlight/brightness, video output, etc.) device acpi_video # ACPI Docking Station device acpi_dock # ACPI ASOC ATK0110 ASUSTeK AI Booster (voltage, temperature and fan sensors) device aibs # The cpufreq(4) driver provides support for non-ACPI CPU frequency control device cpufreq # Direct Rendering modules for 3D acceleration. device drm # DRM core module required by DRM drivers device i915drm # Intel i830 through i915 device mach64drm # ATI Rage Pro, Rage Mobility P/M, Rage XL device mgadrm # AGP Matrox G200, G400, G450, G550 device r128drm # ATI Rage 128 device radeondrm # ATI Radeon device savagedrm # S3 Savage3D, Savage4 device sisdrm # SiS 300/305, 540, 630 device tdfxdrm # 3dfx Voodoo 3/4/5 and Banshee device viadrm # VIA options DRM_DEBUG # Include debug printfs (slow) # # mse: Logitech and ATI InPort bus mouse ports device mse hint.mse.0.at="isa" hint.mse.0.port="0x23c" hint.mse.0.irq="5" # # Network interfaces: # # bxe: Broadcom NetXtreme II (BCM5771X/BCM578XX) PCIe 10Gb Ethernet # adapters. # ce: Cronyx Tau-PCI/32 sync single/dual port G.703/E1 serial adaptor # with 32 HDLC subchannels (requires sppp (default), or NETGRAPH if # NETGRAPH_CRONYX is configured) # cp: Cronyx Tau-PCI sync single/dual/four port # V.35/RS-232/RS-530/RS-449/X.21/G.703/E1/E3/T3/STS-1 # serial adaptor (requires sppp (default), or NETGRAPH if # NETGRAPH_CRONYX is configured) # cs: IBM Etherjet and other Crystal Semi CS89x0-based adapters # ctau: Cronyx Tau sync dual port V.35/RS-232/RS-530/RS-449/X.21/G.703/E1 # serial adaptor (requires sppp (default), or NETGRAPH if # NETGRAPH_CRONYX is configured) # ed: Western Digital and SMC 80xx; Novell NE1000 and NE2000; 3Com 3C503 # HP PC Lan+, various PC Card devices # (requires miibus) # ie: AT&T StarLAN 10 and EN100; 3Com 3C507; unknown NI5210; # Intel EtherExpress # ipw: Intel PRO/Wireless 2100 IEEE 802.11 adapter # iwi: Intel PRO/Wireless 2200BG/2225BG/2915ABG IEEE 802.11 adapters # Requires the iwi firmware module # iwn: Intel Wireless WiFi Link 1000/105/135/2000/4965/5000/6000/6050 abgn # 802.11 network adapters # Requires the iwn firmware module # mlx4ib: Mellanox ConnectX HCA InfiniBand # mlx4en: Mellanox ConnectX HCA Ethernet # mthca: Mellanox HCA InfiniBand # nfe: nVidia nForce MCP on-board Ethernet Networking (BSD open source) # sbni: Granch SBNI12-xx ISA and PCI adapters # vmx: VMware VMXNET3 Ethernet (BSD open source) # wl: Lucent Wavelan (ISA card only). # wpi: Intel 3945ABG Wireless LAN controller # Requires the wpi firmware module # Order for ISA/EISA devices is important here device bxe # Broadcom NetXtreme II BCM5771X/BCM578XX 10GbE device ce device cp device cs # Crystal Semiconductor CS89x0 NIC hint.cs.0.at="isa" hint.cs.0.port="0x300" device ctau hint.ctau.0.at="isa" hint.ctau.0.port="0x240" hint.ctau.0.irq="15" hint.ctau.0.drq="7" #options NETGRAPH_CRONYX # Enable NETGRAPH support for Cronyx adapter(s) device ed # NE[12]000, SMC Ultra, 3c503, DS8390 cards options ED_3C503 options ED_HPP options ED_SIC hint.ed.0.at="isa" hint.ed.0.port="0x280" hint.ed.0.irq="5" hint.ed.0.maddr="0xd8000" device ipw # Intel 2100 wireless NICs. device iwi # Intel 2200BG/2225BG/2915ABG wireless NICs. device iwn # Intel 4965/1000/5000/6000 wireless NICs. # Hint for the i386-only ISA front-end of le(4). hint.le.0.at="isa" hint.le.0.port="0x280" hint.le.0.irq="10" hint.le.0.drq="0" device mlx4 # Shared code module between IB and Ethernet device mlx4ib # Mellanox ConnectX HCA InfiniBand device mlx4en # Mellanox ConnectX HCA Ethernet device mthca # Mellanox HCA InfiniBand device nfe # nVidia nForce MCP on-board Ethernet device sbni hint.sbni.0.at="isa" hint.sbni.0.port="0x210" hint.sbni.0.irq="0xefdead" hint.sbni.0.flags="0" device vmx # VMware VMXNET3 Ethernet device wpi # Intel 3945ABG wireless NICs. # IEEE 802.11 adapter firmware modules # Intel PRO/Wireless 2100 firmware: # ipwfw: BSS/IBSS/monitor mode firmware # ipwbssfw: BSS mode firmware # ipwibssfw: IBSS mode firmware # ipwmonitorfw: Monitor mode firmware # Intel PRO/Wireless 2200BG/2225BG/2915ABG firmware: # iwifw: BSS/IBSS/monitor mode firmware # iwibssfw: BSS mode firmware # iwiibssfw: IBSS mode firmware # iwimonitorfw: Monitor mode firmware # Intel Wireless WiFi Link 4965/1000/5000/6000 series firmware: # iwnfw: Single module to support all devices # iwn1000fw: Specific module for the 1000 only # iwn105fw: Specific module for the 105 only # iwn135fw: Specific module for the 135 only # iwn2000fw: Specific module for the 2000 only # iwn2030fw: Specific module for the 2030 only # iwn4965fw: Specific module for the 4965 only # iwn5000fw: Specific module for the 5000 only # iwn5150fw: Specific module for the 5150 only # iwn6000fw: Specific module for the 6000 only # iwn6000g2afw: Specific module for the 6000g2a only # iwn6000g2bfw: Specific module for the 6000g2b only # iwn6050fw: Specific module for the 6050 only # wpifw: Intel 3945ABG Wireless LAN Controller firmware device iwifw device iwibssfw device iwiibssfw device iwimonitorfw device ipwfw device ipwbssfw device ipwibssfw device ipwmonitorfw device iwnfw device iwn1000fw device iwn105fw device iwn135fw device iwn2000fw device iwn2030fw device iwn4965fw device iwn5000fw device iwn5150fw device iwn6000fw device iwn6000g2afw device iwn6000g2bfw device iwn6050fw device wpifw # # ATA raid adapters # device pst # # Areca 11xx and 12xx series of SATA II RAID controllers. # CAM is required. # device arcmsr # Areca SATA II RAID # # 3ware 9000 series PATA/SATA RAID controller driver and options. # The driver is implemented as a SIM, and so, needs the CAM infrastructure. # options TWA_DEBUG # 0-10; 10 prints the most messages. options TWA_FLASH_FIRMWARE # firmware image bundled when defined. device twa # 3ware 9000 series PATA/SATA RAID # # SCSI host adapters: # # ncv: NCR 53C500 based SCSI host adapters. # nsp: Workbit Ninja SCSI-3 based PC Card SCSI host adapters. # stg: TMC 18C30, 18C50 based SCSI host adapters. device ncv device nsp device stg hint.stg.0.at="isa" hint.stg.0.port="0x140" hint.stg.0.port="11" # # Adaptec FSA RAID controllers, including integrated DELL controllers, # the Dell PERC 2/QC and the HP NetRAID-4M device aac device aacp # SCSI Passthrough interface (optional, CAM required) # # Adaptec by PMC RAID controllers, Series 6/7/8 and upcoming families device aacraid # Container interface, CAM required # # Highpoint RocketRAID 27xx. device hpt27xx # # Highpoint RocketRAID 182x. device hptmv # # Highpoint DC7280 and R750. device hptnr # # Highpoint RocketRAID. Supports RR172x, RR222x, RR2240, RR232x, RR2340, # RR2210, RR174x, RR2522, RR231x, RR230x. device hptrr # # Highpoint RocketRaid 3xxx series SATA RAID device hptiop # # IBM (now Adaptec) ServeRAID controllers device ips # # Intel C600 (Patsburg) integrated SAS controller device isci options ISCI_LOGGING # enable debugging in isci HAL # # NVM Express (NVMe) support device nvme # base NVMe driver device nvd # expose NVMe namespaces as disks, depends on nvme # # PMC-Sierra SAS/SATA controller device pmspcv # # SafeNet crypto driver: can be moved to the MI NOTES as soon as # it's tested on a big-endian machine # device safe # SafeNet 1141 options SAFE_DEBUG # enable debugging support: hw.safe.debug options SAFE_RNDTEST # enable rndtest support # # glxiic is an I2C driver for the AMD Geode LX CS5536 System Management Bus # controller. Requires 'device iicbus'. # device glxiic # AMD Geode LX CS5536 System Management Bus # # glxsb is a driver for the Security Block in AMD Geode LX processors. # Requires 'device crypto'. # device glxsb # AMD Geode LX Security Block # # VirtIO support # # The virtio entry provides a generic bus for use by the device drivers. # It must be combined with an interface that communicates with the host. # Multiple such interfaces defined by the VirtIO specification. FreeBSD # only has support for PCI. Therefore, virtio_pci must be statically # compiled in or loaded as a module for the device drivers to function. # device virtio # Generic VirtIO bus (required) device virtio_pci # VirtIO PCI Interface device vtnet # VirtIO Ethernet device device virtio_blk # VirtIO Block device device virtio_scsi # VirtIO SCSI device device virtio_balloon # VirtIO Memory Balloon device device virtio_random # VirtIO Entropy device device virtio_console # VirtIO Console device device hyperv # HyperV drivers ##################################################################### # # Miscellaneous hardware: # # apm: Laptop Advanced Power Management (experimental) # ipmi: Intelligent Platform Management Interface # smapi: System Management Application Program Interface driver # smbios: DMI/SMBIOS entry point # vpd: Vital Product Data kernel interface # pmtimer: Adjust system timer at wakeup time # pbio: Parallel (8255 PPI) basic I/O (mode 0) port (e.g. Advantech PCL-724) # asmc: Apple System Management Controller # si: Specialix International SI/XIO or SX intelligent serial card driver # tpm: Trusted Platform Module # Notes on APM # The flags takes the following meaning for apm0: # 0x0020 Statclock is broken. # Notes on the Specialix SI/XIO driver: # The host card is memory, not IO mapped. # The Rev 1 host cards use a 64K chunk, on a 32K boundary. # The Rev 2 host cards use a 32K chunk, on a 32K boundary. # The cards can use an IRQ of 11, 12 or 15. # Notes on the Sony Programmable I/O controller # This is a temporary driver that should someday be replaced by something # that hooks into the ACPI layer. The device is hooked to the PIIX4's # General Device 10 decoder, which means you have to fiddle with PCI # registers to map it in, even though it is otherwise treated here as # an ISA device. At the moment, the driver polls, although the device # is capable of generating interrupts. It largely undocumented. # The port location in the hint is where you WANT the device to be # mapped. 0x10a0 seems to be traditional. At the moment the jogdial # is the only thing truly supported, but apparently a fair percentage # of the Vaio extra features are controlled by this device. device apm hint.apm.0.flags="0x20" device ipmi device smapi device smbios device vpd device pmtimer device pbio hint.pbio.0.at="isa" hint.pbio.0.port="0x360" device asmc device tpm device padlock_rng # VIA Padlock RNG device rdrand_rng # Intel Bull Mountain RNG device aesni # AES-NI OpenCrypto module # # Laptop/Notebook options: # # See also: # apm under `Miscellaneous hardware' # above. # For older notebooks that signal a powerfail condition (external # power supply dropped, or battery state low) by issuing an NMI: options POWERFAIL_NMI # make it beep instead of panicing # # I2C Bus # # Philips i2c bus support is provided by the `iicbus' device. # # Supported interfaces: # pcf Philips PCF8584 ISA-bus controller # device pcf hint.pcf.0.at="isa" hint.pcf.0.port="0x320" hint.pcf.0.irq="5" # # Hardware watchdog timers: # # ichwd: Intel ICH watchdog timer # amdsbwd: AMD SB7xx watchdog timer # viawd: VIA south bridge watchdog timer # wbwd: Winbond watchdog timer # device ichwd device amdsbwd device viawd device wbwd # # Temperature sensors: # # coretemp: on-die sensor on Intel Core and newer CPUs # amdtemp: on-die sensor on AMD K8/K10/K11 CPUs # device coretemp device amdtemp # # CPU control pseudo-device. Provides access to MSRs, CPUID info and # microcode update feature. # device cpuctl # # System Management Bus (SMB) # options ENABLE_ALART # Control alarm on Intel intpm driver # # Set the number of PV entries per process. Increasing this can # stop panics related to heavy use of shared memory. However, that can # (combined with large amounts of physical memory) cause panics at # boot time due the kernel running out of VM space. # # If you're tweaking this, you might also want to increase the sysctls # "vm.v_free_min", "vm.v_free_reserved", and "vm.v_free_target". # # The value below is the one more than the default. # options PMAP_SHPGPERPROC=201 # # Change the size of the kernel virtual address space. Due to # constraints in loader(8) on i386, this must be a multiple of 4. # 256 = 1 GB of kernel address space. Increasing this also causes # a reduction of the address space in user processes. 512 splits # the 4GB cpu address space in half (2GB user, 2GB kernel). For PAE # kernels, the value will need to be double non-PAE. A value of 1024 # for PAE kernels is necessary to split the address space in half. # This will likely need to be increased to handle memory sizes >4GB. # PAE kernels default to a value of 512. # options KVA_PAGES=260 # # Number of initial kernel page table pages used for early bootstrap. # This number should include enough pages to map the kernel, any # modules or other data loaded with the kernel by the loader, and data # structures allocated before the VM system is initialized such as the # vm_page_t array. Each page table page maps 4MB (2MB with PAE). # options NKPT=31 ##################################################################### # ABI Emulation # Enable iBCS2 runtime support for SCO and ISC binaries #options IBCS2 # Emulate spx device for client side of SVR3 local X interface options SPX_HACK # Enable 32-bit runtime support for CloudABI binaries. options COMPAT_CLOUDABI32 # Enable Linux ABI emulation options COMPAT_LINUX # Enable i386 a.out binary support options COMPAT_AOUT # Enable the linux-like proc filesystem support (requires COMPAT_LINUX # and PSEUDOFS) options LINPROCFS #Enable the linux-like sys filesystem support (requires COMPAT_LINUX # and PSEUDOFS) options LINSYSFS # # SysVR4 ABI emulation # # The svr4 ABI emulator can be statically compiled into the kernel or loaded as # a KLD module. # The STREAMS network emulation code can also be compiled statically or as a # module. If loaded as a module, it must be loaded before the svr4 module # (the /usr/sbin/svr4 script does this for you). If compiling statically, # the `streams' device must be configured into any kernel which also # specifies COMPAT_SVR4. It is possible to have a statically-configured # STREAMS device and a dynamically loadable svr4 emulator; the /usr/sbin/svr4 # script understands that it doesn't need to load the `streams' module under # those circumstances. # Caveat: At this time, `options KTRACE' is required for the svr4 emulator # (whether static or dynamic). # options COMPAT_SVR4 # build emulator statically options DEBUG_SVR4 # enable verbose debugging device streams # STREAMS network driver (required for svr4). # Enable NDIS binary driver support options NDISAPI device ndis ##################################################################### # VM OPTIONS # Disable the 4 MByte page PSE CPU feature. The PSE feature allows the # kernel to use 4 MByte pages to map the kernel instead of 4k pages. # This saves on the amount of memory needed for page tables needed to # map the kernel. You should only disable this feature as a temporary # workaround if you are having problems with it enabled. # #options DISABLE_PSE # Disable the global pages PGE CPU feature. The PGE feature allows pages # to be marked with the PG_G bit. TLB entries for these pages are not # flushed from the cache when %cr3 is reloaded. This can make context # switches less expensive. You should only disable this feature as a # temporary workaround if you are having problems with it enabled. # #options DISABLE_PG_G # KSTACK_PAGES is the number of memory pages to assign to the kernel # stack of each thread. options KSTACK_PAGES=3 # Enable detailed accounting by the PV entry allocator. options PV_STATS ##################################################################### # More undocumented options for linting. # Note that documenting these are not considered an affront. options FB_INSTALL_CDEV # install a CDEV entry in /dev options I586_PMC_GUPROF=0x70000 options KBDIO_DEBUG=2 options KBD_MAXRETRY=4 options KBD_MAXWAIT=6 options KBD_RESETDELAY=201 options PSM_DEBUG=1 options TIMER_FREQ=((14318182+6)/12) options VM_KMEM_SIZE options VM_KMEM_SIZE_MAX options VM_KMEM_SIZE_SCALE Index: head/sys/i386/i386/elf_machdep.c =================================================================== --- head/sys/i386/i386/elf_machdep.c (revision 313153) +++ head/sys/i386/i386/elf_machdep.c (revision 313154) @@ -1,289 +1,281 @@ /*- * Copyright 1996-1998 John D. Polstra. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_cpu.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) -#define CPU_ENABLE_SSE -#endif - struct sysentvec elf32_freebsd_sysvec = { .sv_size = SYS_MAXSYSCALL, .sv_table = sysent, .sv_mask = 0, .sv_errsize = 0, .sv_errtbl = NULL, .sv_transtrap = NULL, .sv_fixup = __elfN(freebsd_fixup), .sv_sendsig = sendsig, .sv_sigcode = sigcode, .sv_szsigcode = &szsigcode, .sv_name = "FreeBSD ELF32", .sv_coredump = __elfN(coredump), .sv_imgact_try = NULL, .sv_minsigstksz = MINSIGSTKSZ, .sv_pagesize = PAGE_SIZE, .sv_minuser = VM_MIN_ADDRESS, .sv_maxuser = VM_MAXUSER_ADDRESS, .sv_usrstack = USRSTACK, .sv_psstrings = PS_STRINGS, .sv_stackprot = VM_PROT_ALL, .sv_copyout_strings = exec_copyout_strings, .sv_setregs = exec_setregs, .sv_fixlimit = NULL, .sv_maxssiz = NULL, .sv_flags = SV_ABI_FREEBSD | SV_IA32 | SV_ILP32 | SV_SHP | SV_TIMEKEEP, .sv_set_syscall_retval = cpu_set_syscall_retval, .sv_fetch_syscall_args = cpu_fetch_syscall_args, .sv_syscallnames = syscallnames, .sv_shared_page_base = SHAREDPAGE, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = NULL, .sv_thread_detach = NULL, .sv_trap = NULL, }; INIT_SYSENTVEC(elf32_sysvec, &elf32_freebsd_sysvec); static Elf32_Brandinfo freebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/libexec/ld-elf.so.1", .sysvec = &elf32_freebsd_sysvec, .interp_newpath = NULL, .brand_note = &elf32_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; SYSINIT(elf32, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t) elf32_insert_brand_entry, &freebsd_brand_info); static Elf32_Brandinfo freebsd_brand_oinfo = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/usr/libexec/ld-elf.so.1", .sysvec = &elf32_freebsd_sysvec, .interp_newpath = NULL, .brand_note = &elf32_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; SYSINIT(oelf32, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t) elf32_insert_brand_entry, &freebsd_brand_oinfo); static Elf32_Brandinfo kfreebsd_brand_info = { .brand = ELFOSABI_FREEBSD, .machine = EM_386, .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/lib/ld.so.1", .sysvec = &elf32_freebsd_sysvec, .interp_newpath = NULL, .brand_note = &elf32_kfreebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY }; SYSINIT(kelf32, SI_SUB_EXEC, SI_ORDER_ANY, (sysinit_cfunc_t) elf32_insert_brand_entry, &kfreebsd_brand_info); void elf32_dump_thread(struct thread *td, void *dst, size_t *off) { -#ifdef CPU_ENABLE_SSE void *buf; -#endif size_t len; len = 0; -#ifdef CPU_ENABLE_SSE if (use_xsave) { if (dst != NULL) { npxgetregs(td); len += elf32_populate_note(NT_X86_XSTATE, get_pcb_user_save_td(td), dst, cpu_max_ext_state_size, &buf); *(uint64_t *)((char *)buf + X86_XSTATE_XCR0_OFFSET) = xsave_mask; } else len += elf32_populate_note(NT_X86_XSTATE, NULL, NULL, cpu_max_ext_state_size, NULL); } -#endif *off = len; } /* Process one elf relocation with addend. */ static int elf_reloc_internal(linker_file_t lf, Elf_Addr relocbase, const void *data, int type, int local, elf_lookup_fn lookup) { Elf_Addr *where; Elf_Addr addr; Elf_Addr addend; Elf_Word rtype, symidx; const Elf_Rel *rel; const Elf_Rela *rela; int error; switch (type) { case ELF_RELOC_REL: rel = (const Elf_Rel *)data; where = (Elf_Addr *) (relocbase + rel->r_offset); addend = *where; rtype = ELF_R_TYPE(rel->r_info); symidx = ELF_R_SYM(rel->r_info); break; case ELF_RELOC_RELA: rela = (const Elf_Rela *)data; where = (Elf_Addr *) (relocbase + rela->r_offset); addend = rela->r_addend; rtype = ELF_R_TYPE(rela->r_info); symidx = ELF_R_SYM(rela->r_info); break; default: panic("unknown reloc type %d\n", type); } if (local) { if (rtype == R_386_RELATIVE) { /* A + B */ addr = elf_relocaddr(lf, relocbase + addend); if (*where != addr) *where = addr; } return (0); } switch (rtype) { case R_386_NONE: /* none */ break; case R_386_32: /* S + A */ error = lookup(lf, symidx, 1, &addr); if (error != 0) return -1; addr += addend; if (*where != addr) *where = addr; break; case R_386_PC32: /* S + A - P */ error = lookup(lf, symidx, 1, &addr); if (error != 0) return -1; addr += addend - (Elf_Addr)where; if (*where != addr) *where = addr; break; case R_386_COPY: /* none */ /* * There shouldn't be copy relocations in kernel * objects. */ printf("kldload: unexpected R_COPY relocation\n"); return -1; break; case R_386_GLOB_DAT: /* S */ error = lookup(lf, symidx, 1, &addr); if (error != 0) return -1; if (*where != addr) *where = addr; break; case R_386_RELATIVE: break; default: printf("kldload: unexpected relocation type %d\n", rtype); return -1; } return(0); } int elf_reloc(linker_file_t lf, Elf_Addr relocbase, const void *data, int type, elf_lookup_fn lookup) { return (elf_reloc_internal(lf, relocbase, data, type, 0, lookup)); } int elf_reloc_local(linker_file_t lf, Elf_Addr relocbase, const void *data, int type, elf_lookup_fn lookup) { return (elf_reloc_internal(lf, relocbase, data, type, 1, lookup)); } int elf_cpu_load_file(linker_file_t lf __unused) { return (0); } int elf_cpu_unload_file(linker_file_t lf __unused) { return (0); } Index: head/sys/i386/i386/exception.s =================================================================== --- head/sys/i386/i386/exception.s (revision 313153) +++ head/sys/i386/i386/exception.s (revision 313154) @@ -1,482 +1,481 @@ /*- * Copyright (c) 1989, 1990 William F. Jolitz. * Copyright (c) 1990 The Regents of the University of California. * Copyright (c) 2007 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_apic.h" #include "opt_atpic.h" #include "opt_hwpmc_hooks.h" -#include "opt_npx.h" #include #include #include #include "assym.s" #define SEL_RPL_MASK 0x0003 #define GSEL_KPL 0x0020 /* GSEL(GCODE_SEL, SEL_KPL) */ #ifdef KDTRACE_HOOKS .bss .globl dtrace_invop_jump_addr .align 4 .type dtrace_invop_jump_addr, @object .size dtrace_invop_jump_addr, 4 dtrace_invop_jump_addr: .zero 4 .globl dtrace_invop_calltrap_addr .align 4 .type dtrace_invop_calltrap_addr, @object .size dtrace_invop_calltrap_addr, 4 dtrace_invop_calltrap_addr: .zero 8 #endif .text #ifdef HWPMC_HOOKS ENTRY(start_exceptions) #endif /*****************************************************************************/ /* Trap handling */ /*****************************************************************************/ /* * Trap and fault vector routines. * * Most traps are 'trap gates', SDT_SYS386TGT. A trap gate pushes state on * the stack that mostly looks like an interrupt, but does not disable * interrupts. A few of the traps we are use are interrupt gates, * SDT_SYS386IGT, which are nearly the same thing except interrupts are * disabled on entry. * * The cpu will push a certain amount of state onto the kernel stack for * the current process. The amount of state depends on the type of trap * and whether the trap crossed rings or not. See i386/include/frame.h. * At the very least the current EFLAGS (status register, which includes * the interrupt disable state prior to the trap), the code segment register, * and the return instruction pointer are pushed by the cpu. The cpu * will also push an 'error' code for certain traps. We push a dummy * error code for those traps where the cpu doesn't in order to maintain * a consistent frame. We also push a contrived 'trap number'. * * The cpu does not push the general registers, we must do that, and we * must restore them prior to calling 'iret'. The cpu adjusts the %cs and * %ss segment registers, but does not mess with %ds, %es, or %fs. Thus we * must load them with appropriate values for supervisor mode operation. */ MCOUNT_LABEL(user) MCOUNT_LABEL(btrap) #define TRAP(a) pushl $(a) ; jmp alltraps IDTVEC(div) pushl $0; TRAP(T_DIVIDE) IDTVEC(dbg) pushl $0; TRAP(T_TRCTRAP) IDTVEC(nmi) pushl $0; TRAP(T_NMI) IDTVEC(bpt) pushl $0; TRAP(T_BPTFLT) IDTVEC(dtrace_ret) pushl $0; TRAP(T_DTRACE_RET) IDTVEC(ofl) pushl $0; TRAP(T_OFLOW) IDTVEC(bnd) pushl $0; TRAP(T_BOUND) #ifndef KDTRACE_HOOKS IDTVEC(ill) pushl $0; TRAP(T_PRIVINFLT) #endif IDTVEC(dna) pushl $0; TRAP(T_DNA) IDTVEC(fpusegm) pushl $0; TRAP(T_FPOPFLT) IDTVEC(tss) TRAP(T_TSSFLT) IDTVEC(missing) TRAP(T_SEGNPFLT) IDTVEC(stk) TRAP(T_STKFLT) IDTVEC(prot) TRAP(T_PROTFLT) IDTVEC(page) TRAP(T_PAGEFLT) IDTVEC(mchk) pushl $0; TRAP(T_MCHK) IDTVEC(rsvd) pushl $0; TRAP(T_RESERVED) IDTVEC(fpu) pushl $0; TRAP(T_ARITHTRAP) IDTVEC(align) TRAP(T_ALIGNFLT) IDTVEC(xmm) pushl $0; TRAP(T_XMMFLT) /* * All traps except ones for syscalls jump to alltraps. If * interrupts were enabled when the trap occurred, then interrupts * are enabled now if the trap was through a trap gate, else * disabled if the trap was through an interrupt gate. Note that * int0x80_syscall is a trap gate. Interrupt gates are used by * page faults, non-maskable interrupts, debug and breakpoint * exceptions. */ SUPERALIGN_TEXT .globl alltraps .type alltraps,@function alltraps: pushal pushl $0 movw %ds,(%esp) pushl $0 movw %es,(%esp) pushl $0 movw %fs,(%esp) alltraps_with_regs_pushed: SET_KERNEL_SREGS cld FAKE_MCOUNT(TF_EIP(%esp)) calltrap: pushl %esp call trap add $4, %esp /* * Return via doreti to handle ASTs. */ MEXITCOUNT jmp doreti /* * Privileged instruction fault. */ #ifdef KDTRACE_HOOKS SUPERALIGN_TEXT IDTVEC(ill) /* Check if there is no DTrace hook registered. */ cmpl $0,dtrace_invop_jump_addr je norm_ill /* Check if this is a user fault. */ cmpl $GSEL_KPL, 4(%esp) /* Check the code segment. */ /* If so, just handle it as a normal trap. */ jne norm_ill /* * This is a kernel instruction fault that might have been caused * by a DTrace provider. */ pushal /* Push all registers onto the stack. */ /* * Set our jump address for the jump back in the event that * the exception wasn't caused by DTrace at all. */ movl $norm_ill, dtrace_invop_calltrap_addr /* Jump to the code hooked in by DTrace. */ jmpl *dtrace_invop_jump_addr /* * Process the instruction fault in the normal way. */ norm_ill: pushl $0 TRAP(T_PRIVINFLT) #endif /* * Call gate entry for syscalls (lcall 7,0). * This is used by FreeBSD 1.x a.out executables and "old" NetBSD executables. * * The intersegment call has been set up to specify one dummy parameter. * This leaves a place to put eflags so that the call frame can be * converted to a trap frame. Note that the eflags is (semi-)bogusly * pushed into (what will be) tf_err and then copied later into the * final spot. It has to be done this way because esp can't be just * temporarily altered for the pushfl - an interrupt might come in * and clobber the saved cs/eip. */ SUPERALIGN_TEXT IDTVEC(lcall_syscall) pushfl /* save eflags */ popl 8(%esp) /* shuffle into tf_eflags */ pushl $7 /* sizeof "lcall 7,0" */ pushl $0 /* tf_trapno */ pushal pushl $0 movw %ds,(%esp) pushl $0 movw %es,(%esp) pushl $0 movw %fs,(%esp) SET_KERNEL_SREGS cld FAKE_MCOUNT(TF_EIP(%esp)) pushl %esp call syscall add $4, %esp MEXITCOUNT jmp doreti /* * Trap gate entry for syscalls (int 0x80). * This is used by FreeBSD ELF executables, "new" NetBSD executables, and all * Linux executables. * * Even though the name says 'int0x80', this is actually a trap gate, not an * interrupt gate. Thus interrupts are enabled on entry just as they are for * a normal syscall. */ SUPERALIGN_TEXT IDTVEC(int0x80_syscall) pushl $2 /* sizeof "int 0x80" */ pushl $0 /* tf_trapno */ pushal pushl $0 movw %ds,(%esp) pushl $0 movw %es,(%esp) pushl $0 movw %fs,(%esp) SET_KERNEL_SREGS cld FAKE_MCOUNT(TF_EIP(%esp)) pushl %esp call syscall add $4, %esp MEXITCOUNT jmp doreti ENTRY(fork_trampoline) pushl %esp /* trapframe pointer */ pushl %ebx /* arg1 */ pushl %esi /* function */ call fork_exit addl $12,%esp /* cut from syscall */ /* * Return via doreti to handle ASTs. */ MEXITCOUNT jmp doreti /* * To efficiently implement classification of trap and interrupt handlers * for profiling, there must be only trap handlers between the labels btrap * and bintr, and only interrupt handlers between the labels bintr and * eintr. This is implemented (partly) by including files that contain * some of the handlers. Before including the files, set up a normal asm * environment so that the included files doen't need to know that they are * included. */ .data .p2align 4 .text SUPERALIGN_TEXT MCOUNT_LABEL(bintr) #ifdef DEV_ATPIC #include #endif #if defined(DEV_APIC) && defined(DEV_ATPIC) .data .p2align 4 .text SUPERALIGN_TEXT #endif #ifdef DEV_APIC #include #endif .data .p2align 4 .text SUPERALIGN_TEXT #include .text MCOUNT_LABEL(eintr) /* * void doreti(struct trapframe) * * Handle return from interrupts, traps and syscalls. */ .text SUPERALIGN_TEXT .type doreti,@function .globl doreti doreti: FAKE_MCOUNT($bintr) /* init "from" bintr -> doreti */ doreti_next: /* * Check if ASTs can be handled now. ASTs cannot be safely * processed when returning from an NMI. */ cmpb $T_NMI,TF_TRAPNO(%esp) #ifdef HWPMC_HOOKS je doreti_nmi #else je doreti_exit #endif /* * PSL_VM must be checked first since segment registers only * have an RPL in non-VM86 mode. * ASTs can not be handled now if we are in a vm86 call. */ testl $PSL_VM,TF_EFLAGS(%esp) jz doreti_notvm86 movl PCPU(CURPCB),%ecx testl $PCB_VM86CALL,PCB_FLAGS(%ecx) jz doreti_ast jmp doreti_exit doreti_notvm86: testb $SEL_RPL_MASK,TF_CS(%esp) /* are we returning to user mode? */ jz doreti_exit /* can't handle ASTs now if not */ doreti_ast: /* * Check for ASTs atomically with returning. Disabling CPU * interrupts provides sufficient locking even in the SMP case, * since we will be informed of any new ASTs by an IPI. */ cli movl PCPU(CURTHREAD),%eax testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%eax) je doreti_exit sti pushl %esp /* pass a pointer to the trapframe */ call ast add $4,%esp jmp doreti_ast /* * doreti_exit: pop registers, iret. * * The segment register pop is a special case, since it may * fault if (for example) a sigreturn specifies bad segment * registers. The fault is handled in trap.c. */ doreti_exit: MEXITCOUNT .globl doreti_popl_fs doreti_popl_fs: popl %fs .globl doreti_popl_es doreti_popl_es: popl %es .globl doreti_popl_ds doreti_popl_ds: popl %ds popal addl $8,%esp .globl doreti_iret doreti_iret: iret /* * doreti_iret_fault and friends. Alternative return code for * the case where we get a fault in the doreti_exit code * above. trap() (i386/i386/trap.c) catches this specific * case, sends the process a signal and continues in the * corresponding place in the code below. */ ALIGN_TEXT .globl doreti_iret_fault doreti_iret_fault: subl $8,%esp pushal pushl $0 movw %ds,(%esp) .globl doreti_popl_ds_fault doreti_popl_ds_fault: pushl $0 movw %es,(%esp) .globl doreti_popl_es_fault doreti_popl_es_fault: pushl $0 movw %fs,(%esp) .globl doreti_popl_fs_fault doreti_popl_fs_fault: sti movl $0,TF_ERR(%esp) /* XXX should be the error code */ movl $T_PROTFLT,TF_TRAPNO(%esp) jmp alltraps_with_regs_pushed #ifdef HWPMC_HOOKS doreti_nmi: /* * Since we are returning from an NMI, check if the current trap * was from user mode and if so whether the current thread * needs a user call chain capture. */ testb $SEL_RPL_MASK,TF_CS(%esp) jz doreti_exit movl PCPU(CURTHREAD),%eax /* curthread present? */ orl %eax,%eax jz doreti_exit testl $TDP_CALLCHAIN,TD_PFLAGS(%eax) /* flagged for capture? */ jz doreti_exit /* * Take the processor out of NMI mode by executing a fake "iret". */ pushfl pushl %cs pushl $outofnmi iret outofnmi: /* * Call the callchain capture hook after turning interrupts back on. */ movl pmc_hook,%ecx orl %ecx,%ecx jz doreti_exit pushl %esp /* frame pointer */ pushl $PMC_FN_USER_CALLCHAIN /* command */ movl PCPU(CURTHREAD),%eax pushl %eax /* curthread */ sti call *%ecx addl $12,%esp jmp doreti_ast ENTRY(end_exceptions) #endif Index: head/sys/i386/i386/initcpu.c =================================================================== --- head/sys/i386/i386/initcpu.c (revision 313153) +++ head/sys/i386/i386/initcpu.c (revision 313154) @@ -1,989 +1,983 @@ /*- * Copyright (c) KATO Takenori, 1997, 1998. * * All rights reserved. Unpublished rights reserved under the copyright * laws of Japan. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer as * the first lines of this file unmodified. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_cpu.h" #include #include #include #include #include #include #include #include #include -#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) -#define CPU_ENABLE_SSE -#endif - #ifdef I486_CPU static void init_5x86(void); static void init_bluelightning(void); static void init_486dlc(void); static void init_cy486dx(void); #ifdef CPU_I486_ON_386 static void init_i486_on_386(void); #endif static void init_6x86(void); #endif /* I486_CPU */ #if defined(I586_CPU) && defined(CPU_WT_ALLOC) static void enable_K5_wt_alloc(void); static void enable_K6_wt_alloc(void); static void enable_K6_2_wt_alloc(void); #endif #ifdef I686_CPU static void init_6x86MX(void); static void init_ppro(void); static void init_mendocino(void); #endif static int hw_instruction_sse; SYSCTL_INT(_hw, OID_AUTO, instruction_sse, CTLFLAG_RD, &hw_instruction_sse, 0, "SIMD/MMX2 instructions available in CPU"); /* * -1: automatic (default) * 0: keep enable CLFLUSH * 1: force disable CLFLUSH */ static int hw_clflush_disable = -1; u_int cyrix_did; /* Device ID of Cyrix CPU */ #ifdef I486_CPU /* * IBM Blue Lightning */ static void init_bluelightning(void) { register_t saveintr; saveintr = intr_disable(); load_cr0(rcr0() | CR0_CD | CR0_NW); invd(); #ifdef CPU_BLUELIGHTNING_FPU_OP_CACHE wrmsr(0x1000, 0x9c92LL); /* FP operand can be cacheable on Cyrix FPU */ #else wrmsr(0x1000, 0x1c92LL); /* Intel FPU */ #endif /* Enables 13MB and 0-640KB cache. */ wrmsr(0x1001, (0xd0LL << 32) | 0x3ff); #ifdef CPU_BLUELIGHTNING_3X wrmsr(0x1002, 0x04000000LL); /* Enables triple-clock mode. */ #else wrmsr(0x1002, 0x03000000LL); /* Enables double-clock mode. */ #endif /* Enable caching in CR0. */ load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ invd(); intr_restore(saveintr); } /* * Cyrix 486SLC/DLC/SR/DR series */ static void init_486dlc(void) { register_t saveintr; u_char ccr0; saveintr = intr_disable(); invd(); ccr0 = read_cyrix_reg(CCR0); #ifndef CYRIX_CACHE_WORKS ccr0 |= CCR0_NC1 | CCR0_BARB; write_cyrix_reg(CCR0, ccr0); invd(); #else ccr0 &= ~CCR0_NC0; #ifndef CYRIX_CACHE_REALLY_WORKS ccr0 |= CCR0_NC1 | CCR0_BARB; #else ccr0 |= CCR0_NC1; #endif #ifdef CPU_DIRECT_MAPPED_CACHE ccr0 |= CCR0_CO; /* Direct mapped mode. */ #endif write_cyrix_reg(CCR0, ccr0); /* Clear non-cacheable region. */ write_cyrix_reg(NCR1+2, NCR_SIZE_0K); write_cyrix_reg(NCR2+2, NCR_SIZE_0K); write_cyrix_reg(NCR3+2, NCR_SIZE_0K); write_cyrix_reg(NCR4+2, NCR_SIZE_0K); write_cyrix_reg(0, 0); /* dummy write */ /* Enable caching in CR0. */ load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ invd(); #endif /* !CYRIX_CACHE_WORKS */ intr_restore(saveintr); } /* * Cyrix 486S/DX series */ static void init_cy486dx(void) { register_t saveintr; u_char ccr2; saveintr = intr_disable(); invd(); ccr2 = read_cyrix_reg(CCR2); #ifdef CPU_SUSP_HLT ccr2 |= CCR2_SUSP_HLT; #endif write_cyrix_reg(CCR2, ccr2); intr_restore(saveintr); } /* * Cyrix 5x86 */ static void init_5x86(void) { register_t saveintr; u_char ccr2, ccr3, ccr4, pcr0; saveintr = intr_disable(); load_cr0(rcr0() | CR0_CD | CR0_NW); wbinvd(); (void)read_cyrix_reg(CCR3); /* dummy */ /* Initialize CCR2. */ ccr2 = read_cyrix_reg(CCR2); ccr2 |= CCR2_WB; #ifdef CPU_SUSP_HLT ccr2 |= CCR2_SUSP_HLT; #else ccr2 &= ~CCR2_SUSP_HLT; #endif ccr2 |= CCR2_WT1; write_cyrix_reg(CCR2, ccr2); /* Initialize CCR4. */ ccr3 = read_cyrix_reg(CCR3); write_cyrix_reg(CCR3, CCR3_MAPEN0); ccr4 = read_cyrix_reg(CCR4); ccr4 |= CCR4_DTE; ccr4 |= CCR4_MEM; #ifdef CPU_FASTER_5X86_FPU ccr4 |= CCR4_FASTFPE; #else ccr4 &= ~CCR4_FASTFPE; #endif ccr4 &= ~CCR4_IOMASK; /******************************************************************** * WARNING: The "BIOS Writers Guide" mentions that I/O recovery time * should be 0 for errata fix. ********************************************************************/ #ifdef CPU_IORT ccr4 |= CPU_IORT & CCR4_IOMASK; #endif write_cyrix_reg(CCR4, ccr4); /* Initialize PCR0. */ /**************************************************************** * WARNING: RSTK_EN and LOOP_EN could make your system unstable. * BTB_EN might make your system unstable. ****************************************************************/ pcr0 = read_cyrix_reg(PCR0); #ifdef CPU_RSTK_EN pcr0 |= PCR0_RSTK; #else pcr0 &= ~PCR0_RSTK; #endif #ifdef CPU_BTB_EN pcr0 |= PCR0_BTB; #else pcr0 &= ~PCR0_BTB; #endif #ifdef CPU_LOOP_EN pcr0 |= PCR0_LOOP; #else pcr0 &= ~PCR0_LOOP; #endif /**************************************************************** * WARNING: if you use a memory mapped I/O device, don't use * DISABLE_5X86_LSSER option, which may reorder memory mapped * I/O access. * IF YOUR MOTHERBOARD HAS PCI BUS, DON'T DISABLE LSSER. ****************************************************************/ #ifdef CPU_DISABLE_5X86_LSSER pcr0 &= ~PCR0_LSSER; #else pcr0 |= PCR0_LSSER; #endif write_cyrix_reg(PCR0, pcr0); /* Restore CCR3. */ write_cyrix_reg(CCR3, ccr3); (void)read_cyrix_reg(0x80); /* dummy */ /* Unlock NW bit in CR0. */ write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_LOCK_NW); load_cr0((rcr0() & ~CR0_CD) | CR0_NW); /* CD = 0, NW = 1 */ /* Lock NW bit in CR0. */ write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_LOCK_NW); intr_restore(saveintr); } #ifdef CPU_I486_ON_386 /* * There are i486 based upgrade products for i386 machines. * In this case, BIOS doesn't enable CPU cache. */ static void init_i486_on_386(void) { register_t saveintr; saveintr = intr_disable(); load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0, NW = 0 */ intr_restore(saveintr); } #endif /* * Cyrix 6x86 * * XXX - What should I do here? Please let me know. */ static void init_6x86(void) { register_t saveintr; u_char ccr3, ccr4; saveintr = intr_disable(); load_cr0(rcr0() | CR0_CD | CR0_NW); wbinvd(); /* Initialize CCR0. */ write_cyrix_reg(CCR0, read_cyrix_reg(CCR0) | CCR0_NC1); /* Initialize CCR1. */ #ifdef CPU_CYRIX_NO_LOCK write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) | CCR1_NO_LOCK); #else write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) & ~CCR1_NO_LOCK); #endif /* Initialize CCR2. */ #ifdef CPU_SUSP_HLT write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_SUSP_HLT); #else write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_SUSP_HLT); #endif ccr3 = read_cyrix_reg(CCR3); write_cyrix_reg(CCR3, CCR3_MAPEN0); /* Initialize CCR4. */ ccr4 = read_cyrix_reg(CCR4); ccr4 |= CCR4_DTE; ccr4 &= ~CCR4_IOMASK; #ifdef CPU_IORT write_cyrix_reg(CCR4, ccr4 | (CPU_IORT & CCR4_IOMASK)); #else write_cyrix_reg(CCR4, ccr4 | 7); #endif /* Initialize CCR5. */ #ifdef CPU_WT_ALLOC write_cyrix_reg(CCR5, read_cyrix_reg(CCR5) | CCR5_WT_ALLOC); #endif /* Restore CCR3. */ write_cyrix_reg(CCR3, ccr3); /* Unlock NW bit in CR0. */ write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_LOCK_NW); /* * Earlier revision of the 6x86 CPU could crash the system if * L1 cache is in write-back mode. */ if ((cyrix_did & 0xff00) > 0x1600) load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ else { /* Revision 2.6 and lower. */ #ifdef CYRIX_CACHE_REALLY_WORKS load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ #else load_cr0((rcr0() & ~CR0_CD) | CR0_NW); /* CD = 0 and NW = 1 */ #endif } /* Lock NW bit in CR0. */ write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_LOCK_NW); intr_restore(saveintr); } #endif /* I486_CPU */ #ifdef I586_CPU /* * Rise mP6 */ static void init_rise(void) { /* * The CMPXCHG8B instruction is always available but hidden. */ cpu_feature |= CPUID_CX8; } /* * IDT WinChip C6/2/2A/2B/3 * * http://www.centtech.com/winchip_bios_writers_guide_v4_0.pdf */ static void init_winchip(void) { u_int regs[4]; uint64_t fcr; fcr = rdmsr(0x0107); /* * Set ECX8, DSMC, DTLOCK/EDCTLB, EMMX, and ERETSTK and clear DPDC. */ fcr |= (1 << 1) | (1 << 7) | (1 << 8) | (1 << 9) | (1 << 16); fcr &= ~(1ULL << 11); /* * Additionally, set EBRPRED, E2MMX and EAMD3D for WinChip 2 and 3. */ if (CPUID_TO_MODEL(cpu_id) >= 8) fcr |= (1 << 12) | (1 << 19) | (1 << 20); wrmsr(0x0107, fcr); do_cpuid(1, regs); cpu_feature = regs[3]; } #endif #ifdef I686_CPU /* * Cyrix 6x86MX (code-named M2) * * XXX - What should I do here? Please let me know. */ static void init_6x86MX(void) { register_t saveintr; u_char ccr3, ccr4; saveintr = intr_disable(); load_cr0(rcr0() | CR0_CD | CR0_NW); wbinvd(); /* Initialize CCR0. */ write_cyrix_reg(CCR0, read_cyrix_reg(CCR0) | CCR0_NC1); /* Initialize CCR1. */ #ifdef CPU_CYRIX_NO_LOCK write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) | CCR1_NO_LOCK); #else write_cyrix_reg(CCR1, read_cyrix_reg(CCR1) & ~CCR1_NO_LOCK); #endif /* Initialize CCR2. */ #ifdef CPU_SUSP_HLT write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_SUSP_HLT); #else write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_SUSP_HLT); #endif ccr3 = read_cyrix_reg(CCR3); write_cyrix_reg(CCR3, CCR3_MAPEN0); /* Initialize CCR4. */ ccr4 = read_cyrix_reg(CCR4); ccr4 &= ~CCR4_IOMASK; #ifdef CPU_IORT write_cyrix_reg(CCR4, ccr4 | (CPU_IORT & CCR4_IOMASK)); #else write_cyrix_reg(CCR4, ccr4 | 7); #endif /* Initialize CCR5. */ #ifdef CPU_WT_ALLOC write_cyrix_reg(CCR5, read_cyrix_reg(CCR5) | CCR5_WT_ALLOC); #endif /* Restore CCR3. */ write_cyrix_reg(CCR3, ccr3); /* Unlock NW bit in CR0. */ write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) & ~CCR2_LOCK_NW); load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); /* CD = 0 and NW = 0 */ /* Lock NW bit in CR0. */ write_cyrix_reg(CCR2, read_cyrix_reg(CCR2) | CCR2_LOCK_NW); intr_restore(saveintr); } static int ppro_apic_used = -1; static void init_ppro(void) { u_int64_t apicbase; /* * Local APIC should be disabled if it is not going to be used. */ if (ppro_apic_used != 1) { apicbase = rdmsr(MSR_APICBASE); apicbase &= ~APICBASE_ENABLED; wrmsr(MSR_APICBASE, apicbase); ppro_apic_used = 0; } } /* * If the local APIC is going to be used after being disabled above, * re-enable it and don't disable it in the future. */ void ppro_reenable_apic(void) { u_int64_t apicbase; if (ppro_apic_used == 0) { apicbase = rdmsr(MSR_APICBASE); apicbase |= APICBASE_ENABLED; wrmsr(MSR_APICBASE, apicbase); ppro_apic_used = 1; } } /* * Initialize BBL_CR_CTL3 (Control register 3: used to configure the * L2 cache). */ static void init_mendocino(void) { #ifdef CPU_PPRO2CELERON register_t saveintr; u_int64_t bbl_cr_ctl3; saveintr = intr_disable(); load_cr0(rcr0() | CR0_CD | CR0_NW); wbinvd(); bbl_cr_ctl3 = rdmsr(MSR_BBL_CR_CTL3); /* If the L2 cache is configured, do nothing. */ if (!(bbl_cr_ctl3 & 1)) { bbl_cr_ctl3 = 0x134052bLL; /* Set L2 Cache Latency (Default: 5). */ #ifdef CPU_CELERON_L2_LATENCY #if CPU_L2_LATENCY > 15 #error invalid CPU_L2_LATENCY. #endif bbl_cr_ctl3 |= CPU_L2_LATENCY << 1; #else bbl_cr_ctl3 |= 5 << 1; #endif wrmsr(MSR_BBL_CR_CTL3, bbl_cr_ctl3); } load_cr0(rcr0() & ~(CR0_CD | CR0_NW)); intr_restore(saveintr); #endif /* CPU_PPRO2CELERON */ } /* * Initialize special VIA features */ static void init_via(void) { u_int regs[4], val; uint64_t fcr; /* * Explicitly enable CX8 and PGE on C3. * * http://www.via.com.tw/download/mainboards/6/13/VIA_C3_EBGA%20datasheet110.pdf */ if (CPUID_TO_MODEL(cpu_id) <= 9) fcr = (1 << 1) | (1 << 7); else fcr = 0; /* * Check extended CPUID for PadLock features. * * http://www.via.com.tw/en/downloads/whitepapers/initiatives/padlock/programming_guide.pdf */ do_cpuid(0xc0000000, regs); if (regs[0] >= 0xc0000001) { do_cpuid(0xc0000001, regs); val = regs[3]; } else val = 0; /* Enable RNG if present. */ if ((val & VIA_CPUID_HAS_RNG) != 0) { via_feature_rng = VIA_HAS_RNG; wrmsr(0x110B, rdmsr(0x110B) | VIA_CPUID_DO_RNG); } /* Enable PadLock if present. */ if ((val & VIA_CPUID_HAS_ACE) != 0) via_feature_xcrypt |= VIA_HAS_AES; if ((val & VIA_CPUID_HAS_ACE2) != 0) via_feature_xcrypt |= VIA_HAS_AESCTR; if ((val & VIA_CPUID_HAS_PHE) != 0) via_feature_xcrypt |= VIA_HAS_SHA; if ((val & VIA_CPUID_HAS_PMM) != 0) via_feature_xcrypt |= VIA_HAS_MM; if (via_feature_xcrypt != 0) fcr |= 1 << 28; wrmsr(0x1107, rdmsr(0x1107) | fcr); } #endif /* I686_CPU */ #if defined(I586_CPU) || defined(I686_CPU) static void init_transmeta(void) { u_int regs[0]; /* Expose all hidden features. */ wrmsr(0x80860004, rdmsr(0x80860004) | ~0UL); do_cpuid(1, regs); cpu_feature = regs[3]; } #endif extern int elf32_nxstack; void initializecpu(void) { switch (cpu) { #ifdef I486_CPU case CPU_BLUE: init_bluelightning(); break; case CPU_486DLC: init_486dlc(); break; case CPU_CY486DX: init_cy486dx(); break; case CPU_M1SC: init_5x86(); break; #ifdef CPU_I486_ON_386 case CPU_486: init_i486_on_386(); break; #endif case CPU_M1: init_6x86(); break; #endif /* I486_CPU */ #ifdef I586_CPU case CPU_586: switch (cpu_vendor_id) { case CPU_VENDOR_AMD: #ifdef CPU_WT_ALLOC if (((cpu_id & 0x0f0) > 0) && ((cpu_id & 0x0f0) < 0x60) && ((cpu_id & 0x00f) > 3)) enable_K5_wt_alloc(); else if (((cpu_id & 0x0f0) > 0x80) || (((cpu_id & 0x0f0) == 0x80) && (cpu_id & 0x00f) > 0x07)) enable_K6_2_wt_alloc(); else if ((cpu_id & 0x0f0) > 0x50) enable_K6_wt_alloc(); #endif if ((cpu_id & 0xf0) == 0xa0) /* * Make sure the TSC runs through * suspension, otherwise we can't use * it as timecounter */ wrmsr(0x1900, rdmsr(0x1900) | 0x20ULL); break; case CPU_VENDOR_CENTAUR: init_winchip(); break; case CPU_VENDOR_TRANSMETA: init_transmeta(); break; case CPU_VENDOR_RISE: init_rise(); break; } break; #endif #ifdef I686_CPU case CPU_M2: init_6x86MX(); break; case CPU_686: switch (cpu_vendor_id) { case CPU_VENDOR_INTEL: switch (cpu_id & 0xff0) { case 0x610: init_ppro(); break; case 0x660: init_mendocino(); break; } break; #ifdef CPU_ATHLON_SSE_HACK case CPU_VENDOR_AMD: /* * Sometimes the BIOS doesn't enable SSE instructions. * According to AMD document 20734, the mobile * Duron, the (mobile) Athlon 4 and the Athlon MP * support SSE. These correspond to cpu_id 0x66X * or 0x67X. */ if ((cpu_feature & CPUID_XMM) == 0 && ((cpu_id & ~0xf) == 0x660 || (cpu_id & ~0xf) == 0x670 || (cpu_id & ~0xf) == 0x680)) { u_int regs[4]; wrmsr(MSR_HWCR, rdmsr(MSR_HWCR) & ~0x08000); do_cpuid(1, regs); cpu_feature = regs[3]; } break; #endif case CPU_VENDOR_CENTAUR: init_via(); break; case CPU_VENDOR_TRANSMETA: init_transmeta(); break; } break; #endif default: break; } -#if defined(CPU_ENABLE_SSE) if ((cpu_feature & CPUID_XMM) && (cpu_feature & CPUID_FXSR)) { load_cr4(rcr4() | CR4_FXSR | CR4_XMM); cpu_fxsr = hw_instruction_sse = 1; } -#endif #if defined(PAE) || defined(PAE_TABLES) if ((amd_feature & AMDID_NX) != 0) { uint64_t msr; msr = rdmsr(MSR_EFER) | EFER_NXE; wrmsr(MSR_EFER, msr); pg_nx = PG_NX; elf32_nxstack = 1; } #endif } void initializecpucache(void) { /* * CPUID with %eax = 1, %ebx returns * Bits 15-8: CLFLUSH line size * (Value * 8 = cache line size in bytes) */ if ((cpu_feature & CPUID_CLFSH) != 0) cpu_clflush_line_size = ((cpu_procinfo >> 8) & 0xff) * 8; /* * XXXKIB: (temporary) hack to work around traps generated * when CLFLUSHing APIC register window under virtualization * environments. These environments tend to disable the * CPUID_SS feature even though the native CPU supports it. */ TUNABLE_INT_FETCH("hw.clflush_disable", &hw_clflush_disable); if (vm_guest != VM_GUEST_NO && hw_clflush_disable == -1) { cpu_feature &= ~CPUID_CLFSH; cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT; } /* * The kernel's use of CLFLUSH{,OPT} can be disabled manually * by setting the hw.clflush_disable tunable. */ if (hw_clflush_disable == 1) { cpu_feature &= ~CPUID_CLFSH; cpu_stdext_feature &= ~CPUID_STDEXT_CLFLUSHOPT; } } #if defined(I586_CPU) && defined(CPU_WT_ALLOC) /* * Enable write allocate feature of AMD processors. * Following two functions require the Maxmem variable being set. */ static void enable_K5_wt_alloc(void) { u_int64_t msr; register_t saveintr; /* * Write allocate is supported only on models 1, 2, and 3, with * a stepping of 4 or greater. */ if (((cpu_id & 0xf0) > 0) && ((cpu_id & 0x0f) > 3)) { saveintr = intr_disable(); msr = rdmsr(0x83); /* HWCR */ wrmsr(0x83, msr & !(0x10)); /* * We have to tell the chip where the top of memory is, * since video cards could have frame bufferes there, * memory-mapped I/O could be there, etc. */ if(Maxmem > 0) msr = Maxmem / 16; else msr = 0; msr |= AMD_WT_ALLOC_TME | AMD_WT_ALLOC_FRE; /* * There is no way to know wheter 15-16M hole exists or not. * Therefore, we disable write allocate for this range. */ wrmsr(0x86, 0x0ff00f0); msr |= AMD_WT_ALLOC_PRE; wrmsr(0x85, msr); msr=rdmsr(0x83); wrmsr(0x83, msr|0x10); /* enable write allocate */ intr_restore(saveintr); } } static void enable_K6_wt_alloc(void) { quad_t size; u_int64_t whcr; register_t saveintr; saveintr = intr_disable(); wbinvd(); #ifdef CPU_DISABLE_CACHE /* * Certain K6-2 box becomes unstable when write allocation is * enabled. */ /* * The AMD-K6 processer provides the 64-bit Test Register 12(TR12), * but only the Cache Inhibit(CI) (bit 3 of TR12) is suppported. * All other bits in TR12 have no effect on the processer's operation. * The I/O Trap Restart function (bit 9 of TR12) is always enabled * on the AMD-K6. */ wrmsr(0x0000000e, (u_int64_t)0x0008); #endif /* Don't assume that memory size is aligned with 4M. */ if (Maxmem > 0) size = ((Maxmem >> 8) + 3) >> 2; else size = 0; /* Limit is 508M bytes. */ if (size > 0x7f) size = 0x7f; whcr = (rdmsr(0xc0000082) & ~(0x7fLL << 1)) | (size << 1); #if defined(NO_MEMORY_HOLE) if (whcr & (0x7fLL << 1)) whcr |= 0x0001LL; #else /* * There is no way to know wheter 15-16M hole exists or not. * Therefore, we disable write allocate for this range. */ whcr &= ~0x0001LL; #endif wrmsr(0x0c0000082, whcr); intr_restore(saveintr); } static void enable_K6_2_wt_alloc(void) { quad_t size; u_int64_t whcr; register_t saveintr; saveintr = intr_disable(); wbinvd(); #ifdef CPU_DISABLE_CACHE /* * Certain K6-2 box becomes unstable when write allocation is * enabled. */ /* * The AMD-K6 processer provides the 64-bit Test Register 12(TR12), * but only the Cache Inhibit(CI) (bit 3 of TR12) is suppported. * All other bits in TR12 have no effect on the processer's operation. * The I/O Trap Restart function (bit 9 of TR12) is always enabled * on the AMD-K6. */ wrmsr(0x0000000e, (u_int64_t)0x0008); #endif /* Don't assume that memory size is aligned with 4M. */ if (Maxmem > 0) size = ((Maxmem >> 8) + 3) >> 2; else size = 0; /* Limit is 4092M bytes. */ if (size > 0x3fff) size = 0x3ff; whcr = (rdmsr(0xc0000082) & ~(0x3ffLL << 22)) | (size << 22); #if defined(NO_MEMORY_HOLE) if (whcr & (0x3ffLL << 22)) whcr |= 1LL << 16; #else /* * There is no way to know wheter 15-16M hole exists or not. * Therefore, we disable write allocate for this range. */ whcr &= ~(1LL << 16); #endif wrmsr(0x0c0000082, whcr); intr_restore(saveintr); } #endif /* I585_CPU && CPU_WT_ALLOC */ #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(cyrixreg, cyrixreg) { register_t saveintr; u_int cr0; u_char ccr1, ccr2, ccr3; u_char ccr0 = 0, ccr4 = 0, ccr5 = 0, pcr0 = 0; cr0 = rcr0(); if (cpu_vendor_id == CPU_VENDOR_CYRIX) { saveintr = intr_disable(); if ((cpu != CPU_M1SC) && (cpu != CPU_CY486DX)) { ccr0 = read_cyrix_reg(CCR0); } ccr1 = read_cyrix_reg(CCR1); ccr2 = read_cyrix_reg(CCR2); ccr3 = read_cyrix_reg(CCR3); if ((cpu == CPU_M1SC) || (cpu == CPU_M1) || (cpu == CPU_M2)) { write_cyrix_reg(CCR3, CCR3_MAPEN0); ccr4 = read_cyrix_reg(CCR4); if ((cpu == CPU_M1) || (cpu == CPU_M2)) ccr5 = read_cyrix_reg(CCR5); else pcr0 = read_cyrix_reg(PCR0); write_cyrix_reg(CCR3, ccr3); /* Restore CCR3. */ } intr_restore(saveintr); if ((cpu != CPU_M1SC) && (cpu != CPU_CY486DX)) printf("CCR0=%x, ", (u_int)ccr0); printf("CCR1=%x, CCR2=%x, CCR3=%x", (u_int)ccr1, (u_int)ccr2, (u_int)ccr3); if ((cpu == CPU_M1SC) || (cpu == CPU_M1) || (cpu == CPU_M2)) { printf(", CCR4=%x, ", (u_int)ccr4); if (cpu == CPU_M1SC) printf("PCR0=%x\n", pcr0); else printf("CCR5=%x\n", ccr5); } } printf("CR0=%x\n", cr0); } #endif /* DDB */ Index: head/sys/i386/i386/machdep.c =================================================================== --- head/sys/i386/i386/machdep.c (revision 313153) +++ head/sys/i386/i386/machdep.c (revision 313154) @@ -1,3137 +1,3094 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_atpic.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" -#include "opt_npx.h" #include "opt_perfmon.h" #include "opt_platform.h" #include "opt_xbox.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! #endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PERFMON #include #endif #ifdef SMP #include #endif #ifdef FDT #include #endif #ifdef DEV_APIC #include #endif #ifdef DEV_ISA #include #endif #ifdef XBOX #include int arch_i386_is_xbox = 0; uint32_t arch_i386_xbox_memsize = 0; #endif /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); extern register_t init386(int first); extern void dblfault_handler(void); -#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) -#define CPU_ENABLE_SSE -#endif - static void cpu_startup(void *); static void fpstate_drop(struct thread *td); static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len); static int set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, size_t xfpustate_len); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); /* Intel ICH registers */ #define ICH_PMBASE 0x400 #define ICH_SMI_EN ICH_PMBASE + 0x30 int _udatasel, _ucodesel; u_int basemem; int cold = 1; #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif long Maxmem = 0; long realmem = 0; #ifdef PAE FEATURE(pae, "Physical Address Extensions"); #endif /* * The number of PHYSMAP entries must be one less than the number of * PHYSSEG entries because the PHYSMAP entry that spans the largest * physical address that is accessible by ISA DMA is split into two * PHYSSEG entries. */ #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1)) vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2) #define DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; struct pcpu __pcpu[MAXCPU]; struct mtx icu_lock; struct mem_range_softc mem_range_softc; /* Default init_ops implementation. */ struct init_ops init_ops = { .early_clock_source_init = i8254_init, .early_delay = i8254_delay, #ifdef DEV_APIC .msi_init = msi_init, #endif }; static void cpu_startup(dummy) void *dummy; { uintmax_t memsize; char *sysenv; /* * On MacBooks, we need to disallow the legacy USB circuit to * generate an SMI# because this can cause several problems, * namely: incorrect CPU frequency detection and failure to * start the APs. * We do this by disabling a bit in the SMI_EN (SMI Control and * Enable register) of the Intel ICH LPC Interface Bridge. */ sysenv = kern_getenv("smbios.system.product"); if (sysenv != NULL) { if (strncmp(sysenv, "MacBook1,1", 10) == 0 || strncmp(sysenv, "MacBook3,1", 10) == 0 || strncmp(sysenv, "MacBook4,1", 10) == 0 || strncmp(sysenv, "MacBookPro1,1", 13) == 0 || strncmp(sysenv, "MacBookPro1,2", 13) == 0 || strncmp(sysenv, "MacBookPro3,1", 13) == 0 || strncmp(sysenv, "MacBookPro4,1", 13) == 0 || strncmp(sysenv, "Macmini1,1", 10) == 0) { if (bootverbose) printf("Disabling LEGACY_USB_EN bit on " "Intel ICH.\n"); outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8); } freeenv(sysenv); } /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif /* * Display physical memory if SMBIOS reports reasonable amount. */ memsize = 0; sysenv = kern_getenv("smbios.memory.enabled"); if (sysenv != NULL) { memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10; freeenv(sysenv); } if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count)) memsize = ptoa((uintmax_t)Maxmem); printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20); realmem = atop(memsize); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", ptoa((uintmax_t)vm_cnt.v_free_count), ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by call * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct osigframe sf, *fp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct osigframe *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct osigframe)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else fp = (struct osigframe *)regs->tf_esp - 1; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = ksi->ksi_code; sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; sf.sf_addr = 0; } else { /* Old FreeBSD-style arguments. */ sf.sf_arg2 = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Save most if not all of trap frame. */ sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = rgs(); sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_siginfo.si_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* See sendsig() for comments. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(*fp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)fp; if (p->p_sysent->sv_sigcode_base != 0) { regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - szosigcode; } else { /* a.out sysentvec does not use shared page */ regs->tf_eip = p->p_sysent->sv_psstrings - szosigcode; } regs->tf_eflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe4 sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); bzero(sf.sf_uc.uc_mcontext.mc_fpregs, sizeof(sf.sf_uc.uc_mcontext.mc_fpregs)); bzero(sf.sf_uc.uc_mcontext.__spare__, sizeof(sf.sf_uc.uc_mcontext.__spare__)); bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe4 *)((uintptr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe4)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sfp = (struct sigframe4 *)regs->tf_esp - 1; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = ksi->ksi_code; sf.sf_si.si_addr = ksi->ksi_addr; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = p->p_sysent->sv_sigcode_base + szsigcode - szfreebsd4_sigcode; regs->tf_eflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_FREEBSD4 */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; struct segment_descriptor *sdp; char *xfpusave; size_t xfpusave_len; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); #ifdef COMPAT_FREEBSD4 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { freebsd4_sendsig(catcher, ksi, mask); return; } #endif #ifdef COMPAT_43 if (SIGISMEMBER(psp->ps_osigset, sig)) { osendsig(catcher, ksi, mask); return; } #endif regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); -#ifdef CPU_ENABLE_SSE if (cpu_max_ext_state_size > sizeof(union savefpu) && use_xsave) { xfpusave_len = cpu_max_ext_state_size - sizeof(union savefpu); xfpusave = __builtin_alloca(xfpusave_len); } else { -#else - { -#endif xfpusave_len = 0; xfpusave = NULL; } /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len); fpstate_drop(td); /* * Unconditionally fill the fsbase and gsbase into the mcontext. */ sdp = &td->td_pcb->pcb_fsd; sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; sdp = &td->td_pcb->pcb_gsd; sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; bzero(sf.sf_uc.uc_mcontext.mc_spare2, sizeof(sf.sf_uc.uc_mcontext.mc_spare2)); bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size; #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_esp - 128; if (xfpusave != NULL) { sp -= xfpusave_len; sp = (char *)((unsigned int)sp & ~0x3F); sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp; } sp -= sizeof(struct sigframe); /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned int)sp & ~0xF); /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; bzero(&sf.sf_si, sizeof(sf.sf_si)); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0 || (xfpusave != NULL && copyout(xfpusave, (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len) != 0)) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = p->p_sysent->sv_sigcode_base; if (regs->tf_eip == 0) regs->tf_eip = p->p_sysent->sv_psstrings - szsigcode; regs->tf_eflags &= ~(PSL_T | PSL_D); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ #ifdef COMPAT_43 int osigreturn(td, uap) struct thread *td; struct osigreturn_args /* { struct osigcontext *sigcntxp; } */ *uap; { struct osigcontext sc; struct trapframe *regs; struct osigcontext *scp; int eflags, error; ksiginfo_t ksi; regs = td->td_frame; error = copyin(uap->sigcntxp, &sc, sizeof(sc)); if (error != 0) return (error); scp = ≻ eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(eflags, regs->tf_eflags)) { return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ if (!CS_SECURE(scp->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; regs->tf_fs = scp->sc_fs; } /* Restore remaining registers. */ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; #if defined(COMPAT_43) if (scp->sc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL, SIGPROCMASK_OLD); return (EJUSTRETURN); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 /* * MPSAFE */ int freebsd4_sigreturn(td, uap) struct thread *td; struct freebsd4_sigreturn_args /* { const ucontext4 *sigcntxp; } */ *uap; { struct ucontext4 uc; struct trapframe *regs; struct ucontext4 *ucp; int cs, eflags, error; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(eflags, regs->tf_eflags)) { uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n", td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n", td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); return (EJUSTRETURN); } #endif /* COMPAT_FREEBSD4 */ /* * MPSAFE */ int sys_sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; struct proc *p; struct trapframe *regs; ucontext_t *ucp; char *xfpustate; size_t xfpustate_len; int cs, eflags, error, ret; ksiginfo_t ksi; p = td->td_proc; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) { uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid, td->td_name, ucp->uc_mcontext.mc_flags); return (EINVAL); } regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ if (!EFL_SECURE(eflags, regs->tf_eflags)) { uprintf("pid %d (%s): sigreturn eflags = 0x%x\n", td->td_proc->p_pid, td->td_name, eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { uprintf("pid %d (%s): sigreturn cs = 0x%x\n", td->td_proc->p_pid, td->td_name, cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) { xfpustate_len = uc.uc_mcontext.mc_xfpustate_len; if (xfpustate_len > cpu_max_ext_state_size - sizeof(union savefpu)) { uprintf( "pid %d (%s): sigreturn xfpusave_len = 0x%zx\n", p->p_pid, td->td_name, xfpustate_len); return (EINVAL); } xfpustate = __builtin_alloca(xfpustate_len); error = copyin((const void *)uc.uc_mcontext.mc_xfpustate, xfpustate, xfpustate_len); if (error != 0) { uprintf( "pid %d (%s): sigreturn copying xfpustate failed\n", p->p_pid, td->td_name); return (error); } } else { xfpustate = NULL; xfpustate_len = 0; } ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len); if (ret != 0) return (ret); bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0); return (EJUSTRETURN); } /* * Reset registers to default values on exec. */ void exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ pcb->pcb_gs = _udatasel; load_gs(_udatasel); mtx_lock_spin(&dt_lock); if (td->td_proc->p_md.md_ldt) user_ldt_free(td); else mtx_unlock_spin(&dt_lock); bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = imgp->entry_addr; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = imgp->ps_strings; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == curpcb) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } pcb->pcb_initial_npxcw = __INITIAL_NPXCW__; /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); /* * XXX - Linux emulator * Make sure sure edx is 0x0 on entry. Linux binaries depend * on it. */ td->td_retval[1] = 0; } void cpu_setregs(void) { unsigned int cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: * * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT * instructions. We must set the CR0_MP bit and use the CR0_TS * bit to control the trap, because setting the CR0_EM bit does * not cause WAIT instructions to trap. It's important to trap * WAIT instructions - otherwise the "wait" variants of no-wait * control instructions would degenerate to the "no-wait" variants * after FP context switches but work correctly otherwise. It's * particularly important to trap WAITs when there is no NPX - * otherwise the "wait" variants would always degenerate. * * Try setting CR0_NE to get correct error reporting on 486DX's. * Setting it should fail or do nothing on lesser processors. */ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); load_gs(_udatasel); } u_long bootdev; /* not a struct cdev *- encoding is different */ SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); static char bootmethod[16] = "BIOS"; SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0, "System firmware boot method"); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ struct region_descriptor r_gdt, r_idt; /* table descriptors */ struct mtx dt_lock; /* lock for GDT and LDT */ static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern vm_offset_t proc0kstack; /* * software prototypes -- in more palatable form. * * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = SEL_KPL, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUFS_SEL 2 %fs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUGS_SEL 3 %gs Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GCODE_SEL 4 Code Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GDATA_SEL 5 Data Descriptor for kernel */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUCODE_SEL 6 Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GUDATA_SEL 7 Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ { .ssd_base = 0x400, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_KPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { .ssd_base = 0x0, .ssd_limit = sizeof(struct i386tss)-1, .ssd_type = SDT_SYS386TSS, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GLDT_SEL 10 LDT Descriptor */ { .ssd_base = (int) ldt, .ssd_limit = sizeof(ldt)-1, .ssd_type = SDT_SYSLDT, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GUSERLDT_SEL 11 User LDT Descriptor per process */ { .ssd_base = (int) ldt, .ssd_limit = (512 * sizeof(union descriptor)-1), .ssd_type = SDT_SYSLDT, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GPANIC_SEL 12 Panic Tss Descriptor */ { .ssd_base = (int) &dblfault_tss, .ssd_limit = sizeof(struct i386tss)-1, .ssd_type = SDT_SYS386TSS, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ { .ssd_base = 0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = 0, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 1 }, /* GNDIS_SEL 18 NDIS Descriptor */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Code Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMERA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, /* Null Descriptor - overwritten by call gate */ { .ssd_base = 0x0, .ssd_limit = 0x0, .ssd_type = 0, .ssd_dpl = 0, .ssd_p = 0, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 0, .ssd_gran = 0 }, /* Data Descriptor for user */ { .ssd_base = 0x0, .ssd_limit = 0xfffff, .ssd_type = SDT_MEMRWA, .ssd_dpl = SEL_UPL, .ssd_p = 1, .ssd_xx = 0, .ssd_xx1 = 0, .ssd_def32 = 1, .ssd_gran = 1 }, }; void setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), #ifdef KDTRACE_HOOKS IDTVEC(dtrace_ret), #endif #ifdef XENHVM IDTVEC(xen_intr_upcall), #endif IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); #ifdef DDB /* * Display the index and function name of any IDT entries that don't use * the default 'rsvd' entry point. */ DB_SHOW_COMMAND(idt, db_show_idt) { struct gate_descriptor *ip; int idx; uintptr_t func; ip = idt; for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { func = (ip->gd_hioffset << 16 | ip->gd_looffset); if (func != (uintptr_t)&IDTVEC(rsvd)) { db_printf("%3d\t", idx); db_printsym(func, DB_STGY_PROC); db_printf("\n"); } ip++; } } /* Show privileged registers. */ DB_SHOW_COMMAND(sysregs, db_show_sysregs) { uint64_t idtr, gdtr; idtr = ridt(); db_printf("idtr\t0x%08x/%04x\n", (u_int)(idtr >> 16), (u_int)idtr & 0xffff); gdtr = rgdt(); db_printf("gdtr\t0x%08x/%04x\n", (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff); db_printf("ldtr\t0x%04x\n", rldt()); db_printf("tr\t0x%04x\n", rtr()); db_printf("cr0\t0x%08x\n", rcr0()); db_printf("cr2\t0x%08x\n", rcr2()); db_printf("cr3\t0x%08x\n", rcr3()); db_printf("cr4\t0x%08x\n", rcr4()); if (rcr4() & CR4_XSAVE) db_printf("xcr0\t0x%016llx\n", rxcr(0)); if (amd_feature & (AMDID_NX | AMDID_LM)) db_printf("EFER\t0x%016llx\n", rdmsr(MSR_EFER)); if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX)) db_printf("FEATURES_CTL\t0x%016llx\n", rdmsr(MSR_IA32_FEATURE_CONTROL)); if ((cpu_vendor_id == CPU_VENDOR_INTEL || cpu_vendor_id == CPU_VENDOR_AMD) && CPUID_TO_FAMILY(cpu_id) >= 6) db_printf("DEBUG_CTL\t0x%016llx\n", rdmsr(MSR_DEBUGCTLMSR)); if (cpu_feature & CPUID_PAT) db_printf("PAT\t0x%016llx\n", rdmsr(MSR_PAT)); } DB_SHOW_COMMAND(dbregs, db_show_dbregs) { db_printf("dr0\t0x%08x\n", rdr0()); db_printf("dr1\t0x%08x\n", rdr1()); db_printf("dr2\t0x%08x\n", rdr2()); db_printf("dr3\t0x%08x\n", rdr3()); db_printf("dr6\t0x%08x\n", rdr6()); db_printf("dr7\t0x%08x\n", rdr7()); } #endif void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } static int add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap, int *physmap_idxp) { int i, insert_idx, physmap_idx; physmap_idx = *physmap_idxp; if (length == 0) return (1); #ifndef PAE if (base > 0xffffffff) { printf("%uK of memory above 4GB ignored\n", (u_int)(length / 1024)); return (1); } #endif /* * Find insertion point while checking for overlap. Start off by * assuming the new entry will be added to the end. */ insert_idx = physmap_idx + 2; for (i = 0; i <= physmap_idx; i += 2) { if (base < physmap[i + 1]) { if (base + length <= physmap[i]) { insert_idx = i; break; } if (boothowto & RB_VERBOSE) printf( "Overlapping memory regions, ignoring second region\n"); return (1); } } /* See if we can prepend to the next entry. */ if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) { physmap[insert_idx] = base; return (1); } /* See if we can append to the previous entry. */ if (insert_idx > 0 && base == physmap[insert_idx - 1]) { physmap[insert_idx - 1] += length; return (1); } physmap_idx += 2; *physmap_idxp = physmap_idx; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); return (0); } /* * Move the last 'N' entries down to make room for the new * entry if needed. */ for (i = physmap_idx; i > insert_idx; i -= 2) { physmap[i] = physmap[i - 2]; physmap[i + 1] = physmap[i - 1]; } /* Insert the new entry. */ physmap[insert_idx] = base; physmap[insert_idx + 1] = base + length; return (1); } static int add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp) { if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016llx len=%016llx\n", smap->type, smap->base, smap->length); if (smap->type != SMAP_TYPE_MEMORY) return (1); return (add_physmap_entry(smap->base, smap->length, physmap, physmap_idxp)); } static void add_smap_entries(struct bios_smap *smapbase, vm_paddr_t *physmap, int *physmap_idxp) { struct bios_smap *smap, *smapend; u_int32_t smapsize; /* * Memory map from INT 15:E820. * * subr_module.c says: * "Consumer may safely assume that size value precedes data." * ie: an int32_t immediately precedes SMAP. */ smapsize = *((u_int32_t *)smapbase - 1); smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); for (smap = smapbase; smap < smapend; smap++) if (!add_smap_entry(smap, physmap, physmap_idxp)) break; } static void basemem_setup(void) { vm_paddr_t pa; pt_entry_t *pte; int i; if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * XXX if biosbasemem is now < 640, there is a `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from PAGE_SIZE to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. */ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) pmap_kenter(KERNBASE + pa, pa); /* * Map pages between basemem and ISA_HOLE_START, if any, r/w into * the vm86 page table so that vm86 can scribble on them using * the vm86 map too. XXX: why 2 ways for this and only 1 way for * page 0, at least as initialized here? */ pte = (pt_entry_t *)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; } /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * If we cannot accurately determine the physical memory map, then use * value from the 0xE801 call, and failing that, the RTC. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(int first) { int has_smap, off, physmap_idx, pa_indx, da_indx; u_long memtest; vm_paddr_t physmap[PHYSMAP_SIZE]; pt_entry_t *pte; quad_t dcons_addr, dcons_size, physmem_tunable; int hasbrokenint12, i, res; u_int extmem; struct vm86frame vmf; struct vm86context vmc; vm_paddr_t pa; struct bios_smap *smap, *smapbase; caddr_t kmdp; has_smap = 0; #ifdef XBOX if (arch_i386_is_xbox) { /* * We queried the memory size before, so chop off 4MB for * the framebuffer and inform the OS of this. */ physmap[0] = 0; physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE; physmap_idx = 0; goto physmap_done; } #endif bzero(&vmf, sizeof(vmf)); bzero(physmap, sizeof(physmap)); basemem = 0; /* * Check if the loader supplied an SMAP memory map. If so, * use that and do not make any VM86 calls. */ physmap_idx = 0; kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf32 kernel"); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase != NULL) { add_smap_entries(smapbase, physmap, &physmap_idx); has_smap = 1; goto have_smap; } /* * Some newer BIOSes have a broken INT 12H implementation * which causes a kernel panic immediately. In this case, we * need use the SMAP to determine the base memory size. */ hasbrokenint12 = 0; TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12); if (hasbrokenint12 == 0) { /* Use INT12 to determine base memory size. */ vm86_intcall(0x12, &vmf); basemem = vmf.vmf_ax; basemem_setup(); } /* * Fetch the memory map with INT 15:E820. Map page 1 R/W into * the kernel page table so we can use it as a buffer. The * kernel will unmap this page later. */ pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT); vmc.npages = 0; smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); res = vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); KASSERT(res != 0, ("vm86_getptr() failed: address not found")); vmf.vmf_ebx = 0; do { vmf.vmf_eax = 0xE820; vmf.vmf_edx = SMAP_SIG; vmf.vmf_ecx = sizeof(struct bios_smap); i = vm86_datacall(0x15, &vmf, &vmc); if (i || vmf.vmf_eax != SMAP_SIG) break; has_smap = 1; if (!add_smap_entry(smap, physmap, &physmap_idx)) break; } while (vmf.vmf_ebx != 0); have_smap: /* * If we didn't fetch the "base memory" size from INT12, * figure it out from the SMAP (or just guess). */ if (basemem == 0) { for (i = 0; i <= physmap_idx; i += 2) { if (physmap[i] == 0x00000000) { basemem = physmap[i + 1] / 1024; break; } } /* XXX: If we couldn't find basemem from SMAP, just guess. */ if (basemem == 0) basemem = 640; basemem_setup(); } if (physmap[1] != 0) goto physmap_done; /* * If we failed to find an SMAP, figure out the extended * memory size. We will then build a simple memory map with * two segments, one for "base memory" and the second for * "extended memory". Note that "extended memory" starts at a * physical address of 1MB and that both basemem and extmem * are in units of 1KB. * * First, try to fetch the extended memory size via INT 15:E801. */ vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { /* * If INT15:E801 fails, this is our last ditch effort * to determine the extended memory size. Currently * we prefer the RTC value over INT15:88. */ #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; physmap_done: /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1]); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. * * This is especially confusing when it is much larger than the * memory size and is displayed as "realmem". */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_QUAD_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); /* * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend * the amount of memory in the system. */ if (has_smap && Maxmem > atop(physmap[physmap_idx + 1])) Maxmem = atop(physmap[physmap_idx + 1]); /* * By default enable the memory test on real hardware, and disable * it if we appear to be running in a VM. This avoids touching all * pages unnecessarily, which doesn't matter on real hardware but is * bad for shared VM hosts. Use a general name so that * one could eventually do more with the code than just disable it. */ memtest = (vm_guest > VM_GUEST_NO) ? 0 : 1; TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* * If Maxmem has been increased beyond what the system has detected, * extend the last memory segment to the new limit. */ if (atop(physmap[physmap_idx + 1]) < Maxmem) physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; pte = CMAP3; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr = (int *)CADDR3; full = FALSE; /* * block out kernel memory as not available. */ if (pa >= KERNLOAD && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; if (memtest == 0) goto skip_memtest; /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_N; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; skip_memtest: /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == DUMP_AVAIL_ARRAY_END) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(msgbufsize) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(msgbufsize); /* Map the message buffer. */ for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + off); } static void i386_kdb_init(void) { #ifdef DDB db_fetch_ksymtab(bootinfo.bi_symtab, bootinfo.bi_esymtab); #endif kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger"); #endif } register_t init386(int first) { struct gate_descriptor *gdp; int gsel_tss, metadata_missing, x, pa; struct pcpu *pc; -#ifdef CPU_ENABLE_SSE struct xstate_hdr *xhdr; -#endif int late_console; thread0.td_kstack = proc0kstack; thread0.td_kstack_pages = TD0_KSTACK_PAGES; /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. */ proc_linkup0(&proc0, &thread0); metadata_missing = 0; if (bootinfo.bi_modulep) { preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; preload_bootstrap_relocate(KERNBASE); } else { metadata_missing = 1; } if (bootinfo.bi_envp != 0) init_static_kenv((char *)bootinfo.bi_envp + KERNBASE, 0); else init_static_kenv(NULL, 0); /* Init basic tunables, hz etc */ init_param1(); /* * Make gdt memory segments. All segments cover the full 4GB * of address space and permissions are enforced at page level. */ gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); pc = &__pcpu[0]; gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); gdt_segs[GPRIV_SEL].ssd_base = (int) pc; gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; for (x = 0; x < NGDT; x++) ssdtosd(&gdt_segs[x], &gdt[x].sd); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) gdt; mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN); lgdt(&r_gdt); pcpu_init(pc, 0, sizeof(struct pcpu)); for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE) pmap_kenter(pa + KERNBASE, pa); dpcpu_init((void *)(first + KERNBASE), 0); first += DPCPU_SIZE; PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); /* Non-late cninit() and printf() can be moved up to here. */ /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); /* make ldt memory segments */ ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); for (x = 0; x < nitems(ldt_segs); x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL , GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #ifdef KDTRACE_HOOKS setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #endif #ifdef XENHVM setidt(IDT_EVTCHN, &IDTVEC(xen_intr_upcall), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #endif r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); #ifdef XBOX /* * The following code queries the PCI ID of 0:0:0. For the XBOX, * This should be 0x10de / 0x02a5. * * This is exactly what Linux does. */ outl(0xcf8, 0x80000000); if (inl(0xcfc) == 0x02a510de) { arch_i386_is_xbox = 1; pic16l_setled(XBOX_LED_GREEN); /* * We are an XBOX, but we may have either 64MB or 128MB of * memory. The PCI host bridge should be programmed for this, * so we just query it. */ outl(0xcf8, 0x80000084); arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64; } #endif /* XBOX */ /* * Initialize the clock before the console so that console * initialization can use DELAY(). */ clock_init(); finishidentcpu(); /* Final stage of CPU initialization */ setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ initializecpucache(); /* pointer to selector slot for %fs/%gs */ PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); #if defined(PAE) || defined(PAE_TABLES) dblfault_tss.tss_cr3 = (int)IdlePDPT; #else dblfault_tss.tss_cr3 = (int)IdlePTD; #endif dblfault_tss.tss_eip = (int)dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); /* Initialize the tss (except for the final esp0) early for vm86. */ PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + thread0.td_kstack_pages * PAGE_SIZE - 16); PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); ltr(gsel_tss); /* Initialize the PIC early for vm86 calls. */ #ifdef DEV_ISA #ifdef DEV_ATPIC elcr_probe(); atpic_startup(); #else /* Reset and mask the atpics and leave them shut down. */ atpic_reset(); /* * Point the ICU spurious interrupt vectors at the APIC spurious * interrupt handler. */ setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #endif #endif /* * The console and kdb should be initialized even earlier than here, * but some console drivers don't work until after getmemsize(). * Default to late console initialization to support these drivers. * This loses mainly printf()s in getmemsize() and early debugging. */ late_console = 1; TUNABLE_INT_FETCH("debug.late_console", &late_console); if (!late_console) { cninit(); i386_kdb_init(); } vm86_initialize(); getmemsize(first); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ if (late_console) cninit(); if (metadata_missing) printf("WARNING: loader(8) metadata is missing!\n"); if (late_console) i386_kdb_init(); msgbufinit(msgbufp, msgbufsize); -#ifdef DEV_NPX npxinit(true); -#endif /* * Set up thread0 pcb after npxinit calculated pcb + fpu save * area size. Zero out the extended state header in fpu save * area. */ thread0.td_pcb = get_pcb_td(&thread0); bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size); -#ifdef CPU_ENABLE_SSE if (use_xsave) { xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) + 1); xhdr->xstate_bv = xsave_mask; } -#endif PCPU_SET(curpcb, thread0.td_pcb); /* Move esp0 in the tss to its final place. */ /* Note: -16 is so we can grow the trapframe if we came from vm86 */ PCPU_SET(common_tss.tss_esp0, (vm_offset_t)thread0.td_pcb - 16); gdt[GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; /* clear busy bit */ ltr(gsel_tss); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(lcall_syscall); gdp->gd_looffset = x; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = x >> 16; /* XXX does this work? */ /* XXX yes! */ ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; #if defined(PAE) || defined(PAE_TABLES) thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; #else thread0.td_pcb->pcb_cr3 = (int)IdlePTD; #endif thread0.td_pcb->pcb_ext = 0; thread0.td_frame = &proc0_tf; cpu_probe_amdc1e(); #ifdef FDT x86_init_fdt(); #endif /* Location of kernel stack for locore */ return ((register_t)thread0.td_pcb); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } static int smap_sysctl_handler(SYSCTL_HANDLER_ARGS) { struct bios_smap *smapbase; struct bios_smap_xattr smap; caddr_t kmdp; uint32_t *smapattr; int count, error, i; /* Retrieve the system memory map from the loader. */ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf32 kernel"); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase == NULL) return (0); smapattr = (uint32_t *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP_XATTR); count = *((u_int32_t *)smapbase - 1) / sizeof(*smapbase); error = 0; for (i = 0; i < count; i++) { smap.base = smapbase[i].base; smap.length = smapbase[i].length; smap.type = smapbase[i].type; if (smapattr != NULL) smap.xattr = smapattr[i]; else smap.xattr = 0; error = SYSCTL_OUT(req, &smap, sizeof(smap)); } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0, smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data"); void spinlock_enter(void) { struct thread *td; register_t flags; td = curthread; if (td->td_md.md_spinlock_count == 0) { flags = intr_disable(); td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = flags; } else td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; register_t flags; td = curthread; critical_exit(); flags = td->td_md.md_saved_flags; td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(flags); } #if defined(I586_CPU) && !defined(NO_F00F_HACK) static void f00f_hack(void *unused); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); static void f00f_hack(void *unused) { struct gate_descriptor *new_idt; vm_offset_t tmp; if (!has_f00f_bug) return; GIANT_REQUIRED; printf("Intel Pentium detected, installing workaround for F00F bug\n"); tmp = kmem_malloc(kernel_arena, PAGE_SIZE * 2, M_WAITOK | M_ZERO); if (tmp == 0) panic("kmem_malloc returned 0"); /* Put the problematic entry (#6) at the end of the lower page. */ new_idt = (struct gate_descriptor*) (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor)); bcopy(idt, new_idt, sizeof(idt0)); r_idt.rd_base = (u_int)new_idt; lidt(&r_idt); idt = new_idt; pmap_protect(kernel_pmap, tmp, tmp + PAGE_SIZE, VM_PROT_READ); } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_edi = tf->tf_edi; pcb->pcb_esi = tf->tf_esi; pcb->pcb_ebp = tf->tf_ebp; pcb->pcb_ebx = tf->tf_ebx; pcb->pcb_eip = tf->tf_eip; pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8; pcb->pcb_gs = rgs(); } int ptrace_set_pc(struct thread *td, u_long addr) { td->td_frame->tf_eip = addr; return (0); } int ptrace_single_step(struct thread *td) { td->td_frame->tf_eflags |= PSL_T; return (0); } int ptrace_clear_single_step(struct thread *td) { td->td_frame->tf_eflags &= ~PSL_T; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct pcb *pcb; struct trapframe *tp; tp = td->td_frame; pcb = td->td_pcb; regs->r_gs = pcb->pcb_gs; return (fill_frame_regs(tp, regs)); } int fill_frame_regs(struct trapframe *tp, struct reg *regs) { regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; return (0); } int set_regs(struct thread *td, struct reg *regs) { struct pcb *pcb; struct trapframe *tp; tp = td->td_frame; if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); pcb = td->td_pcb; tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb->pcb_gs = regs->r_gs; return (0); } int fill_fpregs(struct thread *td, struct fpreg *fpregs) { KASSERT(td == curthread || TD_IS_SUSPENDED(td) || P_SHOULDSTOP(td->td_proc), ("not suspended thread %p", td)); -#ifdef DEV_NPX npxgetregs(td); -#else - bzero(fpregs, sizeof(*fpregs)); -#endif -#ifdef CPU_ENABLE_SSE if (cpu_fxsr) npx_fill_fpregs_xmm(&get_pcb_user_save_td(td)->sv_xmm, (struct save87 *)fpregs); else -#endif /* CPU_ENABLE_SSE */ bcopy(&get_pcb_user_save_td(td)->sv_87, fpregs, sizeof(*fpregs)); return (0); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { -#ifdef CPU_ENABLE_SSE if (cpu_fxsr) npx_set_fpregs_xmm((struct save87 *)fpregs, &get_pcb_user_save_td(td)->sv_xmm); else -#endif /* CPU_ENABLE_SSE */ bcopy(fpregs, &get_pcb_user_save_td(td)->sv_87, sizeof(*fpregs)); -#ifdef DEV_NPX npxuserinited(td); -#endif return (0); } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct trapframe *tp; struct segment_descriptor *sdp; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_esp); PROC_UNLOCK(curthread->td_proc); mcp->mc_gs = td->td_pcb->pcb_gs; mcp->mc_fs = tp->tf_fs; mcp->mc_es = tp->tf_es; mcp->mc_ds = tp->tf_ds; mcp->mc_edi = tp->tf_edi; mcp->mc_esi = tp->tf_esi; mcp->mc_ebp = tp->tf_ebp; mcp->mc_isp = tp->tf_isp; mcp->mc_eflags = tp->tf_eflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_eax = 0; mcp->mc_edx = 0; mcp->mc_eflags &= ~PSL_C; } else { mcp->mc_eax = tp->tf_eax; mcp->mc_edx = tp->tf_edx; } mcp->mc_ebx = tp->tf_ebx; mcp->mc_ecx = tp->tf_ecx; mcp->mc_eip = tp->tf_eip; mcp->mc_cs = tp->tf_cs; mcp->mc_esp = tp->tf_esp; mcp->mc_ss = tp->tf_ss; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp, NULL, 0); sdp = &td->td_pcb->pcb_fsd; mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; sdp = &td->td_pcb->pcb_gsd; mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase; mcp->mc_flags = 0; mcp->mc_xfpustate = 0; mcp->mc_xfpustate_len = 0; bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2)); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ int set_mcontext(struct thread *td, mcontext_t *mcp) { struct trapframe *tp; char *xfpustate; int eflags, ret; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp) || (mcp->mc_flags & ~_MC_FLAG_MASK) != 0) return (EINVAL); eflags = (mcp->mc_eflags & PSL_USERCHANGE) | (tp->tf_eflags & ~PSL_USERCHANGE); if (mcp->mc_flags & _MC_HASFPXSTATE) { if (mcp->mc_xfpustate_len > cpu_max_ext_state_size - sizeof(union savefpu)) return (EINVAL); xfpustate = __builtin_alloca(mcp->mc_xfpustate_len); ret = copyin((void *)mcp->mc_xfpustate, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); } else xfpustate = NULL; ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len); if (ret != 0) return (ret); tp->tf_fs = mcp->mc_fs; tp->tf_es = mcp->mc_es; tp->tf_ds = mcp->mc_ds; tp->tf_edi = mcp->mc_edi; tp->tf_esi = mcp->mc_esi; tp->tf_ebp = mcp->mc_ebp; tp->tf_ebx = mcp->mc_ebx; tp->tf_edx = mcp->mc_edx; tp->tf_ecx = mcp->mc_ecx; tp->tf_eax = mcp->mc_eax; tp->tf_eip = mcp->mc_eip; tp->tf_eflags = eflags; tp->tf_esp = mcp->mc_esp; tp->tf_ss = mcp->mc_ss; td->td_pcb->pcb_gs = mcp->mc_gs; return (0); } static void get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave, size_t xfpusave_len) { -#ifdef CPU_ENABLE_SSE size_t max_len, len; -#endif -#ifndef DEV_NPX - mcp->mc_fpformat = _MC_FPFMT_NODEV; - mcp->mc_ownedfp = _MC_FPOWNED_NONE; - bzero(mcp->mc_fpstate, sizeof(mcp->mc_fpstate)); -#else mcp->mc_ownedfp = npxgetregs(td); bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0], sizeof(mcp->mc_fpstate)); mcp->mc_fpformat = npxformat(); -#ifdef CPU_ENABLE_SSE if (!use_xsave || xfpusave_len == 0) return; max_len = cpu_max_ext_state_size - sizeof(union savefpu); len = xfpusave_len; if (len > max_len) { len = max_len; bzero(xfpusave + max_len, len - max_len); } mcp->mc_flags |= _MC_HASFPXSTATE; mcp->mc_xfpustate_len = len; bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len); -#endif -#endif } static int set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate, size_t xfpustate_len) { union savefpu *fpstate; int error; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_387 && mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) { /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); error = 0; } else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { -#ifdef DEV_NPX fpstate = (union savefpu *)&mcp->mc_fpstate; -#ifdef CPU_ENABLE_SSE if (cpu_fxsr) fpstate->sv_xmm.sv_env.en_mxcsr &= cpu_mxcsr_mask; -#endif error = npxsetregs(td, fpstate, xfpustate, xfpustate_len); -#else - error = EINVAL; -#endif } else return (EINVAL); return (error); } static void fpstate_drop(struct thread *td) { KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu")); critical_enter(); -#ifdef DEV_NPX if (PCPU_GET(fpcurthread) == td) npxdrop(); -#endif /* * XXX force a full drop of the npx. The above only drops it if we * owned it. npxgetregs() has the same bug in the !cpu_fxsr case. * * XXX I don't much like npxgetregs()'s semantics of doing a full * drop. Dropping only to the pcb matches fnsave's behaviour. * We only need to drop to !PCB_INITDONE in sendsig(). But * sendsig() is the only caller of npxgetregs()... perhaps we just * have too many layers. */ curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE | PCB_NPXUSERINITDONE); critical_exit(); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; if (td == NULL) { dbregs->dr[0] = rdr0(); dbregs->dr[1] = rdr1(); dbregs->dr[2] = rdr2(); dbregs->dr[3] = rdr3(); dbregs->dr[4] = rdr4(); dbregs->dr[5] = rdr5(); dbregs->dr[6] = rdr6(); dbregs->dr[7] = rdr7(); } else { pcb = td->td_pcb; dbregs->dr[0] = pcb->pcb_dr0; dbregs->dr[1] = pcb->pcb_dr1; dbregs->dr[2] = pcb->pcb_dr2; dbregs->dr[3] = pcb->pcb_dr3; dbregs->dr[4] = 0; dbregs->dr[5] = 0; dbregs->dr[6] = pcb->pcb_dr6; dbregs->dr[7] = pcb->pcb_dr7; } return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; int i; if (td == NULL) { load_dr0(dbregs->dr[0]); load_dr1(dbregs->dr[1]); load_dr2(dbregs->dr[2]); load_dr3(dbregs->dr[3]); load_dr4(dbregs->dr[4]); load_dr5(dbregs->dr[5]); load_dr6(dbregs->dr[6]); load_dr7(dbregs->dr[7]); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP. */ for (i = 0; i < 4; i++) { if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) return (EINVAL); if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02) return (EINVAL); } pcb = td->td_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { /* dr0 is enabled */ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { /* dr1 is enabled */ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { /* dr2 is enabled */ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { /* dr3 is enabled */ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) return (EINVAL); } pcb->pcb_dr0 = dbregs->dr[0]; pcb->pcb_dr1 = dbregs->dr[1]; pcb->pcb_dr2 = dbregs->dr[2]; pcb->pcb_dr3 = dbregs->dr[3]; pcb->pcb_dr6 = dbregs->dr[6]; pcb->pcb_dr7 = dbregs->dr[7]; pcb->pcb_flags |= PCB_DBREGS; } return (0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. */ int user_dbreg_trap(void) { u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int32_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i = 0; i < nbp; i++) { if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { /* * addr[i] is in user space */ return nbp; } } /* * None of the breakpoints are in user space. */ return 0; } #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only available as * inline functions, thus cannot be called from the debugger. */ /* silence compiler warnings */ u_char inb_(u_short); void outb_(u_short, u_char); u_char inb_(u_short port) { return inb(port); } void outb_(u_short port, u_char data) { outb(port, data); } #endif /* KDB */ Index: head/sys/i386/i386/mp_machdep.c =================================================================== --- head/sys/i386/i386/mp_machdep.c (revision 313153) +++ head/sys/i386/i386/mp_machdep.c (revision 313154) @@ -1,472 +1,469 @@ /*- * Copyright (c) 1996, by Steve Passe * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_cpu.h" #include "opt_kstack_pages.h" #include "opt_pmap.h" #include "opt_sched.h" #include "opt_smp.h" #if !defined(lint) #if !defined(SMP) #error How did you get here? #endif #ifndef DEV_APIC #error The apic device is required for SMP, add "device apic" to your config file. #endif -#if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT) -#error SMP not supported with CPU_DISABLE_CMPXCHG -#endif #endif /* not lint */ #include #include #include #include /* cngetc() */ #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) /* * this code MUST be enabled here and in mpboot.s. * it follows the very early stages of AP boot by placing values in CMOS ram. * it NORMALLY will never be needed and thus the primitive method for enabling. * #define CHECK_POINTS */ #if defined(CHECK_POINTS) #define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA)) #define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D))) #define CHECK_INIT(D); \ CHECK_WRITE(0x34, (D)); \ CHECK_WRITE(0x35, (D)); \ CHECK_WRITE(0x36, (D)); \ CHECK_WRITE(0x37, (D)); \ CHECK_WRITE(0x38, (D)); \ CHECK_WRITE(0x39, (D)); #define CHECK_PRINT(S); \ printf("%s: %d, %d, %d, %d, %d, %d\n", \ (S), \ CHECK_READ(0x34), \ CHECK_READ(0x35), \ CHECK_READ(0x36), \ CHECK_READ(0x37), \ CHECK_READ(0x38), \ CHECK_READ(0x39)); #else /* CHECK_POINTS */ #define CHECK_INIT(D) #define CHECK_PRINT(S) #define CHECK_WRITE(A, D) #endif /* CHECK_POINTS */ extern struct pcpu __pcpu[]; /* * Local data and functions. */ static void install_ap_tramp(void); static int start_all_aps(void); static int start_ap(int apic_id); static u_int boot_address; /* * Calculate usable address in base memory for AP trampoline code. */ u_int mp_bootaddress(u_int basemem) { boot_address = trunc_page(basemem); /* round down to 4k boundary */ if ((basemem - boot_address) < bootMP_size) boot_address -= PAGE_SIZE; /* not enough, lower by 4k */ return boot_address; } /* * Initialize the IPI handlers and start up the AP's. */ void cpu_mp_start(void) { int i; /* Initialize the logical ID to APIC ID table. */ for (i = 0; i < MAXCPU; i++) { cpu_apic_ids[i] = -1; cpu_ipi_pending[i] = 0; } /* Install an inter-CPU IPI for TLB invalidation */ setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for cache invalidation. */ setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for all-CPU rendezvous */ setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install generic inter-CPU IPI handler */ setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for CPU stop/restart */ setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for CPU suspend/resume */ setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Set boot_cpu_id if needed. */ if (boot_cpu_id == -1) { boot_cpu_id = PCPU_GET(apic_id); cpu_info[boot_cpu_id].cpu_bsp = 1; } else KASSERT(boot_cpu_id == PCPU_GET(apic_id), ("BSP's APIC ID doesn't match boot_cpu_id")); /* Probe logical/physical core configuration. */ topo_probe(); assign_cpu_ids(); /* Start each Application Processor */ start_all_aps(); set_interrupt_apic_ids(); } /* * AP CPU's call this to initialize themselves. */ void init_secondary(void) { struct pcpu *pc; vm_offset_t addr; int gsel_tss; int x, myid; u_int cr0; /* bootAP is set in start_ap() to our ID. */ myid = bootAP; /* Get per-cpu data */ pc = &__pcpu[myid]; /* prime data page for it to use */ pcpu_init(pc, myid, sizeof(struct pcpu)); dpcpu_init(dpcpu, myid); pc->pc_apic_id = cpu_apic_ids[myid]; pc->pc_prvspace = pc; pc->pc_curthread = 0; fix_cpuid(); gdt_segs[GPRIV_SEL].ssd_base = (int) pc; gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; for (x = 0; x < NGDT; x++) { ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd); } r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) &gdt[myid * NGDT]; lgdt(&r_gdt); /* does magic intra-segment return */ lidt(&r_idt); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */ PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); ltr(gsel_tss); PCPU_SET(fsgs_gdt, &gdt[myid * NGDT + GUFS_SEL].sd); /* * Set to a known state: * Set by mpboot.s: CR0_PG, CR0_PE * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM */ cr0 = rcr0(); cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); load_cr0(cr0); CHECK_WRITE(0x38, 5); /* signal our startup to the BSP. */ mp_naps++; CHECK_WRITE(0x39, 6); /* Spin until the BSP releases the AP's. */ while (atomic_load_acq_int(&aps_ready) == 0) ia32_pause(); /* BSP may have changed PTD while we were waiting */ invltlb(); for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE) invlpg(addr); #if defined(I586_CPU) && !defined(NO_F00F_HACK) lidt(&r_idt); #endif init_secondary_tail(); } /* * start each AP in our list */ /* Lowest 1MB is already mapped: don't touch*/ #define TMPMAP_START 1 static int start_all_aps(void) { u_char mpbiosreason; u_int32_t mpbioswarmvec; int apic_id, cpu, i; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); /* install the AP 1st level boot code */ install_ap_tramp(); /* save the current value of the warm-start vector */ mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); /* set up temporary P==V mapping for AP boot */ /* XXX this is a hack, we should boot the AP on its own stack/PTD */ for (i = TMPMAP_START; i < NKPT; i++) PTD[i] = PTD[KPTDI + i]; invltlb(); /* start each AP */ for (cpu = 1; cpu < mp_ncpus; cpu++) { apic_id = cpu_apic_ids[cpu]; /* allocate and set up a boot stack data page */ bootstacks[cpu] = (char *)kmem_malloc(kernel_arena, kstack_pages * PAGE_SIZE, M_WAITOK | M_ZERO); dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE, M_WAITOK | M_ZERO); /* setup a vector to our boot code */ *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ bootSTK = (char *)bootstacks[cpu] + kstack_pages * PAGE_SIZE - 4; bootAP = cpu; /* attempt to start the Application Processor */ CHECK_INIT(99); /* setup checkpoints */ if (!start_ap(apic_id)) { printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id); CHECK_PRINT("trace"); /* show checkpoints */ /* better panic as the AP may be running loose */ printf("panic y/n? [y] "); if (cngetc() != 'n') panic("bye-bye"); } CHECK_PRINT("trace"); /* show checkpoints */ CPU_SET(cpu, &all_cpus); /* record AP in CPU map */ } /* restore the warmstart vector */ *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); /* Undo V==P hack from above */ for (i = TMPMAP_START; i < NKPT; i++) PTD[i] = 0; pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); /* number of APs actually started */ return mp_naps; } /* * load the 1st level AP boot code into base memory. */ /* targets for relocation */ extern void bigJump(void); extern void bootCodeSeg(void); extern void bootDataSeg(void); extern void MPentry(void); extern u_int MP_GDT; extern u_int mp_gdtbase; static void install_ap_tramp(void) { int x; int size = *(int *) ((u_long) & bootMP_size); vm_offset_t va = boot_address + KERNBASE; u_char *src = (u_char *) ((u_long) bootMP); u_char *dst = (u_char *) va; u_int boot_base = (u_int) bootMP; u_int8_t *dst8; u_int16_t *dst16; u_int32_t *dst32; KASSERT (size <= PAGE_SIZE, ("'size' do not fit into PAGE_SIZE, as expected.")); pmap_kenter(va, boot_address); pmap_invalidate_page (kernel_pmap, va); for (x = 0; x < size; ++x) *dst++ = *src++; /* * modify addresses in code we just moved to basemem. unfortunately we * need fairly detailed info about mpboot.s for this to work. changes * to mpboot.s might require changes here. */ /* boot code is located in KERNEL space */ dst = (u_char *) va; /* modify the lgdt arg */ dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base)); *dst32 = boot_address + ((u_int) & MP_GDT - boot_base); /* modify the ljmp target for MPentry() */ dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1); *dst32 = ((u_int) MPentry - KERNBASE); /* modify the target for boot code segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_address & 0xffff; *dst8 = ((u_int) boot_address >> 16) & 0xff; /* modify the target for boot data segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_address & 0xffff; *dst8 = ((u_int) boot_address >> 16) & 0xff; } /* * This function starts the AP (application processor) identified * by the APIC ID 'physicalCpu'. It does quite a "song and dance" * to accomplish this. This is necessary because of the nuances * of the different hardware we might encounter. It isn't pretty, * but it seems to work. */ static int start_ap(int apic_id) { int vector, ms; int cpus; /* calculate the vector */ vector = (boot_address >> 12) & 0xff; /* used as a watchpoint to signal AP startup */ cpus = mp_naps; ipi_startup(apic_id, vector); /* Wait up to 5 seconds for it to start. */ for (ms = 0; ms < 5000; ms++) { if (mp_naps > cpus) return 1; /* return SUCCESS */ DELAY(1000); } return 0; /* return FAILURE */ } Index: head/sys/i386/i386/pmap.c =================================================================== --- head/sys/i386/i386/pmap.c (revision 313153) +++ head/sys/i386/i386/pmap.c (revision 313154) @@ -1,5681 +1,5675 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005-2010 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_apic.h" #include "opt_cpu.h" #include "opt_pmap.h" #include "opt_smp.h" #include "opt_xbox.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include #include #include #endif #include #include #include #include #include #ifdef SMP #include #endif #ifdef XBOX #include #endif -#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) -#define CPU_ENABLE_SSE -#endif - #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if !defined(DIAGNOSTIC) #ifdef __GNUC_GNU_INLINE__ #define PMAP_INLINE __attribute__((__gnu_inline__)) inline #else #define PMAP_INLINE extern inline #endif #else #define PMAP_INLINE #endif #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ atomic_clear_int((u_int *)(pte), PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int pgeflag = 0; /* PG_G or-in */ int pseflag = 0; /* PG_PS or-in */ static int nkpt = NKPT; vm_offset_t kernel_vm_end = KERNBASE + NKPT * NBPDR; extern u_int32_t KERNend; extern u_int32_t KPTphys; #if defined(PAE) || defined(PAE_TABLES) pt_entry_t pg_nx; static uma_zone_t pdptzone; #endif static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pat_works = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pat_works, CTLFLAG_RD, &pat_works, 1, "Is page attribute table fully functional?"); static int pg_ps_enabled = 1; SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &pg_ps_enabled, 0, "Are large page mappings enabled?"); #define PAT_INDEX_SIZE 8 static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */ /* * pmap_mapdev support pre initialization (i.e. console) */ #define PMAP_PREINIT_MAPPING_COUNT 8 static struct pmap_preinit_mapping { vm_paddr_t pa; vm_offset_t va; vm_size_t sz; int mode; } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT]; static int pmap_initialized; static struct rwlock_padalign pvh_global_lock; /* * Data for the pv entry allocation mechanism */ static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static struct md_page *pv_table; static int shpgperproc = PMAP_SHPGPERPROC; struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ int pv_maxchunks; /* How many chunks we have KVA for */ vm_offset_t pv_vafree; /* freelist stored in the PTE */ /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP3; static pd_entry_t *KPTD; caddr_t ptvmmap = 0; caddr_t CADDR3; struct msgbuf *msgbufp = NULL; /* * Crashdump maps. */ static caddr_t crashdumpmap; static pt_entry_t *PMAP1 = NULL, *PMAP2; static pt_entry_t *PADDR1 = NULL, *PADDR2; #ifdef SMP static int PMAP1cpu; static int PMAP1changedcpu; SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, &PMAP1changedcpu, 0, "Number of times pmap_pte_quick changed CPU with same PMAP1"); #endif static int PMAP1changed; SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, &PMAP1changed, 0, "Number of times pmap_pte_quick changed PMAP1"); static int PMAP1unchanged; SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, &PMAP1unchanged, 0, "Number of times pmap_pte_quick didn't change PMAP1"); static struct mtx PMAP2mutex; static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static int pmap_pvh_wired_mappings(struct md_page *pvh, int count); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static void pmap_flush_page(vm_page_t m); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static boolean_t pmap_is_modified_pvh(struct md_page *pvh); static boolean_t pmap_is_referenced_pvh(struct md_page *pvh); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde); static void pmap_pde_attr(pd_entry_t *pde, int cache_bits); static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); static void pmap_pte_attr(pt_entry_t *pte, int cache_bits); static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, struct spglist *free); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va, struct spglist *free); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags); static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free); static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); static void pmap_pte_release(pt_entry_t *pte); static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *); #if defined(PAE) || defined(PAE_TABLES) static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait); #endif static void pmap_set_pg(void); static __inline void pagezero(void *page); CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); /* * If you get an error here, then you set KVA_PAGES wrong! See the * description of KVA_PAGES in sys/i386/include/pmap.h. It must be * multiple of 4 for a normal kernel, or a multiple of 8 for a PAE. */ CTASSERT(KERNBASE % (1 << 24) == 0); /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t firstaddr) { vm_offset_t va; pt_entry_t *pte, *unused; struct pcpu *pc; int i; /* * Add a physical memory segment (vm_phys_seg) corresponding to the * preallocated kernel page table pages so that vm_page structures * representing these pages will be created. The vm_page structures * are required for promotion of the corresponding kernel virtual * addresses to superpage mappings. */ vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt)); /* * Initialize the first available kernel virtual address. However, * using "firstaddr" may waste a few pages of the kernel virtual * address space, because locore may not have mapped every physical * page that it allocated. Preferably, locore would provide a first * unused virtual address in addition to "firstaddr". */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); #if defined(PAE) || defined(PAE_TABLES) kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); #endif CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); /* * Initialize the global pv list lock. */ rw_init(&pvh_global_lock, "pmap pv global"); LIST_INIT(&allpmaps); /* * Request a spin mutex so that changes to allpmaps cannot be * preempted by smp_rendezvous_cpus(). Otherwise, * pmap_update_pde_kernel() could access allpmaps while it is * being changed. */ mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * Initialize temporary map objects on the current CPU for use * during early boot. * CMAP1/CMAP2 are used for zeroing and copying pages. * CMAP3 is used for the boot-time memory test. */ pc = get_pcpu(); mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1) SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1) SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1) SYSMAP(caddr_t, CMAP3, CADDR3, 1); /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. */ SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize))) /* * KPTmap is used by pmap_kextract(). * * KPTmap is first initialized by locore. However, that initial * KPTmap can only support NKPT page table pages. Here, a larger * KPTmap is created that can support KVA_PAGES page table pages. */ SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES) for (i = 0; i < NKPT; i++) KPTD[i] = (KPTphys + (i << PAGE_SHIFT)) | pgeflag | PG_RW | PG_V; /* * Adjust the start of the KPTD and KPTmap so that the implementation * of pmap_kextract() and pmap_growkernel() can be made simpler. */ KPTD -= KPTDI; KPTmap -= i386_btop(KPTDI << PDRSHIFT); /* * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(), * respectively. */ SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1) SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1) mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); virtual_avail = va; /* * Leave in place an identity mapping (virt == phys) for the low 1 MB * physical memory region that is used by the ACPI wakeup code. This * mapping must not have PG_G set. */ #ifdef XBOX /* FIXME: This is gross, but needed for the XBOX. Since we are in such * an early stadium, we cannot yet neatly map video memory ... :-( * Better fixes are very welcome! */ if (!arch_i386_is_xbox) #endif for (i = 1; i < NKPT; i++) PTD[i] = 0; /* * Initialize the PAT MSR if present. * pmap_init_pat() clears and sets CR4_PGE, which, as a * side-effect, invalidates stale PG_G TLB entries that might * have been created in our pre-boot environment. We assume * that PAT support implies PGE and in reverse, PGE presence * comes with PAT. Both features were added for Pentium Pro. */ pmap_init_pat(); /* Turn on PG_G on kernel page(s) */ pmap_set_pg(); } static void pmap_init_reserved_pages(void) { struct pcpu *pc; vm_offset_t pages; int i; CPU_FOREACH(i) { pc = pcpu_find(i); /* * Skip if the mapping has already been initialized, * i.e. this is the BSP. */ if (pc->pc_cmap_addr1 != 0) continue; mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF); pages = kva_alloc(PAGE_SIZE * 3); if (pages == 0) panic("%s: unable to allocate KVA", __func__); pc->pc_cmap_pte1 = vtopte(pages); pc->pc_cmap_pte2 = vtopte(pages + PAGE_SIZE); pc->pc_cmap_addr1 = (caddr_t)pages; pc->pc_cmap_addr2 = (caddr_t)(pages + PAGE_SIZE); pc->pc_qmap_addr = pages + (PAGE_SIZE * 2); } } SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL); /* * Setup the PAT MSR. */ void pmap_init_pat(void) { int pat_table[PAT_INDEX_SIZE]; uint64_t pat_msr; u_long cr0, cr4; int i; /* Set default PAT index table. */ for (i = 0; i < PAT_INDEX_SIZE; i++) pat_table[i] = -1; pat_table[PAT_WRITE_BACK] = 0; pat_table[PAT_WRITE_THROUGH] = 1; pat_table[PAT_UNCACHEABLE] = 3; pat_table[PAT_WRITE_COMBINING] = 3; pat_table[PAT_WRITE_PROTECTED] = 3; pat_table[PAT_UNCACHED] = 3; /* * Bail if this CPU doesn't implement PAT. * We assume that PAT support implies PGE. */ if ((cpu_feature & CPUID_PAT) == 0) { for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; pat_works = 0; return; } /* * Due to some Intel errata, we can only safely use the lower 4 * PAT entries. * * Intel Pentium III Processor Specification Update * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B * or Mode C Paging) * * Intel Pentium IV Processor Specification Update * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) */ if (cpu_vendor_id == CPU_VENDOR_INTEL && !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe)) pat_works = 0; /* Initialize default PAT entries. */ pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | PAT_VALUE(1, PAT_WRITE_THROUGH) | PAT_VALUE(2, PAT_UNCACHED) | PAT_VALUE(3, PAT_UNCACHEABLE) | PAT_VALUE(4, PAT_WRITE_BACK) | PAT_VALUE(5, PAT_WRITE_THROUGH) | PAT_VALUE(6, PAT_UNCACHED) | PAT_VALUE(7, PAT_UNCACHEABLE); if (pat_works) { /* * Leave the indices 0-3 at the default of WB, WT, UC-, and UC. * Program 5 and 6 as WP and WC. * Leave 4 and 7 as WB and UC. */ pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6)); pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) | PAT_VALUE(6, PAT_WRITE_COMBINING); pat_table[PAT_UNCACHED] = 2; pat_table[PAT_WRITE_PROTECTED] = 5; pat_table[PAT_WRITE_COMBINING] = 6; } else { /* * Just replace PAT Index 2 with WC instead of UC-. */ pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); pat_table[PAT_WRITE_COMBINING] = 2; } /* Disable PGE. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* Disable caches (CD = 1, NW = 0). */ cr0 = rcr0(); load_cr0((cr0 & ~CR0_NW) | CR0_CD); /* Flushes caches and TLBs. */ wbinvd(); invltlb(); /* Update PAT and index table. */ wrmsr(MSR_PAT, pat_msr); for (i = 0; i < PAT_INDEX_SIZE; i++) pat_index[i] = pat_table[i]; /* Flush caches and TLBs again. */ wbinvd(); invltlb(); /* Restore caches and PGE. */ load_cr0(cr0); load_cr4(cr4); } /* * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. */ static void pmap_set_pg(void) { pt_entry_t *pte; vm_offset_t va, endva; if (pgeflag == 0) return; endva = KERNBASE + KERNend; if (pseflag) { va = KERNBASE + KERNLOAD; while (va < endva) { pdir_pde(PTD, va) |= pgeflag; invltlb(); /* Flush non-PG_G entries. */ va += NBPDR; } } else { va = (vm_offset_t)btext; while (va < endva) { pte = vtopte(va); if (*pte) *pte |= pgeflag; invltlb(); /* Flush non-PG_G entries. */ va += PAGE_SIZE; } } } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; } #if defined(PAE) || defined(PAE_TABLES) static void * pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, uint8_t *flags, int wait) { /* Inform UMA that this allocator uses kernel_map/object. */ *flags = UMA_SLAB_KERNEL; return ((void *)kmem_alloc_contig(kernel_arena, bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT)); } #endif /* * Abuse the pte nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PG_* bits * are ever set, PG_V in particular. * - Assumes we can write to ptes without pte_store() atomic ops, even * on PAE systems. This should be ok. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PG_V. * - Assumes a vm_offset_t will fit in a pte (true for i386). * Because PG_V is never set, there can be no mappings to invalidate. */ static vm_offset_t pmap_ptelist_alloc(vm_offset_t *head) { pt_entry_t *pte; vm_offset_t va; va = *head; if (va == 0) panic("pmap_ptelist_alloc: exhausted ptelist KVA"); pte = vtopte(va); *head = *pte; if (*head & PG_V) panic("pmap_ptelist_alloc: va with PG_V set!"); *pte = 0; return (va); } static void pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) { pt_entry_t *pte; if (va & PG_V) panic("pmap_ptelist_free: freeing va with PG_V set!"); pte = vtopte(va); *pte = *head; /* virtual! PG_V is 0 though */ *head = va; } static void pmap_ptelist_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_ptelist_free(head, va); } } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { struct pmap_preinit_mapping *ppim; vm_page_t mpte; vm_size_t s; int i, pv_npg; /* * Initialize the vm page array entries for the kernel pmap's * page table pages. */ for (i = 0; i < NKPT; i++) { mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT)); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_init: page table page is out of range")); mpte->pindex = i + KPTDI; mpte->phys_addr = KPTphys + (i << PAGE_SHIFT); } /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); /* * If the kernel is running on a virtual machine, then it must assume * that MCA is enabled by the hypervisor. Moreover, the kernel must * be prepared for the hypervisor changing the vendor and family that * are reported by CPUID. Consequently, the workaround for AMD Family * 10h Erratum 383 is enabled if the processor's feature set does not * include at least one feature that is only supported by older Intel * or newer AMD processors. */ if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 && (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI | CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP | AMDID2_FMA4)) == 0) workaround_erratum383 = 1; /* * Are large page mappings supported and enabled? */ TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled); if (pseflag == 0) pg_ps_enabled = 0; else if (pg_ps_enabled) { KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0, ("pmap_init: can't assign to pagesizes[1]")); pagesizes[1] = NBPDR; } /* * Calculate the size of the pv head table for superpages. * Handle the possibility that "vm_phys_segs[...].end" is zero. */ pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end - PAGE_SIZE) / NBPDR + 1; /* * Allocate memory for the pv head table for superpages. */ s = (vm_size_t)(pv_npg * sizeof(struct md_page)); s = round_page(s); pv_table = (struct md_page *)kmem_malloc(kernel_arena, s, M_WAITOK | M_ZERO); for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("pmap_init: not enough kvm for pv chunks"); pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); #if defined(PAE) || defined(PAE_TABLES) pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); #endif pmap_initialized = 1; if (!bootverbose) return; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) continue; printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i, (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode); } } SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, "Max number of PV entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, "Page share factor per proc"); static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0, "2/4MB page mapping counters"); static u_long pmap_pde_demotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD, &pmap_pde_demotions, 0, "2/4MB page demotions"); static u_long pmap_pde_mappings; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD, &pmap_pde_mappings, 0, "2/4MB page mappings"); static u_long pmap_pde_p_failures; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD, &pmap_pde_p_failures, 0, "2/4MB page promotion failures"); static u_long pmap_pde_promotions; SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD, &pmap_pde_promotions, 0, "2/4MB page promotions"); /*************************************************** * Low level helper routines..... ***************************************************/ /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ int pmap_cache_bits(int mode, boolean_t is_pde) { int cache_bits, pat_flag, pat_idx; if (mode < 0 || mode >= PAT_INDEX_SIZE || pat_index[mode] < 0) panic("Unknown caching mode %d\n", mode); /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; /* Map the caching mode to a PAT index. */ pat_idx = pat_index[mode]; /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_idx & 0x4) cache_bits |= pat_flag; if (pat_idx & 0x2) cache_bits |= PG_NC_PCD; if (pat_idx & 0x1) cache_bits |= PG_NC_PWT; return (cache_bits); } /* * The caller is responsible for maintaining TLB consistency. */ static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde) { pd_entry_t *pde; pmap_t pmap; boolean_t PTD_updated; PTD_updated = FALSE; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)) PTD_updated = TRUE; pde = pmap_pde(pmap, va); pde_store(pde, newpde); } mtx_unlock_spin(&allpmaps_lock); KASSERT(PTD_updated, ("pmap_kenter_pde: current page table is not in allpmaps")); } /* * After changing the page size for the specified virtual address in the page * table, flush the corresponding entries from the processor's TLB. Only the * calling processor's TLB is affected. * * The calling thread must be pinned to a processor. */ static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde) { u_long cr4; if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ invlpg(va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB * because there are too many to flush individually. */ invltlb(); else { /* * Promotion: flush every 4KB page mapping from the TLB, * including any global (PG_G) mappings. */ cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); /* * Although preemption at this point could be detrimental to * performance, it would not lead to an error. PG_G is simply * ignored if CR4.PGE is clear. Moreover, in case this block * is re-entered, the load_cr4() either above or below will * modify CR4.PGE flushing the TLB. */ load_cr4(cr4 | CR4_PGE); } } void invltlb_glob(void) { uint64_t cr4; if (pgeflag == 0) { invltlb(); } else { cr4 = rcr4(); load_cr4(cr4 & ~CR4_PGE); load_cr4(cr4 | CR4_PGE); } } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. * * N.B.: Before calling any of the following TLB invalidation functions, * the calling processor must ensure that all stores updating a non- * kernel page table are globally performed. Otherwise, another * processor could cache an old, pre-update entry without being * invalidated. This can happen one of two ways: (1) The pmap becomes * active on another processor after its pm_active field is checked by * one of the following functions but before a store updating the page * table is globally performed. (2) The pmap becomes active on another * processor before its pm_active field is checked but due to * speculative loads one of the following functions stills reads the * pmap as inactive on the other processor. * * The kernel page table is exempt because its pm_active field is * immutable. The kernel page table is always active on every * processor. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { invlpg(va); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) invlpg(va); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg(*mask, va); sched_unpin(); } /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { cpuset_t *mask, other_cpus; vm_offset_t addr; u_int cpuid; if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; } sched_pin(); if (pmap == kernel_pmap || !CPU_CMP(&pmap->pm_active, &all_cpus)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invlpg_range(*mask, sva, eva); sched_unpin(); } void pmap_invalidate_all(pmap_t pmap) { cpuset_t *mask, other_cpus; u_int cpuid; sched_pin(); if (pmap == kernel_pmap) { invltlb_glob(); mask = &all_cpus; } else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) { invltlb(); mask = &all_cpus; } else { cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (CPU_ISSET(cpuid, &pmap->pm_active)) invltlb(); CPU_AND(&other_cpus, &pmap->pm_active); mask = &other_cpus; } smp_masked_invltlb(*mask, pmap); sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } struct pde_action { cpuset_t invalidate; /* processors that invalidate their TLB */ vm_offset_t va; pd_entry_t *pde; pd_entry_t newpde; u_int store; /* processor that updates the PDE */ }; static void pmap_update_pde_kernel(void *arg) { struct pde_action *act = arg; pd_entry_t *pde; pmap_t pmap; if (act->store == PCPU_GET(cpuid)) { /* * Elsewhere, this operation requires allpmaps_lock for * synchronization. Here, it does not because it is being * performed in the context of an all_cpus rendezvous. */ LIST_FOREACH(pmap, &allpmaps, pm_list) { pde = pmap_pde(pmap, act->va); pde_store(pde, act->newpde); } } } static void pmap_update_pde_user(void *arg) { struct pde_action *act = arg; if (act->store == PCPU_GET(cpuid)) pde_store(act->pde, act->newpde); } static void pmap_update_pde_teardown(void *arg) { struct pde_action *act = arg; if (CPU_ISSET(PCPU_GET(cpuid), &act->invalidate)) pmap_update_pde_invalidate(act->va, act->newpde); } /* * Change the page size for the specified virtual address in a way that * prevents any possibility of the TLB ever having two entries that map the * same virtual address using different page sizes. This is the recommended * workaround for Erratum 383 on AMD Family 10h processors. It prevents a * machine check exception for a TLB state that is improperly diagnosed as a * hardware error. */ static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { struct pde_action act; cpuset_t active, other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); if (pmap == kernel_pmap) active = all_cpus; else active = pmap->pm_active; if (CPU_OVERLAP(&active, &other_cpus)) { act.store = cpuid; act.invalidate = active; act.va = va; act.pde = pde; act.newpde = newpde; CPU_SET(cpuid, &active); smp_rendezvous_cpus(active, smp_no_rendevous_barrier, pmap == kernel_pmap ? pmap_update_pde_kernel : pmap_update_pde_user, pmap_update_pde_teardown, &act); } else { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (CPU_ISSET(cpuid, &active)) pmap_update_pde_invalidate(va, newpde); } sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, 486+ invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap) invltlb_glob(); else if (!CPU_EMPTY(&pmap->pm_active)) invltlb(); } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde) { if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active)) pmap_update_pde_invalidate(va, newpde); } #endif /* !SMP */ #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, boolean_t force) { if (force) { sva &= ~(vm_offset_t)cpu_clflush_line_size; } else { KASSERT((sva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: sva not page-aligned")); KASSERT((eva & PAGE_MASK) == 0, ("pmap_invalidate_cache_range: eva not page-aligned")); } if ((cpu_feature & CPUID_SS) != 0 && !force) ; /* If "Self Snoop" is supported and allowed, do nothing. */ else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { #ifdef DEV_APIC /* * XXX: Some CPUs fault, hang, or trash the local APIC * registers if we use CLFLUSH on the local APIC * range. The local APIC is always uncached, so we * don't need to flush for that range anyway. */ if (pmap_kextract(sva) == lapic_paddr) return; #endif /* * Otherwise, do per-cache line flush. Use the sfence * instruction to insure that previous stores are * included in the write-back. The processor * propagates flush to other processors in the cache * coherence domain. */ sfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflushopt(sva); sfence(); } else if ((cpu_feature & CPUID_CLFSH) != 0 && eva - sva < PMAP_CLFLUSH_THRESHOLD) { #ifdef DEV_APIC if (pmap_kextract(sva) == lapic_paddr) return; #endif /* * Writes are ordered by CLFLUSH on Intel CPUs. */ if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) clflush(sva); if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); } else { /* * No targeted cache flush methods are supported by CPU, * or the supplied range is bigger than 2MB. * Globally invalidate cache. */ pmap_invalidate_cache(); } } void pmap_invalidate_cache_pages(vm_page_t *pages, int count) { int i; if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || (cpu_feature & CPUID_CLFSH) == 0) { pmap_invalidate_cache(); } else { for (i = 0; i < count; i++) pmap_flush_page(pages[i]); } } /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || pmap == vmspace_pmap(curthread->td_proc->p_vmspace)); } /* * If the given pmap is not the current or kernel pmap, the returned pte must * be released by passing it to pmap_pte_release(). */ pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); mtx_lock(&PMAP2mutex); newpf = *pde & PG_FRAME; if ((*PMAP2 & PG_FRAME) != newpf) { *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); } return (NULL); } /* * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte * being NULL. */ static __inline void pmap_pte_release(pt_entry_t *pte) { if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) mtx_unlock(&PMAP2mutex); } /* * NB: The sequence of updating a page table followed by accesses to the * corresponding pages is subject to the situation described in the "AMD64 * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23, * "7.3.1 Special Coherency Considerations". Therefore, issuing the INVLPG * right after modifying the PTE bits is crucial. */ static __inline void invlcaddr(void *caddr) { invlpg((u_int)caddr); } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. * * If the given pmap is not the current pmap, pvh_global_lock * must be held and curthread pinned to a CPU. */ static pt_entry_t * pmap_pte_quick(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); newpf = *pde & PG_FRAME; if ((*PMAP1 & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde; rtval = 0; PMAP_LOCK(pmap); pde = pmap->pm_pdir[va >> PDRSHIFT]; if (pde != 0) { if ((pde & PG_PS) != 0) rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); else { pte = pmap_pte(pmap, va); rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); pmap_pte_release(pte); } } PMAP_UNLOCK(pmap); return (rtval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde; pt_entry_t pte, *ptep; vm_page_t m; vm_paddr_t pa; pa = 0; m = NULL; PMAP_LOCK(pmap); retry: pde = *pmap_pde(pmap, va); if (pde != 0) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { if (vm_page_pa_tryrelock(pmap, (pde & PG_PS_FRAME) | (va & PDRMASK), &pa)) goto retry; m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); vm_page_hold(m); } } else { ptep = pmap_pte(pmap, va); pte = *ptep; pmap_pte_release(ptep); if (pte != 0 && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME, &pa)) goto retry; m = PHYS_TO_VM_PAGE(pte & PG_FRAME); vm_page_hold(m); } } } PA_UNLOCK_COND(pa); PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag); } static __inline void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. * * This function may be used before pmap_bootstrap() is called. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; vm_paddr_t superpage_offset; pd_entry_t newpde; va = *virt; /* * Does the physical address range's size and alignment permit at * least one superpage mapping to be created? */ superpage_offset = start & PDRMASK; if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) { /* * Increase the starting virtual address so that its alignment * does not preclude the use of superpage mappings. */ if ((va & PDRMASK) < superpage_offset) va = (va & ~PDRMASK) + superpage_offset; else if ((va & PDRMASK) > superpage_offset) va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset; } sva = va; while (start < end) { if ((start & PDRMASK) == 0 && end - start >= NBPDR && pseflag) { KASSERT((va & PDRMASK) == 0, ("pmap_map: misaligned va %#x", va)); newpde = start | PG_PS | pgeflag | PG_RW | PG_V; pmap_kenter_pde(va, newpde); va += NBPDR; start += NBPDR; } else { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } } pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, pa, *pte; vm_page_t m; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { m = *ma++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) { oldpte |= *pte; pte_store(pte, pa | pgeflag | PG_RW | PG_V); } pte++; } if (__predict_false((oldpte & PG_V) != 0)) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ static __inline void pmap_free_zero_pages(struct spglist *free) { vm_page_t m; while ((m = SLIST_FIRST(free)) != NULL) { SLIST_REMOVE_HEAD(free, plinks.s.ss); /* Preserve the page's PG_ZERO setting. */ vm_page_free_toq(m); } } /* * Schedule the specified unused page table page to be freed. Specifically, * add the page to the specified list of pages that will be released to the * physical memory manager after the TLB has been updated. */ static __inline void pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, boolean_t set_PG_ZERO) { if (set_PG_ZERO) m->flags |= PG_ZERO; else m->flags &= ~PG_ZERO; SLIST_INSERT_HEAD(free, m, plinks.s.ss); } /* * Inserts the specified page table page into the specified pmap's collection * of idle page table pages. Each of a pmap's page table pages is responsible * for mapping a distinct range of virtual addresses. The pmap's collection is * ordered by this virtual address range. */ static __inline int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_insert(&pmap->pm_root, mpte)); } /* * Removes the page table page mapping the specified virtual address from the * specified pmap's collection of idle page table pages, and returns it. * Otherwise, returns NULL if there is no page table page corresponding to the * specified virtual address. */ static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va) { PMAP_LOCK_ASSERT(pmap, MA_OWNED); return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT)); } /* * Decrements a page table page's wire count, which is used to record the * number of valid page table entries within the page. If the wire count * drops to zero, then the page table page is unmapped. Returns TRUE if the * page table page was unmapped and FALSE otherwise. */ static inline boolean_t pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { --m->wire_count; if (m->wire_count == 0) { _pmap_unwire_ptp(pmap, m, free); return (TRUE); } else return (FALSE); } static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; /* * This is a release store so that the ordinary store unmapping * the page table page is globally performed before TLB shoot- * down is begun. */ atomic_subtract_rel_int(&vm_cnt.v_wire_count, 1); /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); pmap_invalidate_page(pmap, pteva); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ pmap_add_delayed_free_list(m, free, TRUE); } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free) { pd_entry_t ptepde; vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return (0); ptepde = *pmap_pde(pmap, va); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return (pmap_unwire_ptp(pmap, mpte, free)); } /* * Initialize the pmap for the swapper process. */ void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); /* * Since the page table directory is shared with the kernel pmap, * which is already included in the list "allpmaps", this pmap does * not need to be inserted into that list. */ pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); #endif pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ int pmap_pinit(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; vm_paddr_t pa; int i; /* * No need to allocate page table space yet but we do need a valid * page directory table. */ if (pmap->pm_pdir == NULL) { pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD); if (pmap->pm_pdir == NULL) return (0); #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); KASSERT(((vm_offset_t)pmap->pm_pdpt & ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, ("pmap_pinit: pdpt misaligned")); KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif pmap->pm_root.rt_root = 0; } KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_pinit: pmap has reserved page table page(s)")); /* * allocate the page directory page(s) */ for (i = 0; i < NPGPTD;) { m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) VM_WAIT; else { ptdpg[i++] = m; } } pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); for (i = 0; i < NPGPTD; i++) if ((ptdpg[i]->flags & PG_ZERO) == 0) pagezero(pmap->pm_pdir + (i * NPDEPG)); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); /* Copy the kernel page table directory entries. */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); mtx_unlock_spin(&allpmaps_lock); /* install self-referential address mapping entry(s) */ for (i = 0; i < NPGPTD; i++) { pa = VM_PAGE_TO_PHYS(ptdpg[i]); pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; #if defined(PAE) || defined(PAE_TABLES) pmap->pm_pdpt[i] = pa | PG_V; #endif } CPU_ZERO(&pmap->pm_active); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); return (1); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags) { vm_paddr_t ptepa; vm_page_t m; /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if ((flags & PMAP_ENTER_NOSLEEP) == 0) { PMAP_UNLOCK(pmap); rw_wunlock(&pvh_global_lock); VM_WAIT; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); return (m); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags) { u_int ptepindex; pd_entry_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; retry: /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { (void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va); ptepa = pmap->pm_pdir[ptepindex]; } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), ("pmap_release: pmap has reserved page table page(s)")); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); for (i = 0; i < NPGPTD; i++) ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & PG_FRAME); bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * sizeof(*pmap->pm_pdir)); pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); for (i = 0; i < NPGPTD; i++) { m = ptdpg[i]; #if defined(PAE) || defined(PAE_TABLES) KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), ("pmap_release: got wrong ptd page")); #endif m->wire_count--; atomic_subtract_int(&vm_cnt.v_wire_count, 1); vm_page_free_zero(m); } } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return (sysctl_handle_long(oidp, &ksize, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return (sysctl_handle_long(oidp, &kfree, 0, req)); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; mtx_assert(&kernel_map->system_mtx, MA_OWNED); addr = roundup2(addr, NBPDR); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT, VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (nkpg == NULL) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; if ((nkpg->flags & PG_ZERO) == 0) pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(KPTD, kernel_vm_end) = pgeflag | newpdir; pmap_kenter_pde(kernel_vm_end, newpdir); kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. ***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); CTASSERT(_NPCPV == 336); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static const uint32_t pc_freemask[_NPCM] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. */ static vm_page_t pmap_pv_reclaim(pmap_t locked_pmap) { struct pch newtail; struct pv_chunk *pc; struct md_page *pvh; pd_entry_t *pde; pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t pv; vm_offset_t va; vm_page_t m, m_pc; struct spglist free; uint32_t inuse; int bit, field, freed; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); pmap = NULL; m_pc = NULL; SLIST_INIT(&free); TAILQ_INIT(&newtail); while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 || SLIST_EMPTY(&free))) { TAILQ_REMOVE(&pv_chunks, pc, pc_lru); if (pmap != pc->pc_pmap) { if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } pmap = pc->pc_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) { pmap = NULL; TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } } /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ freed = 0; for (field = 0; field < _NPCM; field++) { for (inuse = ~pc->pc_map[field] & pc_freemask[field]; inuse != 0; inuse &= ~(1UL << bit)) { bit = bsfl(inuse); pv = &pc->pc_pventry[field * 32 + bit]; va = pv->pv_va; pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) continue; pte = pmap_pte(pmap, va); tpte = *pte; if ((tpte & PG_W) == 0) tpte = pte_load_clear(pte); pmap_pte_release(pte); if ((tpte & PG_W) != 0) continue; KASSERT(tpte != 0, ("pmap_pv_reclaim: pmap %p va %x zero pte", pmap, va)); if ((tpte & PG_G) != 0) pmap_invalidate_page(pmap, va); m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((tpte & PG_A) != 0) vm_page_aflag_set(m, PGA_REFERENCED); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) { vm_page_aflag_clear(m, PGA_WRITEABLE); } } pc->pc_map[field] |= 1UL << bit; pmap_unuse_pt(pmap, va, &free); freed++; } } if (freed == 0) { TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); continue; } /* Every freed mapping is for a 4 KB page. */ pmap->pm_stats.resident_count -= freed; PV_STAT(pv_entry_frees += freed); PV_STAT(pv_entry_spare += freed); pv_entry_count -= freed; TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != pc_freemask[field]) { TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&newtail, pc, pc_lru); /* * One freed pv entry in locked_pmap is * sufficient. */ if (pmap == locked_pmap) goto out; break; } if (field == _NPCM) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* Entire chunk is free; return it. */ m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); break; } } out: TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru); if (pmap != NULL) { pmap_invalidate_all(pmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } if (m_pc == NULL && pv_vafree != 0 && SLIST_EMPTY(&free)) { m_pc = SLIST_FIRST(&free); SLIST_REMOVE_HEAD(&free, plinks.s.ss); /* Recycle a freed page table page. */ m_pc->wire_count = 1; atomic_add_int(&vm_cnt.v_wire_count, 1); } pmap_free_zero_pages(&free); return (m_pc); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { struct pv_chunk *pc; int idx, field, bit; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) { /* * 98% of the time, pc is already at the head of the * list. If it isn't already, move it to the head. */ if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) != pc)) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } return; } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } static void free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; TAILQ_REMOVE(&pv_chunks, pc, pc_lru); PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire(m, PQ_NONE); vm_page_free(m); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; int bit, field; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing either the vm.pmap.shpgperproc or the " "vm.pmap.pv_entry_max tunable.\n"); retry: pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfl(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } /* * Access to the ptelist "pv_vafree" is synchronized by the pvh * global lock. If "pv_vafree" is currently non-empty, it will * remain non-empty until pmap_ptelist_alloc() completes. */ if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } m = pmap_pv_reclaim(pmap); if (m == NULL) goto retry; } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static __inline pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); break; } } return (pv); } static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 4mpage aligned")); /* * Transfer the 4mpage's pv entry for this mapping to the first * page's pv list. */ pvh = pa_to_pvh(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); /* Instantiate the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_pv_demote_pde: page %p is not managed", m)); va += PAGE_SIZE; pmap_insert_entry(pmap, va, m); } while (va < va_last); } static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 4mpage aligned")); /* * Transfer the first page's pv entry for this mapping to the * 4mpage's pv list. Aside from avoiding the cost of a call * to get_pv_entry(), a transfer avoids the possibility that * get_pv_entry() calls pmap_collect() and that pmap_collect() * removes one of the mappings that is being promoted. */ m = PHYS_TO_VM_PAGE(pa); va = trunc_4mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); /* Free the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; pmap_pvh_free(&m->md, pmap, va); } while (va < va_last); } static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va) { pv_entry_t pv; pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pvh_free: pv not found")); free_pv_entry(pmap, pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { struct md_page *pvh; rw_assert(&pvh_global_lock, RA_WLOCKED); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Create the pv entries for each of the pages within a superpage. */ static boolean_t pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); return (TRUE); } else return (FALSE); } /* * Fills a page table page with mappings to consecutive physical pages. */ static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte) { pt_entry_t *pte; for (pte = firstpte; pte < firstpte + NPTEPG; pte++) { *pte = newpte; newpte += PAGE_SIZE; } } /* * Tries to demote a 2- or 4MB page mapping. If demotion fails, the * 2- or 4MB page mapping is invalidated. */ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; vm_paddr_t mptepa; vm_page_t mpte; struct spglist free; vm_offset_t sva; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpde = *pde; KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V), ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V")); if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) == NULL) { KASSERT((oldpde & PG_W) == 0, ("pmap_demote_pde: page table page for a wired mapping" " is missing")); /* * Invalidate the 2- or 4MB page mapping and return * "failure" if the mapping was never accessed or the * allocation of the new page table page fails. */ if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL, va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED)) == NULL) { SLIST_INIT(&free); sva = trunc_4mpage(va); pmap_remove_pde(pmap, pde, sva, &free); pmap_invalidate_range(pmap, sva, sva + NBPDR - 1); pmap_free_zero_pages(&free); CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x" " in pmap %p", va, pmap); return (FALSE); } if (va < VM_MAXUSER_ADDRESS) pmap->pm_stats.resident_count++; } mptepa = VM_PAGE_TO_PHYS(mpte); /* * If the page mapping is in the kernel's address space, then the * KPTmap can provide access to the page table page. Otherwise, * temporarily map the page table page (mpte) into the kernel's * address space at either PADDR1 or PADDR2. */ if (va >= KERNBASE) firstpte = &KPTmap[i386_btop(trunc_4mpage(va))]; else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) { if ((*PMAP1 & PG_FRAME) != mptepa) { *PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; firstpte = PADDR1; } else { mtx_lock(&PMAP2mutex); if ((*PMAP2 & PG_FRAME) != mptepa) { *PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } firstpte = PADDR2; } newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V; KASSERT((oldpde & PG_A) != 0, ("pmap_demote_pde: oldpde is missing PG_A")); KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW, ("pmap_demote_pde: oldpde is missing PG_M")); newpte = oldpde & ~PG_PS; if ((newpte & PG_PDE_PAT) != 0) newpte ^= PG_PDE_PAT | PG_PTE_PAT; /* * If the page table page is new, initialize it. */ if (mpte->wire_count == 1) { mpte->wire_count = NPTEPG; pmap_fill_ptp(firstpte, newpte); } KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME), ("pmap_demote_pde: firstpte and newpte map different physical" " addresses")); /* * If the mapping has changed attributes, update the page table * entries. */ if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE)) pmap_fill_ptp(firstpte, newpte); /* * Demote the mapping. This pmap is locked. The old PDE has * PG_A set. If the old PDE has PG_RW set, it also has PG_M * set. Thus, there is no danger of a race with another * processor changing the setting of PG_A and/or PG_M between * the read above and the store below. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, newpde); else pde_store(pde, newpde); if (firstpte == PADDR2) mtx_unlock(&PMAP2mutex); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); /* * Demote the pv entry. This depends on the earlier demotion * of the mapping. Specifically, the (re)creation of a per- * page pv entry might trigger the execution of pmap_collect(), * which might reclaim a newly (re)created per-page pv entry * and destroy the associated mapping. In order to destroy * the mapping, the PDE must have already changed from mapping * the 2mpage to referencing the page table page. */ if ((oldpde & PG_MANAGED) != 0) pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); pmap_pde_demotions++; CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x" " in pmap %p", va, pmap); return (TRUE); } /* * Removes a 2- or 4MB page mapping from the kernel pmap. */ static void pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; vm_paddr_t mptepa; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mpte = pmap_remove_pt_page(pmap, va); if (mpte == NULL) panic("pmap_remove_kernel_pde: Missing pt page."); mptepa = VM_PAGE_TO_PHYS(mpte); newpde = mptepa | PG_M | PG_A | PG_RW | PG_V; /* * Initialize the page table page. */ pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]); /* * Remove the mapping. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, newpde); else pmap_kenter_pde(va, newpde); /* * Invalidate the recursive mapping of the page table page. */ pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va)); } /* * pmap_remove_pde: do the things to unmap a superpage in a process */ static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, struct spglist *free) { struct md_page *pvh; pd_entry_t oldpde; vm_offset_t eva, va; vm_page_t m, mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_remove_pde: sva is not 4mpage aligned")); oldpde = pte_load_clear(pdq); if (oldpde & PG_W) pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; /* * Machines that don't support invlpg, also don't support * PG_G. * * When workaround_erratum383 is false, a promotion to a 2M/4M * page mapping does not invalidate the 512/1024 4K page mappings * from the TLB. Consequently, at this point, the TLB may * hold both 4K and 2M/4M page mappings. Therefore, the entire * range of addresses must be invalidated here. In contrast, * when workaround_erratum383 is true, a promotion does * invalidate the 512/1024 4K page mappings, and so a single INVLPG * suffices to invalidate the 2M/4M page mapping. */ if ((oldpde & PG_G) != 0) { if (workaround_erratum383) pmap_invalidate_page(kernel_pmap, sva); else pmap_invalidate_range(kernel_pmap, sva, sva + NBPDR - 1); } pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; if (oldpde & PG_MANAGED) { pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); if (TAILQ_EMPTY(&m->md.pv_list) && TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } } if (pmap == kernel_pmap) { pmap_remove_kernel_pde(pmap, pdq, sva); } else { mpte = pmap_remove_pt_page(pmap, sva); if (mpte != NULL) { pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pde: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, free, FALSE); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } } } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, struct spglist *free) { pt_entry_t oldpte; vm_page_t m; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); KASSERT(oldpte != 0, ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va)); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if ((oldpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free) { pt_entry_t *pte; rw_assert(&pvh_global_lock, RA_WLOCKED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) return; pmap_remove_pte(pmap, pte, va, free); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; struct spglist free; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE == eva) && ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva, &free); goto out; } for (; sva < eva; sva = pdnxt) { u_int pdirindex; /* * Calculate index for next page table. */ pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; if (pmap->pm_stats.resident_count == 0) break; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we removing the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_remove_pde(). */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; pmap_remove_pde(pmap, &pmap->pm_pdir[pdirindex], sva, &free); continue; } else if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* The large page mapping was destroyed. */ continue; } } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if (*pte == 0) continue; /* * The TLB entry for a PG_G mapping is invalidated * by pmap_remove_pte(). */ if ((*pte & PG_G) == 0) anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva, &free)) break; } } out: sched_unpin(); if (anyvalid) pmap_invalidate_all(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { struct md_page *pvh; pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; pd_entry_t *pde; vm_offset_t va; struct spglist free; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); tpte = pte_load_clear(pte); KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte", pmap, pv->pv_va)); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); pmap_unuse_pt(pmap, pv->pv_va, &free); pmap_invalidate_page(pmap, pv->pv_va); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); pmap_free_zero_pages(&free); } /* * pmap_protect_pde: do the things to protect a 4mpage in a process */ static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot) { pd_entry_t newpde, oldpde; vm_offset_t eva, va; vm_page_t m; boolean_t anychanged; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((sva & PDRMASK) == 0, ("pmap_protect_pde: sva is not 4mpage aligned")); anychanged = FALSE; retry: oldpde = newpde = *pde; if (oldpde & PG_MANAGED) { eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); } if ((prot & VM_PROT_WRITE) == 0) newpde &= ~(PG_RW | PG_M); #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; #endif if (newpde != oldpde) { if (!pde_cmpset(pde, oldpde, newpde)) goto retry; if (oldpde & PG_G) { /* See pmap_remove_pde() for explanation. */ if (workaround_erratum383) pmap_invalidate_page(kernel_pmap, sva); else pmap_invalidate_range(kernel_pmap, sva, sva + NBPDR - 1); } else anychanged = TRUE; } return (anychanged); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; boolean_t anychanged, pv_lists_locked; KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot)); if (prot == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } #if defined(PAE) || defined(PAE_TABLES) if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; #else if (prot & VM_PROT_WRITE) return; #endif if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pt_entry_t obits, pbits; u_int pdirindex; pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { /* * Are we protecting the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * The TLB entry for a PG_G mapping is * invalidated by pmap_protect_pde(). */ if (pmap_protect_pde(pmap, &pmap->pm_pdir[pdirindex], sva, prot)) anychanged = TRUE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all( pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, &pmap->pm_pdir[pdirindex], sva)) { /* * The large page mapping was * destroyed. */ continue; } } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { vm_page_t m; retry: /* * Regardless of whether a pte is 32 or 64 bits in * size, PG_RW, PG_A, and PG_M are among the least * significant 32 bits. */ obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if ((prot & VM_PROT_WRITE) == 0) { if ((pbits & (PG_MANAGED | PG_M | PG_RW)) == (PG_MANAGED | PG_M | PG_RW)) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } pbits &= ~(PG_RW | PG_M); } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; #endif if (pbits != obits) { #if defined(PAE) || defined(PAE_TABLES) if (!atomic_cmpset_64(pte, obits, pbits)) goto retry; #else if (!atomic_cmpset_int((u_int *)pte, obits, pbits)) goto retry; #endif if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = TRUE; } } } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are * within a single page table page (PTP) to a single 2- or 4MB page mapping. * For promotion to occur, two conditions must be met: (1) the 4KB page * mappings must map aligned, contiguous physical memory and (2) the 4KB page * mappings must have identical characteristics. * * Managed (PG_MANAGED) mappings within the kernel address space are not * promoted. The reason is that kernel PDEs are replicated in each pmap but * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel * pmap. */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; vm_offset_t oldpteva; vm_page_t mpte; PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * Examine the first PTE in the specified PTP. Abort if this PTE is * either invalid, unused, or does not map the first 4KB physical page * within a 2- or 4MB page. */ firstpte = pmap_pte_quick(pmap, trunc_4mpage(va)); setpde: newpde = *firstpte; if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((newpde & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared without * a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde & ~PG_RW)) goto setpde; newpde &= ~PG_RW; } /* * Examine each of the other PTEs in the specified PTP. Abort if this * PTE maps an unexpected 4KB physical page or does not have identical * characteristics to the first PTE. */ pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE; for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) { setpte: oldpte = *pte; if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } if ((oldpte & (PG_M | PG_RW)) == PG_RW) { /* * When PG_M is already clear, PG_RW can be cleared * without a TLB invalidation. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~PG_RW)) goto setpte; oldpte &= ~PG_RW; oldpteva = (oldpte & PG_FRAME & PDRMASK) | (va & ~PDRMASK); CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x" " in pmap %p", oldpteva, pmap); } if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x" " in pmap %p", va, pmap); return; } pa -= PAGE_SIZE; } /* * Save the page table page in its current state until the PDE * mapping the superpage is demoted by pmap_demote_pde() or * destroyed by pmap_remove_pde(). */ mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); KASSERT(mpte >= vm_page_array && mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == va >> PDRSHIFT, ("pmap_promote_pde: page table page's pindex is wrong")); if (pmap_insert_pt_page(pmap, mpte)) { pmap_pde_p_failures++; CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x in pmap %p", va, pmap); return; } /* * Promote the pv entries. */ if ((newpde & PG_MANAGED) != 0) pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); /* * Propagate the PAT index to its proper position. */ if ((newpde & PG_PTE_PAT) != 0) newpde ^= PG_PDE_PAT | PG_PTE_PAT; /* * Map the superpage. */ if (workaround_erratum383) pmap_update_pde(pmap, va, pde, PG_PS | newpde); else if (pmap == kernel_pmap) pmap_kenter_pde(va, PG_PS | newpde); else pde_store(pde, PG_PS | newpde); pmap_pde_promotions++; CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x" " in pmap %p", va, pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { pd_entry_t *pde; pt_entry_t *pte; pt_entry_t newpte, origpte; pv_entry_t pv; vm_paddr_t opa, pa; vm_page_t mpte, om; boolean_t invlva, wired; va = trunc_page(va); mpte = NULL; wired = (flags & PMAP_ENTER_WIRED) != 0; KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig")); KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS, ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va)); if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m)) VM_OBJECT_ASSERT_LOCKED(m->object); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); pde = pmap_pde(pmap, va); if (va < VM_MAXUSER_ADDRESS) { /* * va is for UVA. * In the case that a page table page is not resident, * we are creating it here. pmap_allocpte() handles * demotion. */ mpte = pmap_allocpte(pmap, va, flags); if (mpte == NULL) { KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0, ("pmap_allocpte failed with sleep allowed")); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_RESOURCE_SHORTAGE); } } else { /* * va is for KVA, so pmap_demote_pde() will never fail * to install a page table page. PG_V is also * asserted by pmap_demote_pde(). */ KASSERT(pde != NULL && (*pde & PG_V) != 0, ("KVA %#x invalid pde pdir %#jx", va, (uintmax_t)pmap->pm_pdir[PTDPTDI])); if ((*pde & PG_PS) != 0) pmap_demote_pde(pmap, pde, va); } pte = pmap_pte_quick(pmap, va); /* * Page Directory table entry is not valid, which should not * happen. We should have either allocated the page table * page or demoted the existing mapping above. */ if (pte == NULL) { panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x", (uintmax_t)pmap->pm_pdir[PTDPTDI], va); } pa = VM_PAGE_TO_PHYS(m); om = NULL; origpte = *pte; opa = origpte & PG_FRAME; /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->wire_count--; if (origpte & PG_MANAGED) { om = m; pa |= PG_MANAGED; } goto validate; } pv = NULL; /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (origpte & PG_W) pmap->pm_stats.wired_count--; if (origpte & PG_MANAGED) { om = PHYS_TO_VM_PAGE(opa); pv = pmap_pvh_remove(&om->md, pmap, va); } if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%x", va)); } } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); if (pv == NULL) pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); pa |= PG_MANAGED; } else if (pv != NULL) free_pv_entry(pmap, pv); /* * Increment counters */ if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | pmap_cache_bits(m->md.pat_mode, 0) | PG_V); if ((prot & VM_PROT_WRITE) != 0) { newpte |= PG_RW; if ((newpte & PG_MANAGED) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; #endif if (wired) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { newpte |= PG_A; if ((flags & VM_PROT_WRITE) != 0) newpte |= PG_M; if (origpte & PG_V) { invlva = FALSE; origpte = pte_load_store(pte, newpte); if (origpte & PG_A) { if (origpte & PG_MANAGED) vm_page_aflag_set(om, PGA_REFERENCED); if (opa != VM_PAGE_TO_PHYS(m)) invlva = TRUE; #if defined(PAE) || defined(PAE_TABLES) if ((origpte & PG_NX) == 0 && (newpte & PG_NX) != 0) invlva = TRUE; #endif } if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((origpte & PG_MANAGED) != 0) vm_page_dirty(om); if ((prot & VM_PROT_WRITE) == 0) invlva = TRUE; } if ((origpte & PG_MANAGED) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); if (invlva) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte); } /* * If both the page table page and the reservation are fully * populated, then attempt promotion. */ if ((mpte == NULL || mpte->wire_count == NPTEPG) && pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) pmap_promote_pde(pmap, pde, va); sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); return (KERN_SUCCESS); } /* * Tries to create a 2- or 4MB page mapping. Returns TRUE if successful and * FALSE otherwise. Fails if (1) a page table page cannot be allocated without * blocking, (2) a mapping already exists at the specified virtual address, or * (3) a pv entry cannot be allocated without reclaiming another pv entry. */ static boolean_t pmap_enter_pde(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pd_entry_t *pde, newpde; rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); pde = pmap_pde(pmap, va); if (*pde != 0) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } newpde = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 1) | PG_PS | PG_V; if ((m->oflags & VPO_UNMANAGED) == 0) { newpde |= PG_MANAGED; /* * Abort this mapping if its PV entry could not be created. */ if (!pmap_pv_insert_pde(pmap, va, VM_PAGE_TO_PHYS(m))) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (FALSE); } } #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) newpde |= pg_nx; #endif if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; /* * Increment counters. */ pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; /* * Map the superpage. */ pde_store(pde, newpde); pmap_pde_mappings++; CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx" " in pmap %p", va, pmap); return (TRUE); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_ASSERT_LOCKED(m_start->object); psize = atop(end - start); mpte = NULL; m = m_start; rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pg_ps_enabled && pmap_enter_pde(pmap, va, m, prot)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte); m = TAILQ_NEXT(m, listq); } rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { pt_entry_t *pte; vm_paddr_t pa; struct spglist free; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); rw_assert(&pvh_global_lock, RA_WLOCKED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { u_int ptepindex; pd_entry_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) return (NULL); mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); mpte->wire_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, PMAP_ENTER_NOSLEEP); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->oflags & VPO_UNMANAGED) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpte != NULL) { SLIST_INIT(&free); if (pmap_unwire_ptp(pmap, mpte, &free)) { pmap_invalidate_page(pmap, va); pmap_free_zero_pages(&free); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(m->md.pat_mode, 0); #if defined(PAE) || defined(PAE_TABLES) if ((prot & VM_PROT_EXECUTE) == 0) pa |= pg_nx; #endif /* * Now validate mapping with RO protection */ if ((m->oflags & VPO_UNMANAGED) != 0) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return (mpte); } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { pd_entry_t *pde; vm_paddr_t pa, ptepa; vm_page_t p; int pat_mode; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG, ("pmap_object_init_pt: non-device object")); if (pseflag && (addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) { if (!vm_object_populate(object, pindex, pindex + atop(size))) return; p = vm_page_lookup(object, pindex); KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); pat_mode = p->md.pat_mode; /* * Abort the mapping if the first page is not physically * aligned to a 2/4MB page boundary. */ ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; /* * Skip the first page. Abort the mapping if the rest of * the pages are not physically contiguous or have differing * memory attributes. */ p = TAILQ_NEXT(p, listq); for (pa = ptepa + PAGE_SIZE; pa < ptepa + size; pa += PAGE_SIZE) { KASSERT(p->valid == VM_PAGE_BITS_ALL, ("pmap_object_init_pt: invalid page %p", p)); if (pa != VM_PAGE_TO_PHYS(p) || pat_mode != p->md.pat_mode) return; p = TAILQ_NEXT(p, listq); } /* * Map using 2/4MB pages. Since "ptepa" is 2/4M aligned and * "size" is a multiple of 2/4M, adding the PAT setting to * "pa" will not affect the termination of this loop. */ PMAP_LOCK(pmap); for (pa = ptepa | pmap_cache_bits(pat_mode, 1); pa < ptepa + size; pa += NBPDR) { pde = pmap_pde(pmap, addr); if (*pde == 0) { pde_store(pde, pa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } /* Else continue on if the PDE is already valid. */ addr += NBPDR; } PMAP_UNLOCK(pmap); } } /* * Clear the wired attribute from the mappings for the specified range of * addresses in the given pmap. Every valid mapping within that range * must have the wired attribute set. In contrast, invalid mappings * cannot have the wired attribute set, so they are ignored. * * The wired attribute of the page table entry is not a hardware feature, * so there is no need to invalidate any TLB entries. */ void pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t *pde; pt_entry_t *pte; boolean_t pv_lists_locked; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); if ((*pde & PG_V) == 0) continue; if ((*pde & PG_PS) != 0) { if ((*pde & PG_W) == 0) panic("pmap_unwire: pde %#jx is missing PG_W", (uintmax_t)*pde); /* * Are we unwiring the entire large page? If not, * demote the mapping and fall through. */ if (sva + NBPDR == pdnxt && eva >= pdnxt) { /* * Regardless of whether a pde (or pte) is 32 * or 64 bits in size, PG_W is among the least * significant 32 bits. */ atomic_clear_int((u_int *)pde, PG_W); pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE; continue; } else { if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { PMAP_UNLOCK(pmap); /* Repeat sva. */ goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) panic("pmap_unwire: demotion failed"); } } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & PG_V) == 0) continue; if ((*pte & PG_W) == 0) panic("pmap_unwire: pte %#jx is missing PG_W", (uintmax_t)*pte); /* * PG_W must be cleared atomically. Although the pmap * lock synchronizes access to PG_W, another processor * could be setting PG_M and/or PG_A concurrently. * * PG_W is among the least significant 32 bits. */ atomic_clear_int((u_int *)pte, PG_W); pmap->pm_stats.wired_count--; } } if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct spglist free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; rw_wlock(&pvh_global_lock); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } sched_pin(); for (addr = src_addr; addr < end_addr; addr = pdnxt) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; pd_entry_t srcptepaddr; u_int ptepindex; KASSERT(addr < UPT_MIN_ADDRESS, ("pmap_copy: invalid to pmap_copy page tables")); pdnxt = (addr + NBPDR) & ~PDRMASK; if (pdnxt < addr) pdnxt = end_addr; ptepindex = addr >> PDRSHIFT; srcptepaddr = src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr) continue; if (dst_pmap->pm_pdir[ptepindex] == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr & PG_PS_FRAME))) { dst_pmap->pm_pdir[ptepindex] = srcptepaddr & ~PG_W; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; pmap_pde_mappings++; } continue; } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); KASSERT(srcmpte->wire_count > 0, ("pmap_copy: source page table page is unused")); if (pdnxt > end_addr) pdnxt = end_addr; src_pte = vtopte(addr); while (addr < pdnxt) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { dstmpte = pmap_allocpte(dst_pmap, addr, PMAP_ENTER_NOSLEEP); if (dstmpte == NULL) goto out; dst_pte = pmap_pte_quick(dst_pmap, addr); if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); dst_pmap->pm_stats.resident_count++; } else { SLIST_INIT(&free); if (pmap_unwire_ptp(dst_pmap, dstmpte, &free)) { pmap_invalidate_page(dst_pmap, addr); pmap_free_zero_pages(&free); } goto out; } if (dstmpte->wire_count >= srcmpte->wire_count) break; } addr += PAGE_SIZE; src_pte++; } } out: sched_unpin(); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * Zero 1 page of virtual memory mapped from a hardware page by the caller. */ static __inline void pagezero(void *page) { #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) { -#if defined(CPU_ENABLE_SSE) if (cpu_feature & CPUID_SSE2) sse2_pagezero(page); else -#endif i686_pagezero(page); } else #endif bzero(page, PAGE_SIZE); } /* * Zero the specified hardware page. */ void pmap_zero_page(vm_page_t m) { pt_entry_t *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_zero_page: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); pagezero(pc->pc_cmap_addr2); *cmap_pte2 = 0; /* * Unpin the thread before releasing the lock. Otherwise the thread * could be rescheduled while still bound to the current CPU, only * to unpin itself immediately upon resuming execution. */ sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Zero an an area within a single hardware page. off and size must not * cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { pt_entry_t *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_zero_page_area: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); if (off == 0 && size == PAGE_SIZE) pagezero(pc->pc_cmap_addr2); else bzero(pc->pc_cmap_addr2 + off, size); *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Copy 1 specified hardware page to another. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { pt_entry_t *cmap_pte1, *cmap_pte2; struct pcpu *pc; sched_pin(); pc = get_pcpu(); cmap_pte1 = pc->pc_cmap_pte1; cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte1) panic("pmap_copy_page: CMAP1 busy"); if (*cmap_pte2) panic("pmap_copy_page: CMAP2 busy"); *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A | pmap_cache_bits(src->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr1); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M | pmap_cache_bits(dst->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); bcopy(pc->pc_cmap_addr1, pc->pc_cmap_addr2, PAGE_SIZE); *cmap_pte1 = 0; *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } int unmapped_buf_allowed = 1; void pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[], vm_offset_t b_offset, int xfersize) { vm_page_t a_pg, b_pg; char *a_cp, *b_cp; vm_offset_t a_pg_offset, b_pg_offset; pt_entry_t *cmap_pte1, *cmap_pte2; struct pcpu *pc; int cnt; sched_pin(); pc = get_pcpu(); cmap_pte1 = pc->pc_cmap_pte1; cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte1 != 0) panic("pmap_copy_pages: CMAP1 busy"); if (*cmap_pte2 != 0) panic("pmap_copy_pages: CMAP2 busy"); while (xfersize > 0) { a_pg = ma[a_offset >> PAGE_SHIFT]; a_pg_offset = a_offset & PAGE_MASK; cnt = min(xfersize, PAGE_SIZE - a_pg_offset); b_pg = mb[b_offset >> PAGE_SHIFT]; b_pg_offset = b_offset & PAGE_MASK; cnt = min(cnt, PAGE_SIZE - b_pg_offset); *cmap_pte1 = PG_V | VM_PAGE_TO_PHYS(a_pg) | PG_A | pmap_cache_bits(a_pg->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr1); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(b_pg) | PG_A | PG_M | pmap_cache_bits(b_pg->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); a_cp = pc->pc_cmap_addr1 + a_pg_offset; b_cp = pc->pc_cmap_addr2 + b_pg_offset; bcopy(a_cp, b_cp, cnt); a_offset += cnt; b_offset += cnt; xfersize -= cnt; } *cmap_pte1 = 0; *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; pv_entry_t pv; int loops = 0; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; rw_wlock(&pvh_global_lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; break; } loops++; if (loops >= 16) break; } } rw_wunlock(&pvh_global_lock); return (rv); } /* * pmap_page_wired_mappings: * * Return the number of managed mappings to the given physical page * that are wired. */ int pmap_page_wired_mappings(vm_page_t m) { int count; count = 0; if ((m->oflags & VPO_UNMANAGED) != 0) return (count); rw_wlock(&pvh_global_lock); count = pmap_pvh_wired_mappings(&m->md, count); if ((m->flags & PG_FICTITIOUS) == 0) { count = pmap_pvh_wired_mappings(pa_to_pvh(VM_PAGE_TO_PHYS(m)), count); } rw_wunlock(&pvh_global_lock); return (count); } /* * pmap_pvh_wired_mappings: * * Return the updated number "count" of managed mappings that are wired. */ static int pmap_pvh_wired_mappings(struct md_page *pvh, int count) { pmap_t pmap; pt_entry_t *pte; pv_entry_t pv; rw_assert(&pvh_global_lock, RA_WLOCKED); sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_W) != 0) count++; PMAP_UNLOCK(pmap); } sched_unpin(); return (count); } /* * Returns TRUE if the given page is mapped individually or as part of * a 4mpage. Otherwise, returns FALSE. */ boolean_t pmap_page_is_mapped(vm_page_t m) { boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); rw_wunlock(&pvh_global_lock); return (rv); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pt_entry_t *pte, tpte; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; struct spglist free; int field, idx; int32_t bit; uint32_t inuse, bitmask; int allfree; if (pmap != PCPU_GET(curpmap)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } SLIST_INIT(&free); rw_wlock(&pvh_global_lock); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap, pc->pc_pmap)); allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = ~pc->pc_map[field] & pc_freemask[field]; while (inuse != 0) { bit = bsfl(inuse); bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = pmap_pde(pmap, pv->pv_va); tpte = *pte; if ((tpte & PG_PS) == 0) { pte = vtopte(pv->pv_va); tpte = *pte & ~PG_PTE_PAT; } if (tpte == 0) { printf( "TPTE at %p IS ZERO @ VA %08x\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT((m->flags & PG_FICTITIOUS) != 0 || m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if ((tpte & PG_PS) != 0) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) vm_page_dirty(mt); } else vm_page_dirty(m); } /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; if ((tpte & PG_PS) != 0) { pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); if (TAILQ_EMPTY(&pvh->pv_list)) { for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) if (TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { pmap->pm_stats.resident_count--; KASSERT(mpte->wire_count == NPTEPG, ("pmap_remove_pages: pte page wire count error")); mpte->wire_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); atomic_subtract_int(&vm_cnt.v_wire_count, 1); } } else { pmap->pm_stats.resident_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } pmap_unuse_pt(pmap, pv->pv_va, &free); } } } if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); free_pv_chunk(pc); } } sched_unpin(); pmap_invalidate_all(pmap); rw_wunlock(&pvh_global_lock); PMAP_UNLOCK(pmap); pmap_free_zero_pages(&free); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_modified: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * concurrently set while the object is locked. Thus, if PGA_WRITEABLE * is clear, no PTEs can have PG_M set. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return (FALSE); rw_wlock(&pvh_global_lock); rv = pmap_is_modified_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were used to modify * physical memory. Otherwise, returns FALSE. Both page and 2mpage * mappings are supported. */ static boolean_t pmap_is_modified_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_M | PG_RW)) == (PG_M | PG_RW); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (*pde != 0 && (*pde & PG_PS) == 0) { pte = vtopte(addr); rv = *pte == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * in any physical maps. */ boolean_t pmap_is_referenced(vm_page_t m) { boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_is_referenced: page %p is not managed", m)); rw_wlock(&pvh_global_lock); rv = pmap_is_referenced_pvh(&m->md) || ((m->flags & PG_FICTITIOUS) == 0 && pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))); rw_wunlock(&pvh_global_lock); return (rv); } /* * Returns TRUE if any of the given mappings were referenced and FALSE * otherwise. Both page and 4mpage mappings are supported. */ static boolean_t pmap_is_referenced_pvh(struct md_page *pvh) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rw_assert(&pvh_global_lock, RA_WLOCKED); rv = FALSE; sched_pin(); TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & (PG_A | PG_V)) == (PG_A | PG_V); PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_write: page %p is not managed", m)); /* * If the page is not exclusive busied, then PGA_WRITEABLE cannot be * set by another thread while the object is locked. Thus, * if PGA_WRITEABLE is clear, no page table entries need updating. */ VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde(pmap, pde, va); PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_write: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); retry: oldpte = *pte; if ((oldpte & PG_RW) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * As an optimization, update the page's dirty field if a modified bit is * found while counting reference bits. This opportunistic update can be * performed at low cost and can eliminate the need for some future calls * to pmap_is_modified(). However, since this function stops after * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some * dirty pages. Those dirty pages will only be detected by a future call * to pmap_is_modified(). */ int pmap_ts_referenced(vm_page_t m) { struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; pd_entry_t *pde; pt_entry_t *pte; vm_paddr_t pa; int rtval = 0; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_ts_referenced: page %p is not managed", m)); pa = VM_PAGE_TO_PHYS(m); pvh = pa_to_pvh(pa); rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0 || (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Although "*pde" is mapping a 2/4MB page, because * this function is called at a 4KB page granularity, * we only update the 4KB page under test. */ vm_page_dirty(m); } if ((*pde & PG_A) != 0) { /* * Since this reference bit is shared by either 1024 * or 512 4KB pages, it should not be cleared every * time it is tested. Apply a simple "hash" function * on the physical page number, the virtual superpage * number, and the pmap address to select one 4KB page * out of the 1024 or 512 on which testing the * reference bit will result in clearing that bit. * This function is designed to avoid the selection of * the same 4KB page for every 2- or 4MB page mapping. * * On demotion, a mapping that hasn't been referenced * is simply destroyed. To avoid the possibility of a * subsequent page fault on a demoted wired mapping, * always leave its reference bit set. Moreover, * since the superpage is wired, the current state of * its reference bit won't affect page replacement. */ if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^ (uintptr_t)pmap) & (NPTEPG - 1)) == 0 && (*pde & PG_W) == 0) { atomic_clear_int((u_int *)pde, PG_A); pmap_invalidate_page(pmap, pv->pv_va); } rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); } if (rtval >= PMAP_TS_REFERENCED_MAX) goto out; } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf); small_mappings: if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL) goto out; pv = pvf; do { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_ts_referenced: found a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if ((*pte & PG_A) != 0) { atomic_clear_int((u_int *)pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); rtval++; } PMAP_UNLOCK(pmap); /* Rotate the PV list if it has more than one entry. */ if (TAILQ_NEXT(pv, pv_next) != NULL) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); } } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval < PMAP_TS_REFERENCED_MAX); out: sched_unpin(); rw_wunlock(&pvh_global_lock); return (rtval); } /* * Apply the given advice to the specified range of addresses within the * given pmap. Depending on the advice, clear the referenced and/or * modified flags in each mapping and set the mapped page's dirty field. */ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { pd_entry_t oldpde, *pde; pt_entry_t *pte; vm_offset_t va, pdnxt; vm_page_t m; boolean_t anychanged, pv_lists_locked; if (advice != MADV_DONTNEED && advice != MADV_FREE) return; if (pmap_is_current(pmap)) pv_lists_locked = FALSE; else { pv_lists_locked = TRUE; resume: rw_wlock(&pvh_global_lock); sched_pin(); } anychanged = FALSE; PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; if (pdnxt < sva) pdnxt = eva; pde = pmap_pde(pmap, sva); oldpde = *pde; if ((oldpde & PG_V) == 0) continue; else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; if (!pv_lists_locked) { pv_lists_locked = TRUE; if (!rw_try_wlock(&pvh_global_lock)) { if (anychanged) pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); goto resume; } sched_pin(); } if (!pmap_demote_pde(pmap, pde, sva)) { /* * The large page mapping was destroyed. */ continue; } /* * Unless the page mappings are wired, remove the * mapping to a single page so that a subsequent * access may repromote. Since the underlying page * table page is fully populated, this removal never * frees a page table page. */ if ((oldpde & PG_W) == 0) { pte = pmap_pte_quick(pmap, sva); KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, sva, NULL); anychanged = TRUE; } } if (pdnxt > eva) pdnxt = eva; va = pdnxt; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V)) goto maybe_invlrng; else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { if (advice == MADV_DONTNEED) { /* * Future calls to pmap_is_modified() * can be avoided by making the page * dirty now. */ m = PHYS_TO_VM_PAGE(*pte & PG_FRAME); vm_page_dirty(m); } atomic_clear_int((u_int *)pte, PG_M | PG_A); } else if ((*pte & PG_A) != 0) atomic_clear_int((u_int *)pte, PG_A); else goto maybe_invlrng; if ((*pte & PG_G) != 0) { if (va == pdnxt) va = sva; } else anychanged = TRUE; continue; maybe_invlrng: if (va != pdnxt) { pmap_invalidate_range(pmap, va, sva); va = pdnxt; } } if (va != pdnxt) pmap_invalidate_range(pmap, va, sva); } if (anychanged) pmap_invalidate_all(pmap); if (pv_lists_locked) { sched_unpin(); rw_wunlock(&pvh_global_lock); } PMAP_UNLOCK(pmap); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { struct md_page *pvh; pv_entry_t next_pv, pv; pmap_t pmap; pd_entry_t oldpde, *pde; pt_entry_t oldpte, *pte; vm_offset_t va; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_clear_modify: page %p is not managed", m)); VM_OBJECT_ASSERT_WLOCKED(m->object); KASSERT(!vm_page_xbusied(m), ("pmap_clear_modify: page %p is exclusive busied", m)); /* * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set. * If the object containing the page is locked and the page is not * exclusive busied, then PGA_WRITEABLE cannot be concurrently set. */ if ((m->aflags & PGA_WRITEABLE) == 0) return; rw_wlock(&pvh_global_lock); sched_pin(); if ((m->flags & PG_FICTITIOUS) != 0) goto small_mappings; pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); oldpde = *pde; if ((oldpde & PG_RW) != 0) { if (pmap_demote_pde(pmap, pde, va)) { if ((oldpde & PG_W) == 0) { /* * Write protect the mapping to a * single page so that a subsequent * write access may repromote. */ va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME); pte = pmap_pte_quick(pmap, va); oldpte = *pte; if ((oldpte & PG_V) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ while (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_M | PG_RW))) oldpte = *pte; vm_page_dirty(m); pmap_invalidate_page(pmap, va); } } } } PMAP_UNLOCK(pmap); } small_mappings: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pde = pmap_pde(pmap, pv->pv_va); KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found" " a 4mpage in page %p's pv list", m)); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_M is among the least significant * 32 bits. */ atomic_clear_int((u_int *)pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); rw_wunlock(&pvh_global_lock); } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(pt_entry_t *pte, int cache_bits) { u_int opte, npte; /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~PG_PTE_CACHE; npte |= cache_bits; } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */ static __inline void pmap_pde_attr(pd_entry_t *pde, int cache_bits) { u_int opde, npde; /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~PG_PDE_CACHE; npde |= cache_bits; } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { struct pmap_preinit_mapping *ppim; vm_offset_t va, offset; vm_size_t tmpsize; int i; offset = pa & PAGE_MASK; size = round_page(offset + size); pa = pa & PG_FRAME; if (pa < KERNLOAD && pa + size <= KERNLOAD) va = KERNBASE + pa; else if (!pmap_initialized) { va = 0; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == 0) { ppim->pa = pa; ppim->sz = size; ppim->mode = mode; ppim->va = virtual_avail; virtual_avail += size; va = ppim->va; break; } } if (va == 0) panic("%s: too many preinit mappings", __func__); } else { /* * If we have a preinit mapping, re-use it. */ for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->pa == pa && ppim->sz == size && ppim->mode == mode) return ((void *)(ppim->va + offset)); } va = kva_alloc(size); if (va == 0) panic("%s: Couldn't allocate KVA", __func__); } for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE) pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode); pmap_invalidate_range(kernel_pmap, va, va + tmpsize); pmap_invalidate_cache_range(va, va + size, FALSE); return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { struct pmap_preinit_mapping *ppim; vm_offset_t offset; int i; if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) return; offset = va & PAGE_MASK; size = round_page(offset + size); va = trunc_page(va); for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { ppim = pmap_preinit_mapping + i; if (ppim->va == va && ppim->sz == size) { if (pmap_initialized) return; ppim->pa = 0; ppim->va = 0; ppim->sz = 0; ppim->mode = 0; if (va + size == virtual_avail) virtual_avail = va; return; } } if (pmap_initialized) kva_free(va, size); } /* * Sets the memory attribute for the specified page. */ void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) { m->md.pat_mode = ma; if ((m->flags & PG_FICTITIOUS) != 0) return; /* * If "m" is a normal page, flush it from the cache. * See pmap_invalidate_cache_range(). * * First, try to find an existing mapping of the page by sf * buffer. sf_buf_invalidate_cache() modifies mapping and * flushes the cache. */ if (sf_buf_invalidate_cache(m)) return; /* * If page is not mapped by sf buffer, but CPU does not * support self snoop, map the page transient and do * invalidation. In the worst case, whole cache is flushed by * pmap_invalidate_cache_range(). */ if ((cpu_feature & CPUID_SS) == 0) pmap_flush_page(m); } static void pmap_flush_page(vm_page_t m) { pt_entry_t *cmap_pte2; struct pcpu *pc; vm_offset_t sva, eva; bool useclflushopt; useclflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0; if (useclflushopt || (cpu_feature & CPUID_CLFSH) != 0) { sched_pin(); pc = get_pcpu(); cmap_pte2 = pc->pc_cmap_pte2; mtx_lock(&pc->pc_cmap_lock); if (*cmap_pte2) panic("pmap_flush_page: CMAP2 busy"); *cmap_pte2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(m->md.pat_mode, 0); invlcaddr(pc->pc_cmap_addr2); sva = (vm_offset_t)pc->pc_cmap_addr2; eva = sva + PAGE_SIZE; /* * Use mfence or sfence despite the ordering implied by * mtx_{un,}lock() because clflush on non-Intel CPUs * and clflushopt are not guaranteed to be ordered by * any other instruction. */ if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); for (; sva < eva; sva += cpu_clflush_line_size) { if (useclflushopt) clflushopt(sva); else clflush(sva); } if (useclflushopt) sfence(); else if (cpu_vendor_id != CPU_VENDOR_INTEL) mfence(); *cmap_pte2 = 0; sched_unpin(); mtx_unlock(&pc->pc_cmap_lock); } else pmap_invalidate_cache(); } /* * Changes the specified virtual address range's memory type to that given by * the parameter "mode". The specified virtual address range must be * completely contained within either the kernel map. * * Returns zero if the change completed successfully, and either EINVAL or * ENOMEM if the change failed. Specifically, EINVAL is returned if some part * of the virtual address range was not mapped, and ENOMEM is returned if * there was insufficient memory available to complete the change. */ int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode) { vm_offset_t base, offset, tmpva; pd_entry_t *pde; pt_entry_t *pte; int cache_bits_pte, cache_bits_pde; boolean_t changed; base = trunc_page(va); offset = va & PAGE_MASK; size = round_page(offset + size); /* * Only supported on kernel virtual addresses above the recursive map. */ if (base < VM_MIN_KERNEL_ADDRESS) return (EINVAL); cache_bits_pde = pmap_cache_bits(mode, 1); cache_bits_pte = pmap_cache_bits(mode, 0); changed = FALSE; /* * Pages that aren't mapped aren't supported. Also break down * 2/4MB pages into 4KB pages if required. */ PMAP_LOCK(kernel_pmap); for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } if (*pde & PG_PS) { /* * If the current 2/4MB page already has * the required memory type, then we need not * demote this page. Just increment tmpva to * the next 2/4MB page frame. */ if ((*pde & PG_PDE_CACHE) == cache_bits_pde) { tmpva = trunc_4mpage(tmpva) + NBPDR; continue; } /* * If the current offset aligns with a 2/4MB * page frame and there is at least 2/4MB left * within the range, then we need not break * down this page into 4KB pages. */ if ((tmpva & PDRMASK) == 0 && tmpva + PDRMASK < base + size) { tmpva += NBPDR; continue; } if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) { PMAP_UNLOCK(kernel_pmap); return (ENOMEM); } } pte = vtopte(tmpva); if (*pte == 0) { PMAP_UNLOCK(kernel_pmap); return (EINVAL); } tmpva += PAGE_SIZE; } PMAP_UNLOCK(kernel_pmap); /* * Ok, all the pages exist, so run through them updating their * cache mode if required. */ for (tmpva = base; tmpva < base + size; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde & PG_PS) { if ((*pde & PG_PDE_CACHE) != cache_bits_pde) { pmap_pde_attr(pde, cache_bits_pde); changed = TRUE; } tmpva = trunc_4mpage(tmpva) + NBPDR; } else { pte = vtopte(tmpva); if ((*pte & PG_PTE_CACHE) != cache_bits_pte) { pmap_pte_attr(pte, cache_bits_pte); changed = TRUE; } tmpva += PAGE_SIZE; } } /* * Flush CPU caches to make sure any data isn't cached that * shouldn't be, etc. */ if (changed) { pmap_invalidate_range(kernel_pmap, base, tmpva); pmap_invalidate_cache_range(base, tmpva, FALSE); } return (0); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa) { pd_entry_t *pdep; pt_entry_t *ptep, pte; vm_paddr_t pa; int val; PMAP_LOCK(pmap); retry: pdep = pmap_pde(pmap, addr); if (*pdep != 0) { if (*pdep & PG_PS) { pte = *pdep; /* Compute the physical address of the 4KB page. */ pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) & PG_FRAME; val = MINCORE_SUPER; } else { ptep = pmap_pte(pmap, addr); pte = *ptep; pmap_pte_release(ptep); pa = pte & PG_FRAME; val = 0; } } else { pte = 0; pa = 0; val = 0; } if ((pte & PG_V) != 0) { val |= MINCORE_INCORE; if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER; if ((pte & PG_A) != 0) val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER; } if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) != (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) { /* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */ if (vm_page_pa_tryrelock(pmap, pa, locked_pa)) goto retry; } else PA_UNLOCK_COND(*locked_pa); PMAP_UNLOCK(pmap); return (val); } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int cpuid; u_int32_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); cpuid = PCPU_GET(cpuid); #if defined(SMP) CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active); CPU_SET_ATOMIC(cpuid, &pmap->pm_active); #else CPU_CLR(cpuid, &oldpmap->pm_active); CPU_SET(cpuid, &pmap->pm_active); #endif #if defined(PAE) || defined(PAE_TABLES) cr3 = vtophys(pmap->pm_pdpt); #else cr3 = vtophys(pmap->pm_pdir); #endif /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_cr3 = cr3; load_cr3(cr3); PCPU_SET(curpmap, pmap); critical_exit(); } void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { } /* * Increase the starting virtual address of the given mapping if a * different alignment might result in more superpage mappings. */ void pmap_align_superpage(vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, vm_size_t size) { vm_offset_t superpage_offset; if (size < NBPDR) return; if (object != NULL && (object->flags & OBJ_COLORED) != 0) offset += ptoa(object->pg_color); superpage_offset = offset & PDRMASK; if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR || (*addr & PDRMASK) == superpage_offset) return; if ((*addr & PDRMASK) < superpage_offset) *addr = (*addr & ~PDRMASK) + superpage_offset; else *addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset; } vm_offset_t pmap_quick_enter_page(vm_page_t m) { vm_offset_t qaddr; pt_entry_t *pte; critical_enter(); qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte == 0, ("pmap_quick_enter_page: PTE busy")); *pte = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M | pmap_cache_bits(pmap_page_get_memattr(m), 0); invlpg(qaddr); return (qaddr); } void pmap_quick_remove_page(vm_offset_t addr) { vm_offset_t qaddr; pt_entry_t *pte; qaddr = PCPU_GET(qmap_addr); pte = vtopte(qaddr); KASSERT(*pte != 0, ("pmap_quick_remove_page: PTE not in use")); KASSERT(addr == qaddr, ("pmap_quick_remove_page: invalid address")); *pte = 0; critical_exit(); } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPDEPTD; i++) { pd_entry_t *pde; pt_entry_t *pte; vm_offset_t base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for (j = 0; j < NPTEPG; j++) { vm_offset_t va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return (npte); } pte = pmap_pte(pmap, va); if (pte && pmap_pte_v(pte)) { pt_entry_t pa; vm_page_t m; pa = *pte; m = PHYS_TO_VM_PAGE(pa & PG_FRAME); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } sx_sunlock(&allproc_lock); return (npte); } #endif Index: head/sys/i386/i386/ptrace_machdep.c =================================================================== --- head/sys/i386/i386/ptrace_machdep.c (revision 313153) +++ head/sys/i386/i386/ptrace_machdep.c (revision 313154) @@ -1,206 +1,196 @@ /*- * Copyright (c) 2005 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_cpu.h" #include #include #include #include #include #include #include #include -#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) -#define CPU_ENABLE_SSE -#endif - -#ifdef CPU_ENABLE_SSE static int cpu_ptrace_xstate(struct thread *td, int req, void *addr, int data) { struct ptrace_xstate_info info; char *savefpu; int error; if (!use_xsave) return (EOPNOTSUPP); switch (req) { case PT_GETXSTATE_OLD: npxgetregs(td); savefpu = (char *)(get_pcb_user_save_td(td) + 1); error = copyout(savefpu, addr, cpu_max_ext_state_size - sizeof(union savefpu)); break; case PT_SETXSTATE_OLD: if (data > cpu_max_ext_state_size - sizeof(union savefpu)) { error = EINVAL; break; } savefpu = malloc(data, M_TEMP, M_WAITOK); error = copyin(addr, savefpu, data); if (error == 0) { npxgetregs(td); error = npxsetxstate(td, savefpu, data); } free(savefpu, M_TEMP); break; case PT_GETXSTATE_INFO: if (data != sizeof(info)) { error = EINVAL; break; } info.xsave_len = cpu_max_ext_state_size; info.xsave_mask = xsave_mask; error = copyout(&info, addr, data); break; case PT_GETXSTATE: npxgetregs(td); savefpu = (char *)(get_pcb_user_save_td(td)); error = copyout(savefpu, addr, cpu_max_ext_state_size); break; case PT_SETXSTATE: if (data < sizeof(union savefpu) || data > cpu_max_ext_state_size) { error = EINVAL; break; } savefpu = malloc(data, M_TEMP, M_WAITOK); error = copyin(addr, savefpu, data); if (error == 0) error = npxsetregs(td, (union savefpu *)savefpu, savefpu + sizeof(union savefpu), data - sizeof(union savefpu)); free(savefpu, M_TEMP); break; default: error = EINVAL; break; } return (error); } -#endif static int cpu_ptrace_xmm(struct thread *td, int req, void *addr, int data) { -#ifdef CPU_ENABLE_SSE struct savexmm *fpstate; int error; if (!cpu_fxsr) return (EINVAL); fpstate = &get_pcb_user_save_td(td)->sv_xmm; switch (req) { case PT_GETXMMREGS: npxgetregs(td); error = copyout(fpstate, addr, sizeof(*fpstate)); break; case PT_SETXMMREGS: npxgetregs(td); error = copyin(addr, fpstate, sizeof(*fpstate)); fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask; break; case PT_GETXSTATE_OLD: case PT_SETXSTATE_OLD: case PT_GETXSTATE_INFO: case PT_GETXSTATE: case PT_SETXSTATE: error = cpu_ptrace_xstate(td, req, addr, data); break; default: return (EINVAL); } return (error); -#else - return (EINVAL); -#endif } int cpu_ptrace(struct thread *td, int req, void *addr, int data) { struct segment_descriptor *sdp, sd; register_t r; int error; switch (req) { case PT_GETXMMREGS: case PT_SETXMMREGS: case PT_GETXSTATE_OLD: case PT_SETXSTATE_OLD: case PT_GETXSTATE_INFO: case PT_GETXSTATE: case PT_SETXSTATE: error = cpu_ptrace_xmm(td, req, addr, data); break; case PT_GETFSBASE: case PT_GETGSBASE: sdp = req == PT_GETFSBASE ? &td->td_pcb->pcb_fsd : &td->td_pcb->pcb_gsd; r = sdp->sd_hibase << 24 | sdp->sd_lobase; error = copyout(&r, addr, sizeof(r)); break; case PT_SETFSBASE: case PT_SETGSBASE: error = copyin(addr, &r, sizeof(r)); if (error != 0) break; fill_based_sd(&sd, r); if (req == PT_SETFSBASE) { td->td_pcb->pcb_fsd = sd; td->td_frame->tf_fs = GSEL(GUFS_SEL, SEL_UPL); } else { td->td_pcb->pcb_gsd = sd; td->td_pcb->pcb_gs = GSEL(GUGS_SEL, SEL_UPL); } break; default: return (EINVAL); } return (error); } Index: head/sys/i386/i386/support.s =================================================================== --- head/sys/i386/i386/support.s (revision 313153) +++ head/sys/i386/i386/support.s (revision 313154) @@ -1,841 +1,839 @@ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ -#include "opt_npx.h" - #include #include #include #include #include "assym.s" #define IDXSHIFT 10 .text /* * bcopy family * void bzero(void *buf, u_int len) */ ENTRY(bzero) pushl %edi movl 8(%esp),%edi movl 12(%esp),%ecx xorl %eax,%eax shrl $2,%ecx cld rep stosl movl 12(%esp),%ecx andl $3,%ecx rep stosb popl %edi ret END(bzero) ENTRY(sse2_pagezero) pushl %ebx movl 8(%esp),%ecx movl %ecx,%eax addl $4096,%eax xor %ebx,%ebx jmp 1f /* * The loop takes 14 bytes. Ensure that it doesn't cross a 16-byte * cache line. */ .p2align 4,0x90 1: movnti %ebx,(%ecx) movnti %ebx,4(%ecx) addl $8,%ecx cmpl %ecx,%eax jne 1b sfence popl %ebx ret END(sse2_pagezero) ENTRY(i686_pagezero) pushl %edi pushl %ebx movl 12(%esp),%edi movl $1024,%ecx cld ALIGN_TEXT 1: xorl %eax,%eax repe scasl jnz 2f popl %ebx popl %edi ret ALIGN_TEXT 2: incl %ecx subl $4,%edi movl %ecx,%edx cmpl $16,%ecx jge 3f movl %edi,%ebx andl $0x3f,%ebx shrl %ebx shrl %ebx movl $16,%ecx subl %ebx,%ecx 3: subl %ecx,%edx rep stosl movl %edx,%ecx testl %edx,%edx jnz 1b popl %ebx popl %edi ret END(i686_pagezero) /* fillw(pat, base, cnt) */ ENTRY(fillw) pushl %edi movl 8(%esp),%eax movl 12(%esp),%edi movl 16(%esp),%ecx cld rep stosw popl %edi ret END(fillw) ENTRY(bcopyb) pushl %esi pushl %edi movl 12(%esp),%esi movl 16(%esp),%edi movl 20(%esp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f cld /* nope, copy forwards */ rep movsb popl %edi popl %esi ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards. */ addl %ecx,%esi decl %edi decl %esi std rep movsb popl %edi popl %esi cld ret END(bcopyb) /* * bcopy(src, dst, cnt) * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 */ ENTRY(bcopy) pushl %ebp movl %esp,%ebp pushl %esi pushl %edi movl 8(%ebp),%esi movl 12(%ebp),%edi movl 16(%ebp),%ecx movl %edi,%eax subl %esi,%eax cmpl %ecx,%eax /* overlapping && src < dst? */ jb 1f shrl $2,%ecx /* copy by 32-bit words */ cld /* nope, copy forwards */ rep movsl movl 16(%ebp),%ecx andl $3,%ecx /* any bytes left? */ rep movsb popl %edi popl %esi popl %ebp ret ALIGN_TEXT 1: addl %ecx,%edi /* copy backwards */ addl %ecx,%esi decl %edi decl %esi andl $3,%ecx /* any fractional bytes? */ std rep movsb movl 16(%ebp),%ecx /* copy remainder by 32-bit words */ shrl $2,%ecx subl $3,%esi subl $3,%edi rep movsl popl %edi popl %esi cld popl %ebp ret END(bcopy) /* * Note: memcpy does not support overlapping copies */ ENTRY(memcpy) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%ecx movl %edi,%eax shrl $2,%ecx /* copy by 32-bit words */ cld /* nope, copy forwards */ rep movsl movl 20(%esp),%ecx andl $3,%ecx /* any bytes left? */ rep movsb popl %esi popl %edi ret END(memcpy) /*****************************************************************************/ /* copyout and fubyte family */ /*****************************************************************************/ /* * Access user memory from inside the kernel. These routines and possibly * the math- and DOS emulators should be the only places that do this. * * We have to access the memory with user's permissions, so use a segment * selector with RPL 3. For writes to user space we have to additionally * check the PTE for write permission, because the 386 does not check * write permissions when we are executing with EPL 0. The 486 does check * this if the WP bit is set in CR0, so we can use a simpler version here. * * These routines set curpcb->pcb_onfault for the time they execute. When a * protection violation occurs inside the functions, the trap handler * returns to *curpcb->pcb_onfault instead of the function. */ /* * copyout(from_kernel, to_user, len) - MP SAFE */ ENTRY(copyout) movl PCPU(CURPCB),%eax movl $copyout_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi pushl %ebx movl 16(%esp),%esi movl 20(%esp),%edi movl 24(%esp),%ebx testl %ebx,%ebx /* anything to do? */ jz done_copyout /* * Check explicitly for non-user addresses. If 486 write protection * is being used, this check is essential because we are in kernel * mode so the h/w does not provide any protection against writing * kernel addresses. */ /* * First, prevent address wrapping. */ movl %edi,%eax addl %ebx,%eax jc copyout_fault /* * XXX STOP USING VM_MAXUSER_ADDRESS. * It is an end address, not a max, so every time it is used correctly it * looks like there is an off by one error, and of course it caused an off * by one error in several places. */ cmpl $VM_MAXUSER_ADDRESS,%eax ja copyout_fault /* bcopy(%esi, %edi, %ebx) */ movl %ebx,%ecx shrl $2,%ecx cld rep movsl movb %bl,%cl andb $3,%cl rep movsb done_copyout: popl %ebx popl %edi popl %esi xorl %eax,%eax movl PCPU(CURPCB),%edx movl %eax,PCB_ONFAULT(%edx) ret END(copyout) ALIGN_TEXT copyout_fault: popl %ebx popl %edi popl %esi movl PCPU(CURPCB),%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret /* * copyin(from_user, to_kernel, len) - MP SAFE */ ENTRY(copyin) movl PCPU(CURPCB),%eax movl $copyin_fault,PCB_ONFAULT(%eax) pushl %esi pushl %edi movl 12(%esp),%esi /* caddr_t from */ movl 16(%esp),%edi /* caddr_t to */ movl 20(%esp),%ecx /* size_t len */ /* * make sure address is valid */ movl %esi,%edx addl %ecx,%edx jc copyin_fault cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld rep movsl movb %al,%cl andb $3,%cl /* copy remaining bytes */ rep movsb popl %edi popl %esi xorl %eax,%eax movl PCPU(CURPCB),%edx movl %eax,PCB_ONFAULT(%edx) ret END(copyin) ALIGN_TEXT copyin_fault: popl %edi popl %esi movl PCPU(CURPCB),%edx movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret /* * casueword. Compare and set user word. Returns -1 on fault, * 0 on non-faulting access. The current value is in *oldp. */ ALTENTRY(casueword32) ENTRY(casueword) movl PCPU(CURPCB),%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx /* dst */ movl 8(%esp),%eax /* old */ movl 16(%esp),%ecx /* new */ cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */ ja fusufault #ifdef SMP lock #endif cmpxchgl %ecx,(%edx) /* Compare and set. */ /* * The old value is in %eax. If the store succeeded it will be the * value we expected (old) from before the store, otherwise it will * be the current value. */ movl PCPU(CURPCB),%ecx movl $0,PCB_ONFAULT(%ecx) movl 12(%esp),%edx /* oldp */ movl %eax,(%edx) xorl %eax,%eax ret END(casueword32) END(casueword) /* * Fetch (load) a 32-bit word, a 16-bit word, or an 8-bit byte from user * memory. */ ALTENTRY(fueword32) ENTRY(fueword) movl PCPU(CURPCB),%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx /* from */ cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address is valid */ ja fusufault movl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) movl 8(%esp),%edx movl %eax,(%edx) xorl %eax,%eax ret END(fueword32) END(fueword) /* * fuswintr() and suswintr() are specialized variants of fuword16() and * suword16(), respectively. They are called from the profiling code, * potentially at interrupt time. If they fail, that's okay; good things * will happen later. They always fail for now, until the trap code is * able to deal with this. */ ALTENTRY(suswintr) ENTRY(fuswintr) movl $-1,%eax ret END(suswintr) END(fuswintr) ENTRY(fuword16) movl PCPU(CURPCB),%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-2,%edx ja fusufault movzwl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret END(fuword16) ENTRY(fubyte) movl PCPU(CURPCB),%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-1,%edx ja fusufault movzbl (%edx),%eax movl $0,PCB_ONFAULT(%ecx) ret END(fubyte) ALIGN_TEXT fusufault: movl PCPU(CURPCB),%ecx xorl %eax,%eax movl %eax,PCB_ONFAULT(%ecx) decl %eax ret /* * Store a 32-bit word, a 16-bit word, or an 8-bit byte to user memory. * All these functions are MPSAFE. */ ALTENTRY(suword32) ENTRY(suword) movl PCPU(CURPCB),%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-4,%edx /* verify address validity */ ja fusufault movl 8(%esp),%eax movl %eax,(%edx) xorl %eax,%eax movl PCPU(CURPCB),%ecx movl %eax,PCB_ONFAULT(%ecx) ret END(suword32) END(suword) ENTRY(suword16) movl PCPU(CURPCB),%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-2,%edx /* verify address validity */ ja fusufault movw 8(%esp),%ax movw %ax,(%edx) xorl %eax,%eax movl PCPU(CURPCB),%ecx /* restore trashed register */ movl %eax,PCB_ONFAULT(%ecx) ret END(suword16) ENTRY(subyte) movl PCPU(CURPCB),%ecx movl $fusufault,PCB_ONFAULT(%ecx) movl 4(%esp),%edx cmpl $VM_MAXUSER_ADDRESS-1,%edx /* verify address validity */ ja fusufault movb 8(%esp),%al movb %al,(%edx) xorl %eax,%eax movl PCPU(CURPCB),%ecx /* restore trashed register */ movl %eax,PCB_ONFAULT(%ecx) ret END(subyte) /* * copyinstr(from, to, maxlen, int *lencopied) - MP SAFE * * copy a string from from to to, stop when a 0 character is reached. * return ENAMETOOLONG if string is longer than maxlen, and * EFAULT on protection violations. If lencopied is non-zero, * return the actual length in *lencopied. */ ENTRY(copyinstr) pushl %esi pushl %edi movl PCPU(CURPCB),%ecx movl $cpystrflt,PCB_ONFAULT(%ecx) movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ movl 20(%esp),%edx /* %edx = maxlen */ movl $VM_MAXUSER_ADDRESS,%eax /* make sure 'from' is within bounds */ subl %esi,%eax jbe cpystrflt /* restrict maxlen to <= VM_MAXUSER_ADDRESS-from */ cmpl %edx,%eax jae 1f movl %eax,%edx movl %eax,20(%esp) 1: incl %edx cld 2: decl %edx jz 3f lodsb stosb orb %al,%al jnz 2b /* Success -- 0 byte reached */ decl %edx xorl %eax,%eax jmp cpystrflt_x 3: /* edx is zero - return ENAMETOOLONG or EFAULT */ cmpl $VM_MAXUSER_ADDRESS,%esi jae cpystrflt 4: movl $ENAMETOOLONG,%eax jmp cpystrflt_x cpystrflt: movl $EFAULT,%eax cpystrflt_x: /* set *lencopied and return %eax */ movl PCPU(CURPCB),%ecx movl $0,PCB_ONFAULT(%ecx) movl 20(%esp),%ecx subl %edx,%ecx movl 24(%esp),%edx testl %edx,%edx jz 1f movl %ecx,(%edx) 1: popl %edi popl %esi ret END(copyinstr) /* * copystr(from, to, maxlen, int *lencopied) - MP SAFE */ ENTRY(copystr) pushl %esi pushl %edi movl 12(%esp),%esi /* %esi = from */ movl 16(%esp),%edi /* %edi = to */ movl 20(%esp),%edx /* %edx = maxlen */ incl %edx cld 1: decl %edx jz 4f lodsb stosb orb %al,%al jnz 1b /* Success -- 0 byte reached */ decl %edx xorl %eax,%eax jmp 6f 4: /* edx is zero -- return ENAMETOOLONG */ movl $ENAMETOOLONG,%eax 6: /* set *lencopied and return %eax */ movl 20(%esp),%ecx subl %edx,%ecx movl 24(%esp),%edx testl %edx,%edx jz 7f movl %ecx,(%edx) 7: popl %edi popl %esi ret END(copystr) ENTRY(bcmp) pushl %edi pushl %esi movl 12(%esp),%edi movl 16(%esp),%esi movl 20(%esp),%edx movl %edx,%ecx shrl $2,%ecx cld /* compare forwards */ repe cmpsl jne 1f movl %edx,%ecx andl $3,%ecx repe cmpsb 1: setne %al movsbl %al,%eax popl %esi popl %edi ret END(bcmp) /* * Handling of special 386 registers and descriptor tables etc */ /* void lgdt(struct region_descriptor *rdp); */ ENTRY(lgdt) /* reload the descriptor table */ movl 4(%esp),%eax lgdt (%eax) /* flush the prefetch q */ jmp 1f nop 1: /* reload "stale" selectors */ movl $KDSEL,%eax movl %eax,%ds movl %eax,%es movl %eax,%gs movl %eax,%ss movl $KPSEL,%eax movl %eax,%fs /* reload code selector by turning return into intersegmental return */ movl (%esp),%eax pushl %eax movl $KCSEL,4(%esp) MEXITCOUNT lret END(lgdt) /* ssdtosd(*ssdp,*sdp) */ ENTRY(ssdtosd) pushl %ebx movl 8(%esp),%ecx movl 8(%ecx),%ebx shll $16,%ebx movl (%ecx),%edx roll $16,%edx movb %dh,%bl movb %dl,%bh rorl $8,%ebx movl 4(%ecx),%eax movw %ax,%dx andl $0xf0000,%eax orl %eax,%ebx movl 12(%esp),%ecx movl %edx,(%ecx) movl %ebx,4(%ecx) popl %ebx ret END(ssdtosd) /* void reset_dbregs() */ ENTRY(reset_dbregs) movl $0,%eax movl %eax,%dr7 /* disable all breakpoints first */ movl %eax,%dr0 movl %eax,%dr1 movl %eax,%dr2 movl %eax,%dr3 movl %eax,%dr6 ret END(reset_dbregs) /*****************************************************************************/ /* setjump, longjump */ /*****************************************************************************/ ENTRY(setjmp) movl 4(%esp),%eax movl %ebx,(%eax) /* save ebx */ movl %esp,4(%eax) /* save esp */ movl %ebp,8(%eax) /* save ebp */ movl %esi,12(%eax) /* save esi */ movl %edi,16(%eax) /* save edi */ movl (%esp),%edx /* get rta */ movl %edx,20(%eax) /* save eip */ xorl %eax,%eax /* return(0); */ ret END(setjmp) ENTRY(longjmp) movl 4(%esp),%eax movl (%eax),%ebx /* restore ebx */ movl 4(%eax),%esp /* restore esp */ movl 8(%eax),%ebp /* restore ebp */ movl 12(%eax),%esi /* restore esi */ movl 16(%eax),%edi /* restore edi */ movl 20(%eax),%edx /* get rta */ movl %edx,(%esp) /* put in return frame */ xorl %eax,%eax /* return(1); */ incl %eax ret END(longjmp) /* * Support for reading MSRs in the safe manner. */ ENTRY(rdmsr_safe) /* int rdmsr_safe(u_int msr, uint64_t *data) */ movl PCPU(CURPCB),%ecx movl $msr_onfault,PCB_ONFAULT(%ecx) movl 4(%esp),%ecx rdmsr movl 8(%esp),%ecx movl %eax,(%ecx) movl %edx,4(%ecx) xorl %eax,%eax movl PCPU(CURPCB),%ecx movl %eax,PCB_ONFAULT(%ecx) ret /* * Support for writing MSRs in the safe manner. */ ENTRY(wrmsr_safe) /* int wrmsr_safe(u_int msr, uint64_t data) */ movl PCPU(CURPCB),%ecx movl $msr_onfault,PCB_ONFAULT(%ecx) movl 4(%esp),%ecx movl 8(%esp),%eax movl 12(%esp),%edx wrmsr xorl %eax,%eax movl PCPU(CURPCB),%ecx movl %eax,PCB_ONFAULT(%ecx) ret /* * MSR operations fault handler */ ALIGN_TEXT msr_onfault: movl PCPU(CURPCB),%ecx movl $0,PCB_ONFAULT(%ecx) movl $EFAULT,%eax ret Index: head/sys/i386/i386/swtch.s =================================================================== --- head/sys/i386/i386/swtch.s (revision 313153) +++ head/sys/i386/i386/swtch.s (revision 313154) @@ -1,471 +1,468 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ -#include "opt_npx.h" #include "opt_sched.h" #include #include "assym.s" #if defined(SMP) && defined(SCHED_ULE) #define SETOP xchgl #define BLOCK_SPIN(reg) \ movl $blocked_lock,%eax ; \ 100: ; \ lock ; \ cmpxchgl %eax,TD_LOCK(reg) ; \ jne 101f ; \ pause ; \ jmp 100b ; \ 101: #else #define SETOP movl #define BLOCK_SPIN(reg) #endif /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ .text /* * cpu_throw() * * This is the second half of cpu_switch(). It is used when the current * thread is either a dummy or slated to die, and we no longer care * about its state. This is only a slight optimization and is probably * not worth it anymore. Note that we need to clear the pm_active bits so * we do need the old proc if it still exists. * 0(%esp) = ret * 4(%esp) = oldtd * 8(%esp) = newtd */ ENTRY(cpu_throw) movl PCPU(CPUID), %esi movl 4(%esp),%ecx /* Old thread */ testl %ecx,%ecx /* no thread? */ jz 1f /* release bit from old pm_active */ movl PCPU(CURPMAP), %ebx #ifdef SMP lock #endif btrl %esi, PM_ACTIVE(%ebx) /* clear old */ 1: movl 8(%esp),%ecx /* New thread */ movl TD_PCB(%ecx),%edx movl PCB_CR3(%edx),%eax movl %eax,%cr3 /* set bit in new pm_active */ movl TD_PROC(%ecx),%eax movl P_VMSPACE(%eax), %ebx addl $VM_PMAP, %ebx movl %ebx, PCPU(CURPMAP) #ifdef SMP lock #endif btsl %esi, PM_ACTIVE(%ebx) /* set new */ jmp sw1 END(cpu_throw) /* * cpu_switch(old, new) * * Save the current thread state, then select the next thread to run * and load its state. * 0(%esp) = ret * 4(%esp) = oldtd * 8(%esp) = newtd * 12(%esp) = newlock */ ENTRY(cpu_switch) /* Switch to new thread. First, save context. */ movl 4(%esp),%ecx #ifdef INVARIANTS testl %ecx,%ecx /* no thread? */ jz badsw2 /* no, panic */ #endif movl TD_PCB(%ecx),%edx movl (%esp),%eax /* Hardware registers */ movl %eax,PCB_EIP(%edx) movl %ebx,PCB_EBX(%edx) movl %esp,PCB_ESP(%edx) movl %ebp,PCB_EBP(%edx) movl %esi,PCB_ESI(%edx) movl %edi,PCB_EDI(%edx) mov %gs,PCB_GS(%edx) /* Test if debug registers should be saved. */ testl $PCB_DBREGS,PCB_FLAGS(%edx) jz 1f /* no, skip over */ movl %dr7,%eax /* yes, do the save */ movl %eax,PCB_DR7(%edx) andl $0x0000fc00, %eax /* disable all watchpoints */ movl %eax,%dr7 movl %dr6,%eax movl %eax,PCB_DR6(%edx) movl %dr3,%eax movl %eax,PCB_DR3(%edx) movl %dr2,%eax movl %eax,PCB_DR2(%edx) movl %dr1,%eax movl %eax,PCB_DR1(%edx) movl %dr0,%eax movl %eax,PCB_DR0(%edx) 1: -#ifdef DEV_NPX /* have we used fp, and need a save? */ cmpl %ecx,PCPU(FPCURTHREAD) jne 1f pushl PCB_SAVEFPU(%edx) /* h/w bugs make saving complicated */ call npxsave /* do it in a big C function */ popl %eax 1: -#endif /* Save is done. Now fire up new thread. Leave old vmspace. */ movl 4(%esp),%edi movl 8(%esp),%ecx /* New thread */ movl 12(%esp),%esi /* New lock */ #ifdef INVARIANTS testl %ecx,%ecx /* no thread? */ jz badsw3 /* no, panic */ #endif movl TD_PCB(%ecx),%edx /* switch address space */ movl PCB_CR3(%edx),%eax movl %cr3,%ebx /* The same address space? */ cmpl %ebx,%eax je sw0 movl %eax,%cr3 /* new address space */ movl %esi,%eax movl PCPU(CPUID),%esi SETOP %eax,TD_LOCK(%edi) /* Switchout td_lock */ /* Release bit from old pmap->pm_active */ movl PCPU(CURPMAP), %ebx #ifdef SMP lock #endif btrl %esi, PM_ACTIVE(%ebx) /* clear old */ /* Set bit in new pmap->pm_active */ movl TD_PROC(%ecx),%eax /* newproc */ movl P_VMSPACE(%eax), %ebx addl $VM_PMAP, %ebx movl %ebx, PCPU(CURPMAP) #ifdef SMP lock #endif btsl %esi, PM_ACTIVE(%ebx) /* set new */ jmp sw1 sw0: SETOP %esi,TD_LOCK(%edi) /* Switchout td_lock */ sw1: BLOCK_SPIN(%ecx) /* * At this point, we've switched address spaces and are ready * to load up the rest of the next context. */ cmpl $0, PCB_EXT(%edx) /* has pcb extension? */ je 1f /* If not, use the default */ movl $1, PCPU(PRIVATE_TSS) /* mark use of private tss */ movl PCB_EXT(%edx), %edi /* new tss descriptor */ jmp 2f /* Load it up */ 1: /* * Use the common default TSS instead of our own. * Set our stack pointer into the TSS, it's set to just * below the PCB. In C, common_tss.tss_esp0 = &pcb - 16; */ leal -16(%edx), %ebx /* leave space for vm86 */ movl %ebx, PCPU(COMMON_TSS) + TSS_ESP0 /* * Test this CPU's bit in the bitmap to see if this * CPU was using a private TSS. */ cmpl $0, PCPU(PRIVATE_TSS) /* Already using the common? */ je 3f /* if so, skip reloading */ movl $0, PCPU(PRIVATE_TSS) PCPU_ADDR(COMMON_TSSD, %edi) 2: /* Move correct tss descriptor into GDT slot, then reload tr. */ movl PCPU(TSS_GDT), %ebx /* entry in GDT */ movl 0(%edi), %eax movl 4(%edi), %esi movl %eax, 0(%ebx) movl %esi, 4(%ebx) movl $GPROC0_SEL*8, %esi /* GSEL(GPROC0_SEL, SEL_KPL) */ ltr %si 3: /* Copy the %fs and %gs selectors into this pcpu gdt */ leal PCB_FSD(%edx), %esi movl PCPU(FSGS_GDT), %edi movl 0(%esi), %eax /* %fs selector */ movl 4(%esi), %ebx movl %eax, 0(%edi) movl %ebx, 4(%edi) movl 8(%esi), %eax /* %gs selector, comes straight after */ movl 12(%esi), %ebx movl %eax, 8(%edi) movl %ebx, 12(%edi) /* Restore context. */ movl PCB_EBX(%edx),%ebx movl PCB_ESP(%edx),%esp movl PCB_EBP(%edx),%ebp movl PCB_ESI(%edx),%esi movl PCB_EDI(%edx),%edi movl PCB_EIP(%edx),%eax movl %eax,(%esp) movl %edx, PCPU(CURPCB) movl TD_TID(%ecx),%eax movl %ecx, PCPU(CURTHREAD) /* into next thread */ /* * Determine the LDT to use and load it if is the default one and * that is not the current one. */ movl TD_PROC(%ecx),%eax cmpl $0,P_MD+MD_LDT(%eax) jnz 1f movl _default_ldt,%eax cmpl PCPU(CURRENTLDT),%eax je 2f lldt _default_ldt movl %eax,PCPU(CURRENTLDT) jmp 2f 1: /* Load the LDT when it is not the default one. */ pushl %edx /* Preserve pointer to pcb. */ addl $P_MD,%eax /* Pointer to mdproc is arg. */ pushl %eax call set_user_ldt addl $4,%esp popl %edx 2: /* This must be done after loading the user LDT. */ .globl cpu_switch_load_gs cpu_switch_load_gs: mov PCB_GS(%edx),%gs /* Test if debug registers should be restored. */ testl $PCB_DBREGS,PCB_FLAGS(%edx) jz 1f /* * Restore debug registers. The special code for dr7 is to * preserve the current values of its reserved bits. */ movl PCB_DR6(%edx),%eax movl %eax,%dr6 movl PCB_DR3(%edx),%eax movl %eax,%dr3 movl PCB_DR2(%edx),%eax movl %eax,%dr2 movl PCB_DR1(%edx),%eax movl %eax,%dr1 movl PCB_DR0(%edx),%eax movl %eax,%dr0 movl %dr7,%eax andl $0x0000fc00,%eax movl PCB_DR7(%edx),%ecx andl $~0x0000fc00,%ecx orl %ecx,%eax movl %eax,%dr7 1: ret #ifdef INVARIANTS badsw1: pushal pushl $sw0_1 call panic sw0_1: .asciz "cpu_throw: no newthread supplied" badsw2: pushal pushl $sw0_2 call panic sw0_2: .asciz "cpu_switch: no curthread supplied" badsw3: pushal pushl $sw0_3 call panic sw0_3: .asciz "cpu_switch: no newthread supplied" #endif END(cpu_switch) /* * savectx(pcb) * Update pcb, saving current processor state. */ ENTRY(savectx) /* Fetch PCB. */ movl 4(%esp),%ecx /* Save caller's return address. Child won't execute this routine. */ movl (%esp),%eax movl %eax,PCB_EIP(%ecx) movl %cr3,%eax movl %eax,PCB_CR3(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) mov %gs,PCB_GS(%ecx) movl %cr0,%eax movl %eax,PCB_CR0(%ecx) movl %cr2,%eax movl %eax,PCB_CR2(%ecx) movl %cr4,%eax movl %eax,PCB_CR4(%ecx) movl %dr0,%eax movl %eax,PCB_DR0(%ecx) movl %dr1,%eax movl %eax,PCB_DR1(%ecx) movl %dr2,%eax movl %eax,PCB_DR2(%ecx) movl %dr3,%eax movl %eax,PCB_DR3(%ecx) movl %dr6,%eax movl %eax,PCB_DR6(%ecx) movl %dr7,%eax movl %eax,PCB_DR7(%ecx) mov %ds,PCB_DS(%ecx) mov %es,PCB_ES(%ecx) mov %fs,PCB_FS(%ecx) mov %ss,PCB_SS(%ecx) sgdt PCB_GDT(%ecx) sidt PCB_IDT(%ecx) sldt PCB_LDT(%ecx) str PCB_TR(%ecx) movl $1,%eax ret END(savectx) /* * resumectx(pcb) __fastcall * Resuming processor state from pcb. */ ENTRY(resumectx) /* Restore GDT. */ lgdt PCB_GDT(%ecx) /* Restore segment registers */ movzwl PCB_DS(%ecx),%eax mov %ax,%ds movzwl PCB_ES(%ecx),%eax mov %ax,%es movzwl PCB_FS(%ecx),%eax mov %ax,%fs movzwl PCB_GS(%ecx),%eax movw %ax,%gs movzwl PCB_SS(%ecx),%eax mov %ax,%ss /* Restore CR2, CR4, CR3 and CR0 */ movl PCB_CR2(%ecx),%eax movl %eax,%cr2 movl PCB_CR4(%ecx),%eax movl %eax,%cr4 movl PCB_CR3(%ecx),%eax movl %eax,%cr3 movl PCB_CR0(%ecx),%eax movl %eax,%cr0 jmp 1f 1: /* Restore descriptor tables */ lidt PCB_IDT(%ecx) lldt PCB_LDT(%ecx) #define SDT_SYS386TSS 9 #define SDT_SYS386BSY 11 /* Clear "task busy" bit and reload TR */ movl PCPU(TSS_GDT),%eax andb $(~SDT_SYS386BSY | SDT_SYS386TSS),5(%eax) movzwl PCB_TR(%ecx),%eax ltr %ax #undef SDT_SYS386TSS #undef SDT_SYS386BSY /* Restore debug registers */ movl PCB_DR0(%ecx),%eax movl %eax,%dr0 movl PCB_DR1(%ecx),%eax movl %eax,%dr1 movl PCB_DR2(%ecx),%eax movl %eax,%dr2 movl PCB_DR3(%ecx),%eax movl %eax,%dr3 movl PCB_DR6(%ecx),%eax movl %eax,%dr6 movl PCB_DR7(%ecx),%eax movl %eax,%dr7 /* Restore other registers */ movl PCB_EDI(%ecx),%edi movl PCB_ESI(%ecx),%esi movl PCB_EBP(%ecx),%ebp movl PCB_ESP(%ecx),%esp movl PCB_EBX(%ecx),%ebx /* reload code selector by turning return into intersegmental return */ pushl PCB_EIP(%ecx) movl $KCSEL,4(%esp) xorl %eax,%eax lret END(resumectx) Index: head/sys/i386/i386/trap.c =================================================================== --- head/sys/i386/i386/trap.c (revision 313153) +++ head/sys/i386/i386/trap.c (revision 313154) @@ -1,1137 +1,1124 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); /* * 386 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_hwpmc_hooks.h" #include "opt_isa.h" #include "opt_kdb.h" -#include "opt_npx.h" #include "opt_stack.h" #include "opt_trap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , page_fault, all); PMC_SOFT_DEFINE( , , page_fault, read); PMC_SOFT_DEFINE( , , page_fault, write); #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #ifdef POWERFAIL_NMI #include #include #endif #ifdef KDTRACE_HOOKS #include #endif extern void trap(struct trapframe *frame); extern void syscall(struct trapframe *frame); static int trap_pfault(struct trapframe *, int, vm_offset_t); static void trap_fatal(struct trapframe *, vm_offset_t); void dblfault_handler(void); extern inthand_t IDTVEC(lcall_syscall); #define MAX_TRAP_MSG 32 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "", /* 7 unused */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ "SIMD floating-point exception", /* 29 T_XMMFLT */ "reserved (unknown) fault", /* 30 T_RESERVED */ "", /* 31 unused (reserved) */ "DTrace pid return trap", /* 32 T_DTRACE_RET */ }; #if defined(I586_CPU) && !defined(NO_F00F_HACK) int has_f00f_bug = 0; /* Initialized so that it can be patched. */ #endif static int prot_fault_translation = 0; SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW, &prot_fault_translation, 0, "Select signal to deliver on protection fault"); static int uprintf_signal; SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW, &uprintf_signal, 0, "Print debugging information on trap signal to ctty"); /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(struct trapframe *frame) { #ifdef KDTRACE_HOOKS struct reg regs; #endif struct thread *td = curthread; struct proc *p = td->td_proc; #ifdef KDB register_t dr6; #endif int i = 0, ucode = 0; u_int type; register_t addr = 0; vm_offset_t eva; ksiginfo_t ksi; #ifdef POWERFAIL_NMI static int lastalert = 0; #endif PCPU_INC(cnt.v_trap); type = frame->tf_trapno; #ifdef SMP /* Handler for NMI IPIs used for stopping CPUs. */ if (type == T_NMI) { if (ipi_nmi_handler() == 0) goto out; } #endif /* SMP */ #ifdef KDB if (kdb_active) { kdb_reenter(); goto out; } #endif if (type == T_RESERVED) { trap_fatal(frame, 0); goto out; } if (type == T_NMI) { #ifdef HWPMC_HOOKS /* * CPU PMCs interrupt using an NMI so we check for that first. * If the HWPMC module is active, 'pmc_hook' will point to * the function to be called. A non-zero return value from the * hook means that the NMI was consumed by it and that we can * return immediately. */ if (pmc_intr != NULL && (*pmc_intr)(PCPU_GET(cpuid), frame) != 0) goto out; #endif #ifdef STACK if (stack_nmi_handler(frame) != 0) goto out; #endif } if (type == T_MCHK) { mca_intr(); goto out; } #ifdef KDTRACE_HOOKS /* * A trap can occur while DTrace executes a probe. Before * executing the probe, DTrace blocks re-scheduling and sets * a flag in its per-cpu flags to indicate that it doesn't * want to fault. On returning from the probe, the no-fault * flag is cleared and finally re-scheduling is enabled. */ if ((type == T_PROTFLT || type == T_PAGEFLT) && dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type)) goto out; #endif if ((frame->tf_eflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. */ if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) uprintf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curthread->td_name, type); else if (type != T_NMI && type != T_BPTFLT && type != T_TRCTRAP && frame->tf_eip != (int)cpu_switch_load_gs) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * Page faults need interrupts disabled until later, * and we shouldn't enable interrupts while holding * a spin lock. */ if (type != T_PAGEFLT && td->td_md.md_spinlock_count == 0) enable_intr(); } } eva = 0; if (type == T_PAGEFLT) { /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and conditionally * reenable interrupts. If we hold a spin lock, then * we must not reenable interrupts. This might be a * spurious page fault. */ eva = rcr2(); if (td->td_md.md_spinlock_count == 0) enable_intr(); } if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) { /* user trap */ td->td_pticks = 0; td->td_frame = frame; addr = frame->tf_eip; if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ i = SIGILL; ucode = ILL_PRVOPC; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ enable_intr(); #ifdef KDTRACE_HOOKS if (type == T_BPTFLT) { fill_frame_regs(frame, ®s); if (dtrace_pid_probe_ptr != NULL && dtrace_pid_probe_ptr(®s) == 0) goto out; } #endif user_trctrap_out: frame->tf_eflags &= ~PSL_T; i = SIGTRAP; ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT); break; case T_ARITHTRAP: /* arithmetic trap */ -#ifdef DEV_NPX ucode = npxtrap_x87(); if (ucode == -1) goto userout; -#else - ucode = 0; -#endif i = SIGFPE; break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame->tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)frame); if (i == SIGTRAP) { type = T_TRCTRAP; load_dr6(rdr6() | 0x4000); goto user_trctrap_out; } if (i == 0) goto user; break; } i = SIGBUS; ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR; break; case T_SEGNPFLT: /* segment not present fault */ i = SIGBUS; ucode = BUS_ADRERR; break; case T_TSSFLT: /* invalid TSS fault */ i = SIGBUS; ucode = BUS_OBJERR; break; case T_ALIGNFLT: i = SIGBUS; ucode = BUS_ADRALN; break; case T_DOUBLEFLT: /* double fault */ default: i = SIGBUS; ucode = BUS_OBJERR; break; case T_PAGEFLT: /* page fault */ i = trap_pfault(frame, TRUE, eva); #if defined(I586_CPU) && !defined(NO_F00F_HACK) if (i == -2) { /* * The f00f hack workaround has triggered, so * treat the fault as an illegal instruction * (T_PRIVINFLT) instead of a page fault. */ type = frame->tf_trapno = T_PRIVINFLT; /* Proceed as in that case. */ ucode = ILL_PRVOPC; i = SIGILL; break; } #endif if (i == -1) goto userout; if (i == 0) goto user; if (i == SIGSEGV) ucode = SEGV_MAPERR; else { if (prot_fault_translation == 0) { /* * Autodetect. * This check also covers the images * without the ABI-tag ELF note. */ if (SV_CURPROC_ABI() == SV_ABI_FREEBSD && p->p_osrel >= P_OSREL_SIGSEGV) { i = SIGSEGV; ucode = SEGV_ACCERR; } else { i = SIGBUS; ucode = BUS_PAGE_FAULT; } } else if (prot_fault_translation == 1) { /* * Always compat mode. */ i = SIGBUS; ucode = BUS_PAGE_FAULT; } else { /* * Always SIGSEGV mode. */ i = SIGSEGV; ucode = SEGV_ACCERR; } } addr = eva; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(880, hz); lastalert = time_second; } goto userout; #else /* !POWERFAIL_NMI */ nmi_handle_intr(type, frame); break; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: -#ifdef DEV_NPX KASSERT(PCB_USER_FPU(td->td_pcb), ("kernel FPU ctx has leaked")); /* transparent fault (due to context switch "late") */ if (npxdna()) goto userout; -#endif uprintf("pid %d killed due to lack of floating point\n", p->p_pid); i = SIGKILL; ucode = 0; break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = ILL_COPROC; i = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ -#if defined(DEV_NPX) && !defined(CPU_DISABLE_SSE) && defined(I686_CPU) ucode = npxtrap_sse(); if (ucode == -1) goto userout; -#else - ucode = 0; -#endif i = SIGFPE; break; #ifdef KDTRACE_HOOKS case T_DTRACE_RET: enable_intr(); fill_frame_regs(frame, ®s); if (dtrace_return_probe_ptr != NULL && dtrace_return_probe_ptr(®s) == 0) goto out; break; #endif } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(frame, FALSE, eva); goto out; case T_DNA: -#ifdef DEV_NPX if (PCB_USER_FPU(td->td_pcb)) panic("Unregistered use of FPU in kernel"); if (npxdna()) goto out; -#endif break; case T_ARITHTRAP: /* arithmetic trap */ case T_XMMFLT: /* SIMD floating-point exception */ case T_FPOPFLT: /* FPU operand fetch fault */ /* * XXXKIB for now disable any FPU traps in kernel * handler registration seems to be overkill */ trap_fatal(frame, 0); goto out; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame->tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)frame); if (i == SIGTRAP) { type = T_TRCTRAP; load_dr6(rdr6() | 0x4000); goto kernel_trctrap; } if (i != 0) /* * returns to original process */ vm86_trap((struct vm86frame *)frame); goto out; } if (type == T_STKFLT) break; /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ if (curpcb->pcb_flags & PCB_VM86CALL) break; /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the * underlying LDT entry. This causes a fault * in kernel mode when the kernel attempts to * switch contexts. Lose the bad context * (XXX) so that we can continue, and generate * a signal. */ if (frame->tf_eip == (int)cpu_switch_load_gs) { curpcb->pcb_gs = 0; #if 0 PROC_LOCK(p); kern_psignal(p, SIGBUS); PROC_UNLOCK(p); #endif goto out; } if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ if (frame->tf_eip == (int)doreti_iret) { frame->tf_eip = (int)doreti_iret_fault; goto out; } if (frame->tf_eip == (int)doreti_popl_ds) { frame->tf_eip = (int)doreti_popl_ds_fault; goto out; } if (frame->tf_eip == (int)doreti_popl_es) { frame->tf_eip = (int)doreti_popl_es_fault; goto out; } if (frame->tf_eip == (int)doreti_popl_fs) { frame->tf_eip = (int)doreti_popl_fs_fault; goto out; } if (curpcb->pcb_onfault != NULL) { frame->tf_eip = (int)curpcb->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame->tf_eflags & PSL_NT) { frame->tf_eflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ kernel_trctrap: if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ goto out; } if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame->tf_eflags &= ~PSL_T; goto out; } /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ if (user_dbreg_trap() && !(curpcb->pcb_flags & PCB_VM86CALL)) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & ~0xf); goto out; } /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If KDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef KDB /* XXX %dr6 is not quite reentrant. */ dr6 = rdr6(); load_dr6(dr6 & ~0x4000); if (kdb_trap(type, dr6, frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(880, hz); lastalert = time_second; } goto out; #else /* !POWERFAIL_NMI */ nmi_handle_intr(type, frame); goto out; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ } trap_fatal(frame, eva); goto out; } /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); ksiginfo_init_trap(&ksi); ksi.ksi_signo = i; ksi.ksi_code = ucode; ksi.ksi_addr = (void *)addr; ksi.ksi_trapno = type; if (uprintf_signal) { uprintf("pid %d comm %s: signal %d err %x code %d type %d " "addr 0x%x esp 0x%08x eip 0x%08x " "<%02x %02x %02x %02x %02x %02x %02x %02x>\n", p->p_pid, p->p_comm, i, frame->tf_err, ucode, type, addr, frame->tf_esp, frame->tf_eip, fubyte((void *)(frame->tf_eip + 0)), fubyte((void *)(frame->tf_eip + 1)), fubyte((void *)(frame->tf_eip + 2)), fubyte((void *)(frame->tf_eip + 3)), fubyte((void *)(frame->tf_eip + 4)), fubyte((void *)(frame->tf_eip + 5)), fubyte((void *)(frame->tf_eip + 6)), fubyte((void *)(frame->tf_eip + 7))); } KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled")); trapsignal(td, &ksi); #ifdef DEBUG if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%lx", (u_long)eva); uprintf("\n"); } #endif user: userret(td, frame); KASSERT(PCB_USER_FPU(td->td_pcb), ("Return from trap with kernel FPU ctx leaked")); userout: out: return; } static int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; vm_map_t map; int rv = 0; vm_prot_t ftype; struct thread *td = curthread; struct proc *p = td->td_proc; if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) { /* * Due to both processor errata and lazy TLB invalidation when * access restrictions are removed from virtual pages, memory * accesses that are allowed by the physical mapping layer may * nonetheless cause one spurious page fault per virtual page. * When the thread is executing a "no faulting" section that * is bracketed by vm_fault_{disable,enable}_pagefaults(), * every page fault is treated as a spurious page fault, * unless it accesses the same virtual address as the most * recent page fault within the same "no faulting" section. */ if (td->td_md.md_spurflt_addr != eva || (td->td_pflags & TDP_RESETSPUR) != 0) { /* * Do nothing to the TLB. A stale TLB entry is * flushed automatically by a page fault. */ td->td_md.md_spurflt_addr = eva; td->td_pflags &= ~TDP_RESETSPUR; return (0); } } else { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. * * If we get a page fault while holding a non-sleepable * lock, then it is most likely a fatal kernel page fault. * If WITNESS is enabled, then it's going to whine about * bogus LORs with various VM locks, so just skip to the * fatal trap handling directly. */ if (td->td_critnest != 0 || WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL, "Kernel page fault") != 0) { trap_fatal(frame, eva); return (-1); } } va = trunc_page(eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. * An exception: if the faulting address is the invalid * instruction entry in the IDT, then the Intel Pentium * F00F bug workaround was triggered, and we need to * treat it is as an illegal instruction, and not a page * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return (-2); #endif if (usermode) goto nogo; map = kernel_map; } else { map = &p->p_vmspace->vm_map; /* * When accessing a user-space address, kernel must be * ready to accept the page fault, and provide a * handling routine. Since accessing the address * without the handler is a bug, do not try to handle * it normally, and panic immediately. */ if (!usermode && (td->td_intr_nesting_level != 0 || curpcb->pcb_onfault == NULL)) { trap_fatal(frame, eva); return (-1); } } /* * If the trap was caused by errant bits in the PTE then panic. */ if (frame->tf_err & PGEX_RSV) { trap_fatal(frame, eva); return (-1); } /* * PGEX_I is defined only if the execute disable bit capability is * supported and enabled. */ if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; #if defined(PAE) || defined(PAE_TABLES) else if ((frame->tf_err & PGEX_I) && pg_nx != 0) ftype = VM_PROT_EXECUTE; #endif else ftype = VM_PROT_READ; /* Fault in the page. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); if (rv == KERN_SUCCESS) { #ifdef HWPMC_HOOKS if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { PMC_SOFT_CALL_TF( , , page_fault, all, frame); if (ftype == VM_PROT_READ) PMC_SOFT_CALL_TF( , , page_fault, read, frame); else PMC_SOFT_CALL_TF( , , page_fault, write, frame); } #endif return (0); } nogo: if (!usermode) { if (td->td_intr_nesting_level == 0 && curpcb->pcb_onfault != NULL) { frame->tf_eip = (int)curpcb->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, ss, esp; u_int type; struct soft_segment_descriptor softseg; char *msg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) msg = trap_msg[type]; else msg = "UNKNOWN"; printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg, frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s%s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", #if defined(PAE) || defined(PAE_TABLES) pg_nx != 0 ? (code & PGEX_I ? " instruction" : " data") : #endif "", code & PGEX_RSV ? "reserved bits in PTE" : code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); if (TF_HAS_STACKREGS(frame)) { ss = frame->tf_ss & 0xffff; esp = frame->tf_esp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (int)&frame->tf_esp; } printf("stack pointer = 0x%x:0x%x\n", ss, esp); printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = %d (%s)\n", curproc->p_pid, curthread->td_name); #ifdef KDB if (debugger_on_panic || kdb_active) { frame->tf_err = eva; /* smuggle fault address to ddb */ if (kdb_trap(type, 0, frame)) { frame->tf_err = code; /* restore error code */ return; } frame->tf_err = code; /* restore error code */ } #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic("%s", trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). * * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available at unless * the machine was idle when the double fault occurred. The downside * of this is that "trace " in ddb won't work. */ void dblfault_handler() { #ifdef KDTRACE_HOOKS if (dtrace_doubletrap_func != NULL) (*dtrace_doubletrap_func)(); #endif printf("\nFatal double fault:\n"); printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip)); printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp)); printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp)); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("apic id = %02x\n", PCPU_GET(apic_id)); #endif panic("double fault"); } int cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa) { struct proc *p; struct trapframe *frame; caddr_t params; long tmp; int error; p = td->td_proc; frame = td->td_frame; params = (caddr_t)frame->tf_esp + sizeof(int); sa->code = frame->tf_eax; /* * Need to check if this is a 32 bit or 64 bit syscall. */ if (sa->code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ error = fueword(params, &tmp); if (error == -1) return (EFAULT); sa->code = tmp; params += sizeof(int); } else if (sa->code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ error = fueword(params, &tmp); if (error == -1) return (EFAULT); sa->code = tmp; params += sizeof(quad_t); } if (p->p_sysent->sv_mask) sa->code &= p->p_sysent->sv_mask; if (sa->code >= p->p_sysent->sv_size) sa->callp = &p->p_sysent->sv_table[0]; else sa->callp = &p->p_sysent->sv_table[sa->code]; sa->narg = sa->callp->sy_narg; if (params != NULL && sa->narg != 0) error = copyin(params, (caddr_t)sa->args, (u_int)(sa->narg * sizeof(int))); else error = 0; if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame->tf_edx; } return (error); } #include "../../kern/subr_syscall.c" /* * syscall - system call request C handler. A system call is * essentially treated as a trap by reusing the frame layout. */ void syscall(struct trapframe *frame) { struct thread *td; struct syscall_args sa; register_t orig_tf_eflags; int error; ksiginfo_t ksi; #ifdef DIAGNOSTIC if (!(TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0)) { panic("syscall"); /* NOT REACHED */ } #endif orig_tf_eflags = frame->tf_eflags; td = curthread; td->td_frame = frame; error = syscallenter(td, &sa); /* * Traced syscall. */ if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) { frame->tf_eflags &= ~PSL_T; ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGTRAP; ksi.ksi_code = TRAP_TRACE; ksi.ksi_addr = (void *)frame->tf_eip; trapsignal(td, &ksi); } KASSERT(PCB_USER_FPU(td->td_pcb), ("System call %s returning with kernel FPU ctx leaked", syscallname(td->td_proc, sa.code))); KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td), ("System call %s returning with mangled pcb_save", syscallname(td->td_proc, sa.code))); syscallret(td, error, &sa); } Index: head/sys/i386/i386/vm86bios.s =================================================================== --- head/sys/i386/i386/vm86bios.s (revision 313153) +++ head/sys/i386/i386/vm86bios.s (revision 313154) @@ -1,177 +1,173 @@ /*- * Copyright (c) 1998 Jonathan Lemon * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ -#include "opt_npx.h" - #include /* miscellaneous asm macros */ #include #include "assym.s" #define SCR_NEWPTD PCB_ESI /* readability macros */ #define SCR_VMFRAME PCB_EBP /* see vm86.c for explanation */ #define SCR_STACK PCB_ESP #define SCR_PGTABLE PCB_EBX #define SCR_ARGFRAME PCB_EIP #define SCR_TSS0 PCB_VM86 #define SCR_TSS1 (PCB_VM86+4) .data ALIGN_DATA .globl vm86pcb vm86pcb: .long 0 .text /* * vm86_bioscall(struct trapframe_vm86 *vm86) */ ENTRY(vm86_bioscall) movl vm86pcb,%edx /* scratch data area */ movl 4(%esp),%eax movl %eax,SCR_ARGFRAME(%edx) /* save argument pointer */ pushl %ebx pushl %ebp pushl %esi pushl %edi pushl %gs -#ifdef DEV_NPX pushfl cli movl PCPU(CURTHREAD),%ecx cmpl %ecx,PCPU(FPCURTHREAD) /* do we need to save fp? */ jne 1f pushl %edx movl TD_PCB(%ecx),%ecx pushl PCB_SAVEFPU(%ecx) call npxsave addl $4,%esp popl %edx /* recover our pcb */ 1: popfl -#endif movl SCR_VMFRAME(%edx),%ebx /* target frame location */ movl %ebx,%edi /* destination */ movl SCR_ARGFRAME(%edx),%esi /* source (set on entry) */ movl $VM86_FRAMESIZE/4,%ecx /* sizeof(struct vm86frame)/4 */ cld rep movsl /* copy frame to new stack */ movl PCPU(CURPCB),%eax pushl %eax /* save curpcb */ movl %edx,PCPU(CURPCB) /* set curpcb to vm86pcb */ movl PCPU(TSS_GDT),%ebx /* entry in GDT */ movl 0(%ebx),%eax movl %eax,SCR_TSS0(%edx) /* save first word */ movl 4(%ebx),%eax andl $~0x200, %eax /* flip 386BSY -> 386TSS */ movl %eax,SCR_TSS1(%edx) /* save second word */ movl PCB_EXT(%edx),%edi /* vm86 tssd entry */ movl 0(%edi),%eax movl %eax,0(%ebx) movl 4(%edi),%eax movl %eax,4(%ebx) movl $GPROC0_SEL*8,%esi /* GSEL(entry, SEL_KPL) */ ltr %si movl %cr3,%eax pushl %eax /* save address space */ movl IdlePTD,%ecx movl %ecx,%ebx addl $KERNBASE,%ebx /* va of Idle PTD */ movl 0(%ebx),%eax pushl %eax /* old ptde != 0 when booting */ pushl %ebx /* keep for reuse */ movl %esp,SCR_STACK(%edx) /* save current stack location */ movl SCR_NEWPTD(%edx),%eax /* mapping for vm86 page table */ movl %eax,0(%ebx) /* ... install as PTD entry 0 */ #if defined(PAE) || defined(PAE_TABLES) movl IdlePDPT,%ecx #endif movl %ecx,%cr3 /* new page tables */ movl SCR_VMFRAME(%edx),%esp /* switch to new stack */ pushl %esp call vm86_prepcall /* finish setup */ add $4, %esp /* * Return via doreti */ MEXITCOUNT jmp doreti /* * vm86_biosret(struct trapframe_vm86 *vm86) */ ENTRY(vm86_biosret) movl vm86pcb,%edx /* data area */ movl 4(%esp),%esi /* source */ movl SCR_ARGFRAME(%edx),%edi /* destination */ movl $VM86_FRAMESIZE/4,%ecx /* size */ cld rep movsl /* copy frame to original frame */ movl SCR_STACK(%edx),%esp /* back to old stack */ popl %ebx /* saved va of Idle PTD */ popl %eax movl %eax,0(%ebx) /* restore old pte */ popl %eax movl %eax,%cr3 /* install old page table */ movl PCPU(TSS_GDT),%ebx /* entry in GDT */ movl SCR_TSS0(%edx),%eax movl %eax,0(%ebx) /* restore first word */ movl SCR_TSS1(%edx),%eax movl %eax,4(%ebx) /* restore second word */ movl $GPROC0_SEL*8,%esi /* GSEL(entry, SEL_KPL) */ ltr %si popl PCPU(CURPCB) /* restore curpcb/curproc */ movl SCR_ARGFRAME(%edx),%edx /* original stack frame */ movl TF_TRAPNO(%edx),%eax /* return (trapno) */ popl %gs popl %edi popl %esi popl %ebp popl %ebx ret /* back to our normal program */ Index: head/sys/i386/i386/vm_machdep.c =================================================================== --- head/sys/i386/i386/vm_machdep.c (revision 313153) +++ head/sys/i386/i386/vm_machdep.c (revision 313154) @@ -1,871 +1,855 @@ /*- * Copyright (c) 1982, 1986 The Regents of the University of California. * Copyright (c) 1989, 1990 William Jolitz * Copyright (c) 1994 John Dyson * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ */ #include __FBSDID("$FreeBSD$"); #include "opt_isa.h" #include "opt_npx.h" #include "opt_reset.h" #include "opt_cpu.h" #include "opt_xbox.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef CPU_ELAN #include #endif #include #include #include #include #include #include #include #ifdef XBOX #include #endif #ifndef NSFBUFS #define NSFBUFS (512 + maxusers * 16) #endif -#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) -#define CPU_ENABLE_SSE -#endif - _Static_assert(OFFSETOF_CURTHREAD == offsetof(struct pcpu, pc_curthread), "OFFSETOF_CURTHREAD does not correspond with offset of pc_curthread."); _Static_assert(OFFSETOF_CURPCB == offsetof(struct pcpu, pc_curpcb), "OFFSETOF_CURPCB does not correspond with offset of pc_curpcb."); _Static_assert(__OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf), "__OFFSETOF_MONINORBUF does not correspond with offset of pc_monitorbuf."); static void cpu_reset_real(void); #ifdef SMP static void cpu_reset_proxy(void); static u_int cpu_reset_proxyid; static volatile u_int cpu_reset_proxy_active; #endif union savefpu * get_pcb_user_save_td(struct thread *td) { vm_offset_t p; p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE - roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN); KASSERT((p % XSAVE_AREA_ALIGN) == 0, ("Unaligned pcb_user_save area")); return ((union savefpu *)p); } union savefpu * get_pcb_user_save_pcb(struct pcb *pcb) { vm_offset_t p; p = (vm_offset_t)(pcb + 1); return ((union savefpu *)p); } struct pcb * get_pcb_td(struct thread *td) { vm_offset_t p; p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE - roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN) - sizeof(struct pcb); return ((struct pcb *)p); } void * alloc_fpusave(int flags) { void *res; -#ifdef CPU_ENABLE_SSE struct savefpu_ymm *sf; -#endif res = malloc(cpu_max_ext_state_size, M_DEVBUF, flags); -#ifdef CPU_ENABLE_SSE if (use_xsave) { sf = (struct savefpu_ymm *)res; bzero(&sf->sv_xstate.sx_hd, sizeof(sf->sv_xstate.sx_hd)); sf->sv_xstate.sx_hd.xstate_bv = xsave_mask; } -#endif return (res); } /* * Finish a fork operation, with process p2 nearly set up. * Copy and update the pcb, set up the stack so that the child * ready to run and return to user mode. */ void cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags) { register struct proc *p1; struct pcb *pcb2; struct mdproc *mdp2; p1 = td1->td_proc; if ((flags & RFPROC) == 0) { if ((flags & RFMEM) == 0) { /* unshare user LDT */ struct mdproc *mdp1 = &p1->p_md; struct proc_ldt *pldt, *pldt1; mtx_lock_spin(&dt_lock); if ((pldt1 = mdp1->md_ldt) != NULL && pldt1->ldt_refcnt > 1) { pldt = user_ldt_alloc(mdp1, pldt1->ldt_len); if (pldt == NULL) panic("could not copy LDT"); mdp1->md_ldt = pldt; set_user_ldt(mdp1); user_ldt_deref(pldt1); } else mtx_unlock_spin(&dt_lock); } return; } /* Ensure that td1's pcb is up to date. */ if (td1 == curthread) td1->td_pcb->pcb_gs = rgs(); -#ifdef DEV_NPX critical_enter(); if (PCPU_GET(fpcurthread) == td1) npxsave(td1->td_pcb->pcb_save); critical_exit(); -#endif /* Point the pcb to the top of the stack */ pcb2 = get_pcb_td(td2); td2->td_pcb = pcb2; /* Copy td1's pcb */ bcopy(td1->td_pcb, pcb2, sizeof(*pcb2)); /* Properly initialize pcb_save */ pcb2->pcb_save = get_pcb_user_save_pcb(pcb2); bcopy(get_pcb_user_save_td(td1), get_pcb_user_save_pcb(pcb2), cpu_max_ext_state_size); /* Point mdproc and then copy over td1's contents */ mdp2 = &p2->p_md; bcopy(&p1->p_md, mdp2, sizeof(*mdp2)); /* * Create a new fresh stack for the new process. * Copy the trap frame for the return to user mode as if from a * syscall. This copies most of the user mode register values. * The -16 is so we can expand the trapframe if we go to vm86. */ td2->td_frame = (struct trapframe *)((caddr_t)td2->td_pcb - 16) - 1; bcopy(td1->td_frame, td2->td_frame, sizeof(struct trapframe)); td2->td_frame->tf_eax = 0; /* Child returns zero */ td2->td_frame->tf_eflags &= ~PSL_C; /* success */ td2->td_frame->tf_edx = 1; /* * If the parent process has the trap bit set (i.e. a debugger had * single stepped the process to the system call), we need to clear * the trap flag from the new frame unless the debugger had set PF_FORK * on the parent. Otherwise, the child will receive a (likely * unexpected) SIGTRAP when it executes the first instruction after * returning to userland. */ if ((p1->p_pfsflags & PF_FORK) == 0) td2->td_frame->tf_eflags &= ~PSL_T; /* * Set registers for trampoline to user mode. Leave space for the * return address on stack. These are the kernel mode register values. */ #if defined(PAE) || defined(PAE_TABLES) pcb2->pcb_cr3 = vtophys(vmspace_pmap(p2->p_vmspace)->pm_pdpt); #else pcb2->pcb_cr3 = vtophys(vmspace_pmap(p2->p_vmspace)->pm_pdir); #endif pcb2->pcb_edi = 0; pcb2->pcb_esi = (int)fork_return; /* fork_trampoline argument */ pcb2->pcb_ebp = 0; pcb2->pcb_esp = (int)td2->td_frame - sizeof(void *); pcb2->pcb_ebx = (int)td2; /* fork_trampoline argument */ pcb2->pcb_eip = (int)fork_trampoline; /*- * pcb2->pcb_dr*: cloned above. * pcb2->pcb_savefpu: cloned above. * pcb2->pcb_flags: cloned above. * pcb2->pcb_onfault: cloned above (always NULL here?). * pcb2->pcb_gs: cloned above. * pcb2->pcb_ext: cleared below. */ /* * XXX don't copy the i/o pages. this should probably be fixed. */ pcb2->pcb_ext = 0; /* Copy the LDT, if necessary. */ mtx_lock_spin(&dt_lock); if (mdp2->md_ldt != NULL) { if (flags & RFMEM) { mdp2->md_ldt->ldt_refcnt++; } else { mdp2->md_ldt = user_ldt_alloc(mdp2, mdp2->md_ldt->ldt_len); if (mdp2->md_ldt == NULL) panic("could not copy LDT"); } } mtx_unlock_spin(&dt_lock); /* Setup to release spin count in fork_exit(). */ td2->td_md.md_spinlock_count = 1; td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I; /* * Now, cpu_switch() can schedule the new process. * pcb_esp is loaded pointing to the cpu_switch() stack frame * containing the return address when exiting cpu_switch. * This will normally be to fork_trampoline(), which will have * %ebx loaded with the new proc's pointer. fork_trampoline() * will set up a stack to call fork_return(p, frame); to complete * the return to user-mode. */ } /* * Intercept the return address from a freshly forked process that has NOT * been scheduled yet. * * This is needed to make kernel threads stay in kernel mode. */ void cpu_fork_kthread_handler(struct thread *td, void (*func)(void *), void *arg) { /* * Note that the trap frame follows the args, so the function * is really called like this: func(arg, frame); */ td->td_pcb->pcb_esi = (int) func; /* function */ td->td_pcb->pcb_ebx = (int) arg; /* first arg */ } void cpu_exit(struct thread *td) { /* * If this process has a custom LDT, release it. Reset pc->pcb_gs * and %gs before we free it in case they refer to an LDT entry. */ mtx_lock_spin(&dt_lock); if (td->td_proc->p_md.md_ldt) { td->td_pcb->pcb_gs = _udatasel; load_gs(_udatasel); user_ldt_free(td); } else mtx_unlock_spin(&dt_lock); } void cpu_thread_exit(struct thread *td) { -#ifdef DEV_NPX critical_enter(); if (td == PCPU_GET(fpcurthread)) npxdrop(); critical_exit(); -#endif /* Disable any hardware breakpoints. */ if (td->td_pcb->pcb_flags & PCB_DBREGS) { reset_dbregs(); td->td_pcb->pcb_flags &= ~PCB_DBREGS; } } void cpu_thread_clean(struct thread *td) { struct pcb *pcb; pcb = td->td_pcb; if (pcb->pcb_ext != NULL) { /* if (pcb->pcb_ext->ext_refcount-- == 1) ?? */ /* * XXX do we need to move the TSS off the allocated pages * before freeing them? (not done here) */ kmem_free(kernel_arena, (vm_offset_t)pcb->pcb_ext, ctob(IOPAGES + 1)); pcb->pcb_ext = NULL; } } void cpu_thread_swapin(struct thread *td) { } void cpu_thread_swapout(struct thread *td) { } void cpu_thread_alloc(struct thread *td) { struct pcb *pcb; -#ifdef CPU_ENABLE_SSE struct xstate_hdr *xhdr; -#endif td->td_pcb = pcb = get_pcb_td(td); td->td_frame = (struct trapframe *)((caddr_t)pcb - 16) - 1; pcb->pcb_ext = NULL; pcb->pcb_save = get_pcb_user_save_pcb(pcb); -#ifdef CPU_ENABLE_SSE if (use_xsave) { xhdr = (struct xstate_hdr *)(pcb->pcb_save + 1); bzero(xhdr, sizeof(*xhdr)); xhdr->xstate_bv = xsave_mask; } -#endif } void cpu_thread_free(struct thread *td) { cpu_thread_clean(td); } void cpu_set_syscall_retval(struct thread *td, int error) { switch (error) { case 0: td->td_frame->tf_eax = td->td_retval[0]; td->td_frame->tf_edx = td->td_retval[1]; td->td_frame->tf_eflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, assuming lcall $X,y is 7 bytes, int * 0x80 is 2 bytes. We saved this in tf_err. */ td->td_frame->tf_eip -= td->td_frame->tf_err; break; case EJUSTRETURN: break; default: td->td_frame->tf_eax = SV_ABI_ERRNO(td->td_proc, error); td->td_frame->tf_eflags |= PSL_C; break; } } /* * Initialize machine state, mostly pcb and trap frame for a new * thread, about to return to userspace. Put enough state in the new * thread's PCB to get it to go back to the fork_return(), which * finalizes the thread state and handles peculiarities of the first * return to userspace for the new thread. */ void cpu_copy_thread(struct thread *td, struct thread *td0) { struct pcb *pcb2; /* Point the pcb to the top of the stack. */ pcb2 = td->td_pcb; /* * Copy the upcall pcb. This loads kernel regs. * Those not loaded individually below get their default * values here. */ bcopy(td0->td_pcb, pcb2, sizeof(*pcb2)); pcb2->pcb_flags &= ~(PCB_NPXINITDONE | PCB_NPXUSERINITDONE | PCB_KERNNPX); pcb2->pcb_save = get_pcb_user_save_pcb(pcb2); bcopy(get_pcb_user_save_td(td0), pcb2->pcb_save, cpu_max_ext_state_size); /* * Create a new fresh stack for the new thread. */ bcopy(td0->td_frame, td->td_frame, sizeof(struct trapframe)); /* If the current thread has the trap bit set (i.e. a debugger had * single stepped the process to the system call), we need to clear * the trap flag from the new frame. Otherwise, the new thread will * receive a (likely unexpected) SIGTRAP when it executes the first * instruction after returning to userland. */ td->td_frame->tf_eflags &= ~PSL_T; /* * Set registers for trampoline to user mode. Leave space for the * return address on stack. These are the kernel mode register values. */ pcb2->pcb_edi = 0; pcb2->pcb_esi = (int)fork_return; /* trampoline arg */ pcb2->pcb_ebp = 0; pcb2->pcb_esp = (int)td->td_frame - sizeof(void *); /* trampoline arg */ pcb2->pcb_ebx = (int)td; /* trampoline arg */ pcb2->pcb_eip = (int)fork_trampoline; pcb2->pcb_gs = rgs(); /* * If we didn't copy the pcb, we'd need to do the following registers: * pcb2->pcb_cr3: cloned above. * pcb2->pcb_dr*: cloned above. * pcb2->pcb_savefpu: cloned above. * pcb2->pcb_flags: cloned above. * pcb2->pcb_onfault: cloned above (always NULL here?). * pcb2->pcb_gs: cloned above. * pcb2->pcb_ext: cleared below. */ pcb2->pcb_ext = NULL; /* Setup to release spin count in fork_exit(). */ td->td_md.md_spinlock_count = 1; td->td_md.md_saved_flags = PSL_KERNEL | PSL_I; } /* * Set that machine state for performing an upcall that starts * the entry function with the given argument. */ void cpu_set_upcall(struct thread *td, void (*entry)(void *), void *arg, stack_t *stack) { /* * Do any extra cleaning that needs to be done. * The thread may have optional components * that are not present in a fresh thread. * This may be a recycled thread so make it look * as though it's newly allocated. */ cpu_thread_clean(td); /* * Set the trap frame to point at the beginning of the entry * function. */ td->td_frame->tf_ebp = 0; td->td_frame->tf_esp = (((int)stack->ss_sp + stack->ss_size - 4) & ~0x0f) - 4; td->td_frame->tf_eip = (int)entry; /* Pass the argument to the entry point. */ suword((void *)(td->td_frame->tf_esp + sizeof(void *)), (int)arg); } int cpu_set_user_tls(struct thread *td, void *tls_base) { struct segment_descriptor sd; uint32_t base; /* * Construct a descriptor and store it in the pcb for * the next context switch. Also store it in the gdt * so that the load of tf_fs into %fs will activate it * at return to userland. */ base = (uint32_t)tls_base; sd.sd_lobase = base & 0xffffff; sd.sd_hibase = (base >> 24) & 0xff; sd.sd_lolimit = 0xffff; /* 4GB limit, wraps around */ sd.sd_hilimit = 0xf; sd.sd_type = SDT_MEMRWA; sd.sd_dpl = SEL_UPL; sd.sd_p = 1; sd.sd_xx = 0; sd.sd_def32 = 1; sd.sd_gran = 1; critical_enter(); /* set %gs */ td->td_pcb->pcb_gsd = sd; if (td == curthread) { PCPU_GET(fsgs_gdt)[1] = sd; load_gs(GSEL(GUGS_SEL, SEL_UPL)); } critical_exit(); return (0); } /* * Convert kernel VA to physical address */ vm_paddr_t kvtop(void *addr) { vm_paddr_t pa; pa = pmap_kextract((vm_offset_t)addr); if (pa == 0) panic("kvtop: zero page frame"); return (pa); } #ifdef SMP static void cpu_reset_proxy() { cpuset_t tcrp; cpu_reset_proxy_active = 1; while (cpu_reset_proxy_active == 1) ; /* Wait for other cpu to see that we've started */ CPU_SETOF(cpu_reset_proxyid, &tcrp); stop_cpus(tcrp); printf("cpu_reset_proxy: Stopped CPU %d\n", cpu_reset_proxyid); DELAY(1000000); cpu_reset_real(); } #endif void cpu_reset() { #ifdef XBOX if (arch_i386_is_xbox) { /* Kick the PIC16L, it can reboot the box */ pic16l_reboot(); for (;;); } #endif #ifdef SMP cpuset_t map; u_int cnt; if (smp_started) { map = all_cpus; CPU_CLR(PCPU_GET(cpuid), &map); CPU_NAND(&map, &stopped_cpus); if (!CPU_EMPTY(&map)) { printf("cpu_reset: Stopping other CPUs\n"); stop_cpus(map); } if (PCPU_GET(cpuid) != 0) { cpu_reset_proxyid = PCPU_GET(cpuid); cpustop_restartfunc = cpu_reset_proxy; cpu_reset_proxy_active = 0; printf("cpu_reset: Restarting BSP\n"); /* Restart CPU #0. */ /* XXX: restart_cpus(1 << 0); */ CPU_SETOF(0, &started_cpus); wmb(); cnt = 0; while (cpu_reset_proxy_active == 0 && cnt < 10000000) cnt++; /* Wait for BSP to announce restart */ if (cpu_reset_proxy_active == 0) printf("cpu_reset: Failed to restart BSP\n"); enable_intr(); cpu_reset_proxy_active = 2; while (1); /* NOTREACHED */ } DELAY(1000000); } #endif cpu_reset_real(); /* NOTREACHED */ } static void cpu_reset_real() { struct region_descriptor null_idt; int b; disable_intr(); #ifdef CPU_ELAN if (elan_mmcr != NULL) elan_mmcr->RESCFG = 1; #endif if (cpu == CPU_GEODE1100) { /* Attempt Geode's own reset */ outl(0xcf8, 0x80009044ul); outl(0xcfc, 0xf); } #if !defined(BROKEN_KEYBOARD_RESET) /* * Attempt to do a CPU reset via the keyboard controller, * do not turn off GateA20, as any machine that fails * to do the reset here would then end up in no man's land. */ outb(IO_KBD + 4, 0xFE); DELAY(500000); /* wait 0.5 sec to see if that did it */ #endif /* * Attempt to force a reset via the Reset Control register at * I/O port 0xcf9. Bit 2 forces a system reset when it * transitions from 0 to 1. Bit 1 selects the type of reset * to attempt: 0 selects a "soft" reset, and 1 selects a * "hard" reset. We try a "hard" reset. The first write sets * bit 1 to select a "hard" reset and clears bit 2. The * second write forces a 0 -> 1 transition in bit 2 to trigger * a reset. */ outb(0xcf9, 0x2); outb(0xcf9, 0x6); DELAY(500000); /* wait 0.5 sec to see if that did it */ /* * Attempt to force a reset via the Fast A20 and Init register * at I/O port 0x92. Bit 1 serves as an alternate A20 gate. * Bit 0 asserts INIT# when set to 1. We are careful to only * preserve bit 1 while setting bit 0. We also must clear bit * 0 before setting it if it isn't already clear. */ b = inb(0x92); if (b != 0xff) { if ((b & 0x1) != 0) outb(0x92, b & 0xfe); outb(0x92, b | 0x1); DELAY(500000); /* wait 0.5 sec to see if that did it */ } printf("No known reset method worked, attempting CPU shutdown\n"); DELAY(1000000); /* wait 1 sec for printf to complete */ /* Wipe the IDT. */ null_idt.rd_limit = 0; null_idt.rd_base = 0; lidt(&null_idt); /* "good night, sweet prince .... " */ breakpoint(); /* NOTREACHED */ while(1); } /* * Get an sf_buf from the freelist. May block if none are available. */ void sf_buf_map(struct sf_buf *sf, int flags) { pt_entry_t opte, *ptep; /* * Update the sf_buf's virtual-to-physical mapping, flushing the * virtual address from the TLB. Since the reference count for * the sf_buf's old mapping was zero, that mapping is not * currently in use. Consequently, there is no need to exchange * the old and new PTEs atomically, even under PAE. */ ptep = vtopte(sf->kva); opte = *ptep; *ptep = VM_PAGE_TO_PHYS(sf->m) | pgeflag | PG_RW | PG_V | pmap_cache_bits(sf->m->md.pat_mode, 0); /* * Avoid unnecessary TLB invalidations: If the sf_buf's old * virtual-to-physical mapping was not used, then any processor * that has invalidated the sf_buf's virtual address from its TLB * since the last used mapping need not invalidate again. */ #ifdef SMP if ((opte & (PG_V | PG_A)) == (PG_V | PG_A)) CPU_ZERO(&sf->cpumask); sf_buf_shootdown(sf, flags); #else if ((opte & (PG_V | PG_A)) == (PG_V | PG_A)) pmap_invalidate_page(kernel_pmap, sf->kva); #endif } #ifdef SMP void sf_buf_shootdown(struct sf_buf *sf, int flags) { cpuset_t other_cpus; u_int cpuid; sched_pin(); cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &sf->cpumask)) { CPU_SET(cpuid, &sf->cpumask); invlpg(sf->kva); } if ((flags & SFB_CPUPRIVATE) == 0) { other_cpus = all_cpus; CPU_CLR(cpuid, &other_cpus); CPU_NAND(&other_cpus, &sf->cpumask); if (!CPU_EMPTY(&other_cpus)) { CPU_OR(&sf->cpumask, &other_cpus); smp_masked_invlpg(other_cpus, sf->kva); } } sched_unpin(); } #endif /* * MD part of sf_buf_free(). */ int sf_buf_unmap(struct sf_buf *sf) { return (0); } static void sf_buf_invalidate(struct sf_buf *sf) { vm_page_t m = sf->m; /* * Use pmap_qenter to update the pte for * existing mapping, in particular, the PAT * settings are recalculated. */ pmap_qenter(sf->kva, &m, 1); pmap_invalidate_cache_range(sf->kva, sf->kva + PAGE_SIZE, FALSE); } /* * Invalidate the cache lines that may belong to the page, if * (possibly old) mapping of the page by sf buffer exists. Returns * TRUE when mapping was found and cache invalidated. */ boolean_t sf_buf_invalidate_cache(vm_page_t m) { return (sf_buf_process_page(m, sf_buf_invalidate)); } /* * Software interrupt handler for queued VM system processing. */ void swi_vm(void *dummy) { if (busdma_swi_pending != 0) busdma_swi(); } /* * Tell whether this address is in some physical memory region. * Currently used by the kernel coredump code in order to avoid * dumping the ``ISA memory hole'' which could cause indefinite hangs, * or other unpredictable behaviour. */ int is_physical_memory(vm_paddr_t addr) { #ifdef DEV_ISA /* The ISA ``memory hole''. */ if (addr >= 0xa0000 && addr < 0x100000) return 0; #endif /* * stuff other tests for known memory-mapped devices (PCI?) * here */ return 1; } Index: head/sys/i386/include/atomic.h =================================================================== --- head/sys/i386/include/atomic.h (revision 313153) +++ head/sys/i386/include/atomic.h (revision 313154) @@ -1,846 +1,817 @@ /*- * Copyright (c) 1998 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _MACHINE_ATOMIC_H_ #define _MACHINE_ATOMIC_H_ #ifndef _SYS_CDEFS_H_ #error this file needs sys/cdefs.h as a prerequisite #endif #ifdef _KERNEL #include #include #endif #ifndef __OFFSETOF_MONITORBUF /* * __OFFSETOF_MONITORBUF == __pcpu_offset(pc_monitorbuf). * * The open-coded number is used instead of the symbolic expression to * avoid a dependency on sys/pcpu.h in machine/atomic.h consumers. * An assertion in i386/vm_machdep.c ensures that the value is correct. */ #define __OFFSETOF_MONITORBUF 0x180 static __inline void __mbk(void) { __asm __volatile("lock; addl $0,%%fs:%0" : "+m" (*(u_int *)__OFFSETOF_MONITORBUF) : : "memory", "cc"); } static __inline void __mbu(void) { __asm __volatile("lock; addl $0,(%%esp)" : : : "memory", "cc"); } #endif /* * Various simple operations on memory, each of which is atomic in the * presence of interrupts and multiple processors. * * atomic_set_char(P, V) (*(u_char *)(P) |= (V)) * atomic_clear_char(P, V) (*(u_char *)(P) &= ~(V)) * atomic_add_char(P, V) (*(u_char *)(P) += (V)) * atomic_subtract_char(P, V) (*(u_char *)(P) -= (V)) * * atomic_set_short(P, V) (*(u_short *)(P) |= (V)) * atomic_clear_short(P, V) (*(u_short *)(P) &= ~(V)) * atomic_add_short(P, V) (*(u_short *)(P) += (V)) * atomic_subtract_short(P, V) (*(u_short *)(P) -= (V)) * * atomic_set_int(P, V) (*(u_int *)(P) |= (V)) * atomic_clear_int(P, V) (*(u_int *)(P) &= ~(V)) * atomic_add_int(P, V) (*(u_int *)(P) += (V)) * atomic_subtract_int(P, V) (*(u_int *)(P) -= (V)) * atomic_swap_int(P, V) (return (*(u_int *)(P)); *(u_int *)(P) = (V);) * atomic_readandclear_int(P) (return (*(u_int *)(P)); *(u_int *)(P) = 0;) * * atomic_set_long(P, V) (*(u_long *)(P) |= (V)) * atomic_clear_long(P, V) (*(u_long *)(P) &= ~(V)) * atomic_add_long(P, V) (*(u_long *)(P) += (V)) * atomic_subtract_long(P, V) (*(u_long *)(P) -= (V)) * atomic_swap_long(P, V) (return (*(u_long *)(P)); *(u_long *)(P) = (V);) * atomic_readandclear_long(P) (return (*(u_long *)(P)); *(u_long *)(P) = 0;) */ /* * The above functions are expanded inline in the statically-linked * kernel. Lock prefixes are generated if an SMP kernel is being * built. * * Kernel modules call real functions which are built into the kernel. * This allows kernel modules to be portable between UP and SMP systems. */ #if defined(KLD_MODULE) || !defined(__GNUCLIKE_ASM) #define ATOMIC_ASM(NAME, TYPE, OP, CONS, V) \ void atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v); \ void atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v) int atomic_cmpset_int(volatile u_int *dst, u_int expect, u_int src); int atomic_fcmpset_int(volatile u_int *dst, u_int *expect, u_int src); u_int atomic_fetchadd_int(volatile u_int *p, u_int v); int atomic_testandset_int(volatile u_int *p, u_int v); int atomic_testandclear_int(volatile u_int *p, u_int v); void atomic_thread_fence_acq(void); void atomic_thread_fence_acq_rel(void); void atomic_thread_fence_rel(void); void atomic_thread_fence_seq_cst(void); #define ATOMIC_LOAD(TYPE) \ u_##TYPE atomic_load_acq_##TYPE(volatile u_##TYPE *p) #define ATOMIC_STORE(TYPE) \ void atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v) int atomic_cmpset_64(volatile uint64_t *, uint64_t, uint64_t); uint64_t atomic_load_acq_64(volatile uint64_t *); void atomic_store_rel_64(volatile uint64_t *, uint64_t); uint64_t atomic_swap_64(volatile uint64_t *, uint64_t); #else /* !KLD_MODULE && __GNUCLIKE_ASM */ /* * For userland, always use lock prefixes so that the binaries will run * on both SMP and !SMP systems. */ #if defined(SMP) || !defined(_KERNEL) #define MPLOCKED "lock ; " #else #define MPLOCKED #endif /* * The assembly is volatilized to avoid code chunk removal by the compiler. * GCC aggressively reorders operations and memory clobbering is necessary * in order to avoid that for memory barriers. */ #define ATOMIC_ASM(NAME, TYPE, OP, CONS, V) \ static __inline void \ atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ { \ __asm __volatile(MPLOCKED OP \ : "+m" (*p) \ : CONS (V) \ : "cc"); \ } \ \ static __inline void \ atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\ { \ __asm __volatile(MPLOCKED OP \ : "+m" (*p) \ : CONS (V) \ : "memory", "cc"); \ } \ struct __hack /* * Atomic compare and set, used by the mutex functions * * if (*dst == expect) *dst = src (all 32 bit words) * * Returns 0 on failure, non-zero on success */ -#ifdef CPU_DISABLE_CMPXCHG - static __inline int atomic_cmpset_int(volatile u_int *dst, u_int expect, u_int src) { u_char res; __asm __volatile( - " pushfl ; " - " cli ; " - " cmpl %3,%1 ; " - " jne 1f ; " - " movl %2,%1 ; " - "1: " - " sete %0 ; " - " popfl ; " - "# atomic_cmpset_int" - : "=q" (res), /* 0 */ - "+m" (*dst) /* 1 */ - : "r" (src), /* 2 */ - "r" (expect) /* 3 */ - : "memory"); - return (res); -} - -#else /* !CPU_DISABLE_CMPXCHG */ - -static __inline int -atomic_cmpset_int(volatile u_int *dst, u_int expect, u_int src) -{ - u_char res; - - __asm __volatile( " " MPLOCKED " " " cmpxchgl %3,%1 ; " " sete %0 ; " "# atomic_cmpset_int" : "=q" (res), /* 0 */ "+m" (*dst), /* 1 */ "+a" (expect) /* 2 */ : "r" (src) /* 3 */ : "memory", "cc"); return (res); } static __inline int atomic_fcmpset_int(volatile u_int *dst, u_int *expect, u_int src) { u_char res; __asm __volatile( " " MPLOCKED " " " cmpxchgl %3,%1 ; " " sete %0 ; " "# atomic_cmpset_int" : "=q" (res), /* 0 */ "+m" (*dst), /* 1 */ "+a" (*expect) /* 2 */ : "r" (src) /* 3 */ : "memory", "cc"); return (res); } - -#endif /* CPU_DISABLE_CMPXCHG */ /* * Atomically add the value of v to the integer pointed to by p and return * the previous value of *p. */ static __inline u_int atomic_fetchadd_int(volatile u_int *p, u_int v) { __asm __volatile( " " MPLOCKED " " " xaddl %0,%1 ; " "# atomic_fetchadd_int" : "+r" (v), /* 0 */ "+m" (*p) /* 1 */ : : "cc"); return (v); } static __inline int atomic_testandset_int(volatile u_int *p, u_int v) { u_char res; __asm __volatile( " " MPLOCKED " " " btsl %2,%1 ; " " setc %0 ; " "# atomic_testandset_int" : "=q" (res), /* 0 */ "+m" (*p) /* 1 */ : "Ir" (v & 0x1f) /* 2 */ : "cc"); return (res); } static __inline int atomic_testandclear_int(volatile u_int *p, u_int v) { u_char res; __asm __volatile( " " MPLOCKED " " " btrl %2,%1 ; " " setc %0 ; " "# atomic_testandclear_int" : "=q" (res), /* 0 */ "+m" (*p) /* 1 */ : "Ir" (v & 0x1f) /* 2 */ : "cc"); return (res); } /* * We assume that a = b will do atomic loads and stores. Due to the * IA32 memory model, a simple store guarantees release semantics. * * However, a load may pass a store if they are performed on distinct * addresses, so we need Store/Load barrier for sequentially * consistent fences in SMP kernels. We use "lock addl $0,mem" for a * Store/Load barrier, as recommended by the AMD Software Optimization * Guide, and not mfence. In the kernel, we use a private per-cpu * cache line for "mem", to avoid introducing false data * dependencies. In user space, we use the word at the top of the * stack. * * For UP kernels, however, the memory of the single processor is * always consistent, so we only need to stop the compiler from * reordering accesses in a way that violates the semantics of acquire * and release. */ #if defined(_KERNEL) #if defined(SMP) #define __storeload_barrier() __mbk() #else /* _KERNEL && UP */ #define __storeload_barrier() __compiler_membar() #endif /* SMP */ #else /* !_KERNEL */ #define __storeload_barrier() __mbu() #endif /* _KERNEL*/ #define ATOMIC_LOAD(TYPE) \ static __inline u_##TYPE \ atomic_load_acq_##TYPE(volatile u_##TYPE *p) \ { \ u_##TYPE res; \ \ res = *p; \ __compiler_membar(); \ return (res); \ } \ struct __hack #define ATOMIC_STORE(TYPE) \ static __inline void \ atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v) \ { \ \ __compiler_membar(); \ *p = v; \ } \ struct __hack static __inline void atomic_thread_fence_acq(void) { __compiler_membar(); } static __inline void atomic_thread_fence_rel(void) { __compiler_membar(); } static __inline void atomic_thread_fence_acq_rel(void) { __compiler_membar(); } static __inline void atomic_thread_fence_seq_cst(void) { __storeload_barrier(); } #ifdef _KERNEL #ifdef WANT_FUNCTIONS int atomic_cmpset_64_i386(volatile uint64_t *, uint64_t, uint64_t); int atomic_cmpset_64_i586(volatile uint64_t *, uint64_t, uint64_t); uint64_t atomic_load_acq_64_i386(volatile uint64_t *); uint64_t atomic_load_acq_64_i586(volatile uint64_t *); void atomic_store_rel_64_i386(volatile uint64_t *, uint64_t); void atomic_store_rel_64_i586(volatile uint64_t *, uint64_t); uint64_t atomic_swap_64_i386(volatile uint64_t *, uint64_t); uint64_t atomic_swap_64_i586(volatile uint64_t *, uint64_t); #endif /* I486 does not support SMP or CMPXCHG8B. */ static __inline int atomic_cmpset_64_i386(volatile uint64_t *dst, uint64_t expect, uint64_t src) { volatile uint32_t *p; u_char res; p = (volatile uint32_t *)dst; __asm __volatile( " pushfl ; " " cli ; " " xorl %1,%%eax ; " " xorl %2,%%edx ; " " orl %%edx,%%eax ; " " jne 1f ; " " movl %4,%1 ; " " movl %5,%2 ; " "1: " " sete %3 ; " " popfl" : "+A" (expect), /* 0 */ "+m" (*p), /* 1 */ "+m" (*(p + 1)), /* 2 */ "=q" (res) /* 3 */ : "r" ((uint32_t)src), /* 4 */ "r" ((uint32_t)(src >> 32)) /* 5 */ : "memory", "cc"); return (res); } static __inline uint64_t atomic_load_acq_64_i386(volatile uint64_t *p) { volatile uint32_t *q; uint64_t res; q = (volatile uint32_t *)p; __asm __volatile( " pushfl ; " " cli ; " " movl %1,%%eax ; " " movl %2,%%edx ; " " popfl" : "=&A" (res) /* 0 */ : "m" (*q), /* 1 */ "m" (*(q + 1)) /* 2 */ : "memory"); return (res); } static __inline void atomic_store_rel_64_i386(volatile uint64_t *p, uint64_t v) { volatile uint32_t *q; q = (volatile uint32_t *)p; __asm __volatile( " pushfl ; " " cli ; " " movl %%eax,%0 ; " " movl %%edx,%1 ; " " popfl" : "=m" (*q), /* 0 */ "=m" (*(q + 1)) /* 1 */ : "A" (v) /* 2 */ : "memory"); } static __inline uint64_t atomic_swap_64_i386(volatile uint64_t *p, uint64_t v) { volatile uint32_t *q; uint64_t res; q = (volatile uint32_t *)p; __asm __volatile( " pushfl ; " " cli ; " " movl %1,%%eax ; " " movl %2,%%edx ; " " movl %4,%2 ; " " movl %3,%1 ; " " popfl" : "=&A" (res), /* 0 */ "+m" (*q), /* 1 */ "+m" (*(q + 1)) /* 2 */ : "r" ((uint32_t)v), /* 3 */ "r" ((uint32_t)(v >> 32))); /* 4 */ return (res); } static __inline int atomic_cmpset_64_i586(volatile uint64_t *dst, uint64_t expect, uint64_t src) { u_char res; __asm __volatile( " " MPLOCKED " " " cmpxchg8b %1 ; " " sete %0" : "=q" (res), /* 0 */ "+m" (*dst), /* 1 */ "+A" (expect) /* 2 */ : "b" ((uint32_t)src), /* 3 */ "c" ((uint32_t)(src >> 32)) /* 4 */ : "memory", "cc"); return (res); } static __inline uint64_t atomic_load_acq_64_i586(volatile uint64_t *p) { uint64_t res; __asm __volatile( " movl %%ebx,%%eax ; " " movl %%ecx,%%edx ; " " " MPLOCKED " " " cmpxchg8b %1" : "=&A" (res), /* 0 */ "+m" (*p) /* 1 */ : : "memory", "cc"); return (res); } static __inline void atomic_store_rel_64_i586(volatile uint64_t *p, uint64_t v) { __asm __volatile( " movl %%eax,%%ebx ; " " movl %%edx,%%ecx ; " "1: " " " MPLOCKED " " " cmpxchg8b %0 ; " " jne 1b" : "+m" (*p), /* 0 */ "+A" (v) /* 1 */ : : "ebx", "ecx", "memory", "cc"); } static __inline uint64_t atomic_swap_64_i586(volatile uint64_t *p, uint64_t v) { __asm __volatile( " movl %%eax,%%ebx ; " " movl %%edx,%%ecx ; " "1: " " " MPLOCKED " " " cmpxchg8b %0 ; " " jne 1b" : "+m" (*p), /* 0 */ "+A" (v) /* 1 */ : : "ebx", "ecx", "memory", "cc"); return (v); } static __inline int atomic_cmpset_64(volatile uint64_t *dst, uint64_t expect, uint64_t src) { if ((cpu_feature & CPUID_CX8) == 0) return (atomic_cmpset_64_i386(dst, expect, src)); else return (atomic_cmpset_64_i586(dst, expect, src)); } static __inline uint64_t atomic_load_acq_64(volatile uint64_t *p) { if ((cpu_feature & CPUID_CX8) == 0) return (atomic_load_acq_64_i386(p)); else return (atomic_load_acq_64_i586(p)); } static __inline void atomic_store_rel_64(volatile uint64_t *p, uint64_t v) { if ((cpu_feature & CPUID_CX8) == 0) atomic_store_rel_64_i386(p, v); else atomic_store_rel_64_i586(p, v); } static __inline uint64_t atomic_swap_64(volatile uint64_t *p, uint64_t v) { if ((cpu_feature & CPUID_CX8) == 0) return (atomic_swap_64_i386(p, v)); else return (atomic_swap_64_i586(p, v)); } #endif /* _KERNEL */ #endif /* KLD_MODULE || !__GNUCLIKE_ASM */ ATOMIC_ASM(set, char, "orb %b1,%0", "iq", v); ATOMIC_ASM(clear, char, "andb %b1,%0", "iq", ~v); ATOMIC_ASM(add, char, "addb %b1,%0", "iq", v); ATOMIC_ASM(subtract, char, "subb %b1,%0", "iq", v); ATOMIC_ASM(set, short, "orw %w1,%0", "ir", v); ATOMIC_ASM(clear, short, "andw %w1,%0", "ir", ~v); ATOMIC_ASM(add, short, "addw %w1,%0", "ir", v); ATOMIC_ASM(subtract, short, "subw %w1,%0", "ir", v); ATOMIC_ASM(set, int, "orl %1,%0", "ir", v); ATOMIC_ASM(clear, int, "andl %1,%0", "ir", ~v); ATOMIC_ASM(add, int, "addl %1,%0", "ir", v); ATOMIC_ASM(subtract, int, "subl %1,%0", "ir", v); ATOMIC_ASM(set, long, "orl %1,%0", "ir", v); ATOMIC_ASM(clear, long, "andl %1,%0", "ir", ~v); ATOMIC_ASM(add, long, "addl %1,%0", "ir", v); ATOMIC_ASM(subtract, long, "subl %1,%0", "ir", v); #define ATOMIC_LOADSTORE(TYPE) \ ATOMIC_LOAD(TYPE); \ ATOMIC_STORE(TYPE) ATOMIC_LOADSTORE(char); ATOMIC_LOADSTORE(short); ATOMIC_LOADSTORE(int); ATOMIC_LOADSTORE(long); #undef ATOMIC_ASM #undef ATOMIC_LOAD #undef ATOMIC_STORE #undef ATOMIC_LOADSTORE #ifndef WANT_FUNCTIONS static __inline int atomic_cmpset_long(volatile u_long *dst, u_long expect, u_long src) { return (atomic_cmpset_int((volatile u_int *)dst, (u_int)expect, (u_int)src)); } static __inline u_long atomic_fetchadd_long(volatile u_long *p, u_long v) { return (atomic_fetchadd_int((volatile u_int *)p, (u_int)v)); } static __inline int atomic_testandset_long(volatile u_long *p, u_int v) { return (atomic_testandset_int((volatile u_int *)p, v)); } static __inline int atomic_testandclear_long(volatile u_long *p, u_int v) { return (atomic_testandclear_int((volatile u_int *)p, v)); } /* Read the current value and store a new value in the destination. */ #ifdef __GNUCLIKE_ASM static __inline u_int atomic_swap_int(volatile u_int *p, u_int v) { __asm __volatile( " xchgl %1,%0 ; " "# atomic_swap_int" : "+r" (v), /* 0 */ "+m" (*p)); /* 1 */ return (v); } static __inline u_long atomic_swap_long(volatile u_long *p, u_long v) { return (atomic_swap_int((volatile u_int *)p, (u_int)v)); } #else /* !__GNUCLIKE_ASM */ u_int atomic_swap_int(volatile u_int *p, u_int v); u_long atomic_swap_long(volatile u_long *p, u_long v); #endif /* __GNUCLIKE_ASM */ #define atomic_set_acq_char atomic_set_barr_char #define atomic_set_rel_char atomic_set_barr_char #define atomic_clear_acq_char atomic_clear_barr_char #define atomic_clear_rel_char atomic_clear_barr_char #define atomic_add_acq_char atomic_add_barr_char #define atomic_add_rel_char atomic_add_barr_char #define atomic_subtract_acq_char atomic_subtract_barr_char #define atomic_subtract_rel_char atomic_subtract_barr_char #define atomic_set_acq_short atomic_set_barr_short #define atomic_set_rel_short atomic_set_barr_short #define atomic_clear_acq_short atomic_clear_barr_short #define atomic_clear_rel_short atomic_clear_barr_short #define atomic_add_acq_short atomic_add_barr_short #define atomic_add_rel_short atomic_add_barr_short #define atomic_subtract_acq_short atomic_subtract_barr_short #define atomic_subtract_rel_short atomic_subtract_barr_short #define atomic_set_acq_int atomic_set_barr_int #define atomic_set_rel_int atomic_set_barr_int #define atomic_clear_acq_int atomic_clear_barr_int #define atomic_clear_rel_int atomic_clear_barr_int #define atomic_add_acq_int atomic_add_barr_int #define atomic_add_rel_int atomic_add_barr_int #define atomic_subtract_acq_int atomic_subtract_barr_int #define atomic_subtract_rel_int atomic_subtract_barr_int #define atomic_cmpset_acq_int atomic_cmpset_int #define atomic_cmpset_rel_int atomic_cmpset_int #define atomic_fcmpset_acq_int atomic_fcmpset_int #define atomic_fcmpset_rel_int atomic_fcmpset_int #define atomic_set_acq_long atomic_set_barr_long #define atomic_set_rel_long atomic_set_barr_long #define atomic_clear_acq_long atomic_clear_barr_long #define atomic_clear_rel_long atomic_clear_barr_long #define atomic_add_acq_long atomic_add_barr_long #define atomic_add_rel_long atomic_add_barr_long #define atomic_subtract_acq_long atomic_subtract_barr_long #define atomic_subtract_rel_long atomic_subtract_barr_long #define atomic_cmpset_acq_long atomic_cmpset_long #define atomic_cmpset_rel_long atomic_cmpset_long #define atomic_fcmpset_acq_long atomic_fcmpset_long #define atomic_fcmpset_rel_long atomic_fcmpset_long #define atomic_readandclear_int(p) atomic_swap_int(p, 0) #define atomic_readandclear_long(p) atomic_swap_long(p, 0) /* Operations on 8-bit bytes. */ #define atomic_set_8 atomic_set_char #define atomic_set_acq_8 atomic_set_acq_char #define atomic_set_rel_8 atomic_set_rel_char #define atomic_clear_8 atomic_clear_char #define atomic_clear_acq_8 atomic_clear_acq_char #define atomic_clear_rel_8 atomic_clear_rel_char #define atomic_add_8 atomic_add_char #define atomic_add_acq_8 atomic_add_acq_char #define atomic_add_rel_8 atomic_add_rel_char #define atomic_subtract_8 atomic_subtract_char #define atomic_subtract_acq_8 atomic_subtract_acq_char #define atomic_subtract_rel_8 atomic_subtract_rel_char #define atomic_load_acq_8 atomic_load_acq_char #define atomic_store_rel_8 atomic_store_rel_char /* Operations on 16-bit words. */ #define atomic_set_16 atomic_set_short #define atomic_set_acq_16 atomic_set_acq_short #define atomic_set_rel_16 atomic_set_rel_short #define atomic_clear_16 atomic_clear_short #define atomic_clear_acq_16 atomic_clear_acq_short #define atomic_clear_rel_16 atomic_clear_rel_short #define atomic_add_16 atomic_add_short #define atomic_add_acq_16 atomic_add_acq_short #define atomic_add_rel_16 atomic_add_rel_short #define atomic_subtract_16 atomic_subtract_short #define atomic_subtract_acq_16 atomic_subtract_acq_short #define atomic_subtract_rel_16 atomic_subtract_rel_short #define atomic_load_acq_16 atomic_load_acq_short #define atomic_store_rel_16 atomic_store_rel_short /* Operations on 32-bit double words. */ #define atomic_set_32 atomic_set_int #define atomic_set_acq_32 atomic_set_acq_int #define atomic_set_rel_32 atomic_set_rel_int #define atomic_clear_32 atomic_clear_int #define atomic_clear_acq_32 atomic_clear_acq_int #define atomic_clear_rel_32 atomic_clear_rel_int #define atomic_add_32 atomic_add_int #define atomic_add_acq_32 atomic_add_acq_int #define atomic_add_rel_32 atomic_add_rel_int #define atomic_subtract_32 atomic_subtract_int #define atomic_subtract_acq_32 atomic_subtract_acq_int #define atomic_subtract_rel_32 atomic_subtract_rel_int #define atomic_load_acq_32 atomic_load_acq_int #define atomic_store_rel_32 atomic_store_rel_int #define atomic_cmpset_32 atomic_cmpset_int #define atomic_cmpset_acq_32 atomic_cmpset_acq_int #define atomic_cmpset_rel_32 atomic_cmpset_rel_int #define atomic_fcmpset_32 atomic_fcmpset_int #define atomic_fcmpset_acq_32 atomic_fcmpset_acq_int #define atomic_fcmpset_rel_32 atomic_fcmpset_rel_int #define atomic_swap_32 atomic_swap_int #define atomic_readandclear_32 atomic_readandclear_int #define atomic_fetchadd_32 atomic_fetchadd_int #define atomic_testandset_32 atomic_testandset_int #define atomic_testandclear_32 atomic_testandclear_int /* Operations on pointers. */ #define atomic_set_ptr(p, v) \ atomic_set_int((volatile u_int *)(p), (u_int)(v)) #define atomic_set_acq_ptr(p, v) \ atomic_set_acq_int((volatile u_int *)(p), (u_int)(v)) #define atomic_set_rel_ptr(p, v) \ atomic_set_rel_int((volatile u_int *)(p), (u_int)(v)) #define atomic_clear_ptr(p, v) \ atomic_clear_int((volatile u_int *)(p), (u_int)(v)) #define atomic_clear_acq_ptr(p, v) \ atomic_clear_acq_int((volatile u_int *)(p), (u_int)(v)) #define atomic_clear_rel_ptr(p, v) \ atomic_clear_rel_int((volatile u_int *)(p), (u_int)(v)) #define atomic_add_ptr(p, v) \ atomic_add_int((volatile u_int *)(p), (u_int)(v)) #define atomic_add_acq_ptr(p, v) \ atomic_add_acq_int((volatile u_int *)(p), (u_int)(v)) #define atomic_add_rel_ptr(p, v) \ atomic_add_rel_int((volatile u_int *)(p), (u_int)(v)) #define atomic_subtract_ptr(p, v) \ atomic_subtract_int((volatile u_int *)(p), (u_int)(v)) #define atomic_subtract_acq_ptr(p, v) \ atomic_subtract_acq_int((volatile u_int *)(p), (u_int)(v)) #define atomic_subtract_rel_ptr(p, v) \ atomic_subtract_rel_int((volatile u_int *)(p), (u_int)(v)) #define atomic_load_acq_ptr(p) \ atomic_load_acq_int((volatile u_int *)(p)) #define atomic_store_rel_ptr(p, v) \ atomic_store_rel_int((volatile u_int *)(p), (v)) #define atomic_cmpset_ptr(dst, old, new) \ atomic_cmpset_int((volatile u_int *)(dst), (u_int)(old), (u_int)(new)) #define atomic_cmpset_acq_ptr(dst, old, new) \ atomic_cmpset_acq_int((volatile u_int *)(dst), (u_int)(old), \ (u_int)(new)) #define atomic_cmpset_rel_ptr(dst, old, new) \ atomic_cmpset_rel_int((volatile u_int *)(dst), (u_int)(old), \ (u_int)(new)) #define atomic_fcmpset_ptr(dst, old, new) \ atomic_fcmpset_int((volatile u_int *)(dst), (u_int *)(old), (u_int)(new)) #define atomic_fcmpset_acq_ptr(dst, old, new) \ atomic_fcmpset_acq_int((volatile u_int *)(dst), (u_int *)(old), \ (u_int)(new)) #define atomic_fcmpset_rel_ptr(dst, old, new) \ atomic_fcmpset_rel_int((volatile u_int *)(dst), (u_int *)(old), \ (u_int)(new)) #define atomic_swap_ptr(p, v) \ atomic_swap_int((volatile u_int *)(p), (u_int)(v)) #define atomic_readandclear_ptr(p) \ atomic_readandclear_int((volatile u_int *)(p)) #endif /* !WANT_FUNCTIONS */ #if defined(_KERNEL) #define mb() __mbk() #define wmb() __mbk() #define rmb() __mbk() #else #define mb() __mbu() #define wmb() __mbu() #define rmb() __mbu() #endif #endif /* !_MACHINE_ATOMIC_H_ */ Index: head/sys/i386/isa/npx.c =================================================================== --- head/sys/i386/isa/npx.c (revision 313153) +++ head/sys/i386/isa/npx.c (revision 313154) @@ -1,1501 +1,1423 @@ /*- * Copyright (c) 1990 William Jolitz. * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)npx.c 7.2 (Berkeley) 5/12/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_cpu.h" #include "opt_isa.h" #include "opt_npx.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef NPX_DEBUG #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_ISA #include #endif -#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) -#define CPU_ENABLE_SSE -#endif - /* * 387 and 287 Numeric Coprocessor Extension (NPX) Driver. */ #if defined(__GNUCLIKE_ASM) && !defined(lint) #define fldcw(cw) __asm __volatile("fldcw %0" : : "m" (cw)) #define fnclex() __asm __volatile("fnclex") #define fninit() __asm __volatile("fninit") #define fnsave(addr) __asm __volatile("fnsave %0" : "=m" (*(addr))) #define fnstcw(addr) __asm __volatile("fnstcw %0" : "=m" (*(addr))) #define fnstsw(addr) __asm __volatile("fnstsw %0" : "=am" (*(addr))) #define fp_divide_by_0() __asm __volatile( \ "fldz; fld1; fdiv %st,%st(1); fnop") #define frstor(addr) __asm __volatile("frstor %0" : : "m" (*(addr))) -#ifdef CPU_ENABLE_SSE #define fxrstor(addr) __asm __volatile("fxrstor %0" : : "m" (*(addr))) #define fxsave(addr) __asm __volatile("fxsave %0" : "=m" (*(addr))) #define ldmxcsr(csr) __asm __volatile("ldmxcsr %0" : : "m" (csr)) #define stmxcsr(addr) __asm __volatile("stmxcsr %0" : : "m" (*(addr))) static __inline void xrstor(char *addr, uint64_t mask) { uint32_t low, hi; low = mask; hi = mask >> 32; __asm __volatile("xrstor %0" : : "m" (*addr), "a" (low), "d" (hi)); } static __inline void xsave(char *addr, uint64_t mask) { uint32_t low, hi; low = mask; hi = mask >> 32; __asm __volatile("xsave %0" : "=m" (*addr) : "a" (low), "d" (hi) : "memory"); } static __inline void xsaveopt(char *addr, uint64_t mask) { uint32_t low, hi; low = mask; hi = mask >> 32; __asm __volatile("xsaveopt %0" : "=m" (*addr) : "a" (low), "d" (hi) : "memory"); } -#endif #else /* !(__GNUCLIKE_ASM && !lint) */ void fldcw(u_short cw); void fnclex(void); void fninit(void); void fnsave(caddr_t addr); void fnstcw(caddr_t addr); void fnstsw(caddr_t addr); void fp_divide_by_0(void); void frstor(caddr_t addr); -#ifdef CPU_ENABLE_SSE void fxsave(caddr_t addr); void fxrstor(caddr_t addr); void ldmxcsr(u_int csr); void stmxcsr(u_int *csr); void xrstor(char *addr, uint64_t mask); void xsave(char *addr, uint64_t mask); void xsaveopt(char *addr, uint64_t mask); -#endif #endif /* __GNUCLIKE_ASM && !lint */ #define start_emulating() load_cr0(rcr0() | CR0_TS) #define stop_emulating() clts() -#ifdef CPU_ENABLE_SSE #define GET_FPU_CW(thread) \ (cpu_fxsr ? \ (thread)->td_pcb->pcb_save->sv_xmm.sv_env.en_cw : \ (thread)->td_pcb->pcb_save->sv_87.sv_env.en_cw) #define GET_FPU_SW(thread) \ (cpu_fxsr ? \ (thread)->td_pcb->pcb_save->sv_xmm.sv_env.en_sw : \ (thread)->td_pcb->pcb_save->sv_87.sv_env.en_sw) #define SET_FPU_CW(savefpu, value) do { \ if (cpu_fxsr) \ (savefpu)->sv_xmm.sv_env.en_cw = (value); \ else \ (savefpu)->sv_87.sv_env.en_cw = (value); \ } while (0) -#else /* CPU_ENABLE_SSE */ -#define GET_FPU_CW(thread) \ - (thread->td_pcb->pcb_save->sv_87.sv_env.en_cw) -#define GET_FPU_SW(thread) \ - (thread->td_pcb->pcb_save->sv_87.sv_env.en_sw) -#define SET_FPU_CW(savefpu, value) \ - (savefpu)->sv_87.sv_env.en_cw = (value) -#endif /* CPU_ENABLE_SSE */ -#ifdef CPU_ENABLE_SSE CTASSERT(sizeof(union savefpu) == 512); CTASSERT(sizeof(struct xstate_hdr) == 64); CTASSERT(sizeof(struct savefpu_ymm) == 832); /* * This requirement is to make it easier for asm code to calculate * offset of the fpu save area from the pcb address. FPU save area * must be 64-byte aligned. */ CTASSERT(sizeof(struct pcb) % XSAVE_AREA_ALIGN == 0); /* * Ensure the copy of XCR0 saved in a core is contained in the padding * area. */ CTASSERT(X86_XSTATE_XCR0_OFFSET >= offsetof(struct savexmm, sv_pad) && X86_XSTATE_XCR0_OFFSET + sizeof(uint64_t) <= sizeof(struct savexmm)); static void fpu_clean_state(void); -#endif static void fpusave(union savefpu *); static void fpurstor(union savefpu *); int hw_float; SYSCTL_INT(_hw, HW_FLOATINGPT, floatingpoint, CTLFLAG_RD, &hw_float, 0, "Floating point instructions executed in hardware"); -#ifdef CPU_ENABLE_SSE int use_xsave; uint64_t xsave_mask; -#endif static uma_zone_t fpu_save_area_zone; static union savefpu *npx_initialstate; -#ifdef CPU_ENABLE_SSE struct xsave_area_elm_descr { u_int offset; u_int size; } *xsave_area_desc; static int use_xsaveopt; -#endif static volatile u_int npx_traps_while_probing; alias_for_inthand_t probetrap; __asm(" \n\ .text \n\ .p2align 2,0x90 \n\ .type " __XSTRING(CNAME(probetrap)) ",@function \n\ " __XSTRING(CNAME(probetrap)) ": \n\ ss \n\ incl " __XSTRING(CNAME(npx_traps_while_probing)) " \n\ fnclex \n\ iret \n\ "); /* * Determine if an FPU is present and how to use it. */ static int npx_probe(void) { struct gate_descriptor save_idt_npxtrap; u_short control, status; /* * Modern CPUs all have an FPU that uses the INT16 interface * and provide a simple way to verify that, so handle the * common case right away. */ if (cpu_feature & CPUID_FPU) { hw_float = 1; return (1); } save_idt_npxtrap = idt[IDT_MF]; setidt(IDT_MF, probetrap, SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* * Don't trap while we're probing. */ stop_emulating(); /* * Finish resetting the coprocessor, if any. If there is an error * pending, then we may get a bogus IRQ13, but npx_intr() will handle * it OK. Bogus halts have never been observed, but we enabled * IRQ13 and cleared the BUSY# latch early to handle them anyway. */ fninit(); /* * Don't use fwait here because it might hang. * Don't use fnop here because it usually hangs if there is no FPU. */ DELAY(1000); /* wait for any IRQ13 */ #ifdef DIAGNOSTIC if (npx_traps_while_probing != 0) printf("fninit caused %u bogus npx trap(s)\n", npx_traps_while_probing); #endif /* * Check for a status of mostly zero. */ status = 0x5a5a; fnstsw(&status); if ((status & 0xb8ff) == 0) { /* * Good, now check for a proper control word. */ control = 0x5a5a; fnstcw(&control); if ((control & 0x1f3f) == 0x033f) { /* * We have an npx, now divide by 0 to see if exception * 16 works. */ control &= ~(1 << 2); /* enable divide by 0 trap */ fldcw(control); npx_traps_while_probing = 0; fp_divide_by_0(); if (npx_traps_while_probing != 0) { /* * Good, exception 16 works. */ hw_float = 1; goto cleanup; } printf( "FPU does not use exception 16 for error reporting\n"); goto cleanup; } } /* * Probe failed. Floating point simply won't work. * Notify user and disable FPU/MMX/SSE instruction execution. */ printf("WARNING: no FPU!\n"); __asm __volatile("smsw %%ax; orb %0,%%al; lmsw %%ax" : : "n" (CR0_EM | CR0_MP) : "ax"); cleanup: idt[IDT_MF] = save_idt_npxtrap; return (hw_float); } -#ifdef CPU_ENABLE_SSE /* * Enable XSAVE if supported and allowed by user. * Calculate the xsave_mask. */ static void npxinit_bsp1(void) { u_int cp[4]; uint64_t xsave_mask_user; if (cpu_fxsr && (cpu_feature2 & CPUID2_XSAVE) != 0) { use_xsave = 1; TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave); } if (!use_xsave) return; cpuid_count(0xd, 0x0, cp); xsave_mask = XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE; if ((cp[0] & xsave_mask) != xsave_mask) panic("CPU0 does not support X87 or SSE: %x", cp[0]); xsave_mask = ((uint64_t)cp[3] << 32) | cp[0]; xsave_mask_user = xsave_mask; TUNABLE_QUAD_FETCH("hw.xsave_mask", &xsave_mask_user); xsave_mask_user |= XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE; xsave_mask &= xsave_mask_user; if ((xsave_mask & XFEATURE_AVX512) != XFEATURE_AVX512) xsave_mask &= ~XFEATURE_AVX512; if ((xsave_mask & XFEATURE_MPX) != XFEATURE_MPX) xsave_mask &= ~XFEATURE_MPX; cpuid_count(0xd, 0x1, cp); if ((cp[0] & CPUID_EXTSTATE_XSAVEOPT) != 0) use_xsaveopt = 1; } -#endif + /* * Calculate the fpu save area size. */ static void npxinit_bsp2(void) { -#ifdef CPU_ENABLE_SSE u_int cp[4]; if (use_xsave) { cpuid_count(0xd, 0x0, cp); cpu_max_ext_state_size = cp[1]; /* * Reload the cpu_feature2, since we enabled OSXSAVE. */ do_cpuid(1, cp); cpu_feature2 = cp[2]; } else -#endif cpu_max_ext_state_size = sizeof(union savefpu); } /* * Initialize floating point unit. */ void npxinit(bool bsp) { static union savefpu dummy; register_t saveintr; -#ifdef CPU_ENABLE_SSE u_int mxcsr; -#endif u_short control; if (bsp) { if (!npx_probe()) return; -#ifdef CPU_ENABLE_SSE npxinit_bsp1(); -#endif } -#ifdef CPU_ENABLE_SSE if (use_xsave) { load_cr4(rcr4() | CR4_XSAVE); load_xcr(XCR0, xsave_mask); } -#endif /* * XCR0 shall be set up before CPU can report the save area size. */ if (bsp) npxinit_bsp2(); /* * fninit has the same h/w bugs as fnsave. Use the detoxified * fnsave to throw away any junk in the fpu. fpusave() initializes * the fpu. * * It is too early for critical_enter() to work on AP. */ saveintr = intr_disable(); stop_emulating(); -#ifdef CPU_ENABLE_SSE if (cpu_fxsr) fninit(); else -#endif fnsave(&dummy); control = __INITIAL_NPXCW__; fldcw(control); -#ifdef CPU_ENABLE_SSE if (cpu_fxsr) { mxcsr = __INITIAL_MXCSR__; ldmxcsr(mxcsr); } -#endif start_emulating(); intr_restore(saveintr); } /* * On the boot CPU we generate a clean state that is used to * initialize the floating point unit when it is first used by a * process. */ static void npxinitstate(void *arg __unused) { register_t saveintr; -#ifdef CPU_ENABLE_SSE int cp[4], i, max_ext_n; -#endif if (!hw_float) return; npx_initialstate = malloc(cpu_max_ext_state_size, M_DEVBUF, M_WAITOK | M_ZERO); saveintr = intr_disable(); stop_emulating(); fpusave(npx_initialstate); -#ifdef CPU_ENABLE_SSE if (cpu_fxsr) { if (npx_initialstate->sv_xmm.sv_env.en_mxcsr_mask) cpu_mxcsr_mask = npx_initialstate->sv_xmm.sv_env.en_mxcsr_mask; else cpu_mxcsr_mask = 0xFFBF; /* * The fninit instruction does not modify XMM * registers or x87 registers (MM/ST). The fpusave * call dumped the garbage contained in the registers * after reset to the initial state saved. Clear XMM * and x87 registers file image to make the startup * program state and signal handler XMM/x87 register * content predictable. */ bzero(npx_initialstate->sv_xmm.sv_fp, sizeof(npx_initialstate->sv_xmm.sv_fp)); bzero(npx_initialstate->sv_xmm.sv_xmm, sizeof(npx_initialstate->sv_xmm.sv_xmm)); } else -#endif bzero(npx_initialstate->sv_87.sv_ac, sizeof(npx_initialstate->sv_87.sv_ac)); -#ifdef CPU_ENABLE_SSE /* * Create a table describing the layout of the CPU Extended * Save Area. */ if (use_xsave) { if (xsave_mask >> 32 != 0) max_ext_n = fls(xsave_mask >> 32) + 32; else max_ext_n = fls(xsave_mask); xsave_area_desc = malloc(max_ext_n * sizeof(struct xsave_area_elm_descr), M_DEVBUF, M_WAITOK | M_ZERO); /* x87 state */ xsave_area_desc[0].offset = 0; xsave_area_desc[0].size = 160; /* XMM */ xsave_area_desc[1].offset = 160; xsave_area_desc[1].size = 288 - 160; for (i = 2; i < max_ext_n; i++) { cpuid_count(0xd, i, cp); xsave_area_desc[i].offset = cp[1]; xsave_area_desc[i].size = cp[0]; } } -#endif fpu_save_area_zone = uma_zcreate("FPU_save_area", cpu_max_ext_state_size, NULL, NULL, NULL, NULL, XSAVE_AREA_ALIGN - 1, 0); start_emulating(); intr_restore(saveintr); } SYSINIT(npxinitstate, SI_SUB_DRIVERS, SI_ORDER_ANY, npxinitstate, NULL); /* * Free coprocessor (if we have it). */ void npxexit(struct thread *td) { critical_enter(); if (curthread == PCPU_GET(fpcurthread)) { stop_emulating(); fpusave(curpcb->pcb_save); start_emulating(); PCPU_SET(fpcurthread, NULL); } critical_exit(); #ifdef NPX_DEBUG if (hw_float) { u_int masked_exceptions; masked_exceptions = GET_FPU_CW(td) & GET_FPU_SW(td) & 0x7f; /* * Log exceptions that would have trapped with the old * control word (overflow, divide by 0, and invalid operand). */ if (masked_exceptions & 0x0d) log(LOG_ERR, "pid %d (%s) exited with masked floating point exceptions 0x%02x\n", td->td_proc->p_pid, td->td_proc->p_comm, masked_exceptions); } #endif } int npxformat(void) { if (!hw_float) return (_MC_FPFMT_NODEV); -#ifdef CPU_ENABLE_SSE if (cpu_fxsr) return (_MC_FPFMT_XMM); -#endif return (_MC_FPFMT_387); } /* * The following mechanism is used to ensure that the FPE_... value * that is passed as a trapcode to the signal handler of the user * process does not have more than one bit set. * * Multiple bits may be set if the user process modifies the control * word while a status word bit is already set. While this is a sign * of bad coding, we have no choise than to narrow them down to one * bit, since we must not send a trapcode that is not exactly one of * the FPE_ macros. * * The mechanism has a static table with 127 entries. Each combination * of the 7 FPU status word exception bits directly translates to a * position in this table, where a single FPE_... value is stored. * This FPE_... value stored there is considered the "most important" * of the exception bits and will be sent as the signal code. The * precedence of the bits is based upon Intel Document "Numerical * Applications", Chapter "Special Computational Situations". * * The macro to choose one of these values does these steps: 1) Throw * away status word bits that cannot be masked. 2) Throw away the bits * currently masked in the control word, assuming the user isn't * interested in them anymore. 3) Reinsert status word bit 7 (stack * fault) if it is set, which cannot be masked but must be presered. * 4) Use the remaining bits to point into the trapcode table. * * The 6 maskable bits in order of their preference, as stated in the * above referenced Intel manual: * 1 Invalid operation (FP_X_INV) * 1a Stack underflow * 1b Stack overflow * 1c Operand of unsupported format * 1d SNaN operand. * 2 QNaN operand (not an exception, irrelavant here) * 3 Any other invalid-operation not mentioned above or zero divide * (FP_X_INV, FP_X_DZ) * 4 Denormal operand (FP_X_DNML) * 5 Numeric over/underflow (FP_X_OFL, FP_X_UFL) * 6 Inexact result (FP_X_IMP) */ static char fpetable[128] = { 0, FPE_FLTINV, /* 1 - INV */ FPE_FLTUND, /* 2 - DNML */ FPE_FLTINV, /* 3 - INV | DNML */ FPE_FLTDIV, /* 4 - DZ */ FPE_FLTINV, /* 5 - INV | DZ */ FPE_FLTDIV, /* 6 - DNML | DZ */ FPE_FLTINV, /* 7 - INV | DNML | DZ */ FPE_FLTOVF, /* 8 - OFL */ FPE_FLTINV, /* 9 - INV | OFL */ FPE_FLTUND, /* A - DNML | OFL */ FPE_FLTINV, /* B - INV | DNML | OFL */ FPE_FLTDIV, /* C - DZ | OFL */ FPE_FLTINV, /* D - INV | DZ | OFL */ FPE_FLTDIV, /* E - DNML | DZ | OFL */ FPE_FLTINV, /* F - INV | DNML | DZ | OFL */ FPE_FLTUND, /* 10 - UFL */ FPE_FLTINV, /* 11 - INV | UFL */ FPE_FLTUND, /* 12 - DNML | UFL */ FPE_FLTINV, /* 13 - INV | DNML | UFL */ FPE_FLTDIV, /* 14 - DZ | UFL */ FPE_FLTINV, /* 15 - INV | DZ | UFL */ FPE_FLTDIV, /* 16 - DNML | DZ | UFL */ FPE_FLTINV, /* 17 - INV | DNML | DZ | UFL */ FPE_FLTOVF, /* 18 - OFL | UFL */ FPE_FLTINV, /* 19 - INV | OFL | UFL */ FPE_FLTUND, /* 1A - DNML | OFL | UFL */ FPE_FLTINV, /* 1B - INV | DNML | OFL | UFL */ FPE_FLTDIV, /* 1C - DZ | OFL | UFL */ FPE_FLTINV, /* 1D - INV | DZ | OFL | UFL */ FPE_FLTDIV, /* 1E - DNML | DZ | OFL | UFL */ FPE_FLTINV, /* 1F - INV | DNML | DZ | OFL | UFL */ FPE_FLTRES, /* 20 - IMP */ FPE_FLTINV, /* 21 - INV | IMP */ FPE_FLTUND, /* 22 - DNML | IMP */ FPE_FLTINV, /* 23 - INV | DNML | IMP */ FPE_FLTDIV, /* 24 - DZ | IMP */ FPE_FLTINV, /* 25 - INV | DZ | IMP */ FPE_FLTDIV, /* 26 - DNML | DZ | IMP */ FPE_FLTINV, /* 27 - INV | DNML | DZ | IMP */ FPE_FLTOVF, /* 28 - OFL | IMP */ FPE_FLTINV, /* 29 - INV | OFL | IMP */ FPE_FLTUND, /* 2A - DNML | OFL | IMP */ FPE_FLTINV, /* 2B - INV | DNML | OFL | IMP */ FPE_FLTDIV, /* 2C - DZ | OFL | IMP */ FPE_FLTINV, /* 2D - INV | DZ | OFL | IMP */ FPE_FLTDIV, /* 2E - DNML | DZ | OFL | IMP */ FPE_FLTINV, /* 2F - INV | DNML | DZ | OFL | IMP */ FPE_FLTUND, /* 30 - UFL | IMP */ FPE_FLTINV, /* 31 - INV | UFL | IMP */ FPE_FLTUND, /* 32 - DNML | UFL | IMP */ FPE_FLTINV, /* 33 - INV | DNML | UFL | IMP */ FPE_FLTDIV, /* 34 - DZ | UFL | IMP */ FPE_FLTINV, /* 35 - INV | DZ | UFL | IMP */ FPE_FLTDIV, /* 36 - DNML | DZ | UFL | IMP */ FPE_FLTINV, /* 37 - INV | DNML | DZ | UFL | IMP */ FPE_FLTOVF, /* 38 - OFL | UFL | IMP */ FPE_FLTINV, /* 39 - INV | OFL | UFL | IMP */ FPE_FLTUND, /* 3A - DNML | OFL | UFL | IMP */ FPE_FLTINV, /* 3B - INV | DNML | OFL | UFL | IMP */ FPE_FLTDIV, /* 3C - DZ | OFL | UFL | IMP */ FPE_FLTINV, /* 3D - INV | DZ | OFL | UFL | IMP */ FPE_FLTDIV, /* 3E - DNML | DZ | OFL | UFL | IMP */ FPE_FLTINV, /* 3F - INV | DNML | DZ | OFL | UFL | IMP */ FPE_FLTSUB, /* 40 - STK */ FPE_FLTSUB, /* 41 - INV | STK */ FPE_FLTUND, /* 42 - DNML | STK */ FPE_FLTSUB, /* 43 - INV | DNML | STK */ FPE_FLTDIV, /* 44 - DZ | STK */ FPE_FLTSUB, /* 45 - INV | DZ | STK */ FPE_FLTDIV, /* 46 - DNML | DZ | STK */ FPE_FLTSUB, /* 47 - INV | DNML | DZ | STK */ FPE_FLTOVF, /* 48 - OFL | STK */ FPE_FLTSUB, /* 49 - INV | OFL | STK */ FPE_FLTUND, /* 4A - DNML | OFL | STK */ FPE_FLTSUB, /* 4B - INV | DNML | OFL | STK */ FPE_FLTDIV, /* 4C - DZ | OFL | STK */ FPE_FLTSUB, /* 4D - INV | DZ | OFL | STK */ FPE_FLTDIV, /* 4E - DNML | DZ | OFL | STK */ FPE_FLTSUB, /* 4F - INV | DNML | DZ | OFL | STK */ FPE_FLTUND, /* 50 - UFL | STK */ FPE_FLTSUB, /* 51 - INV | UFL | STK */ FPE_FLTUND, /* 52 - DNML | UFL | STK */ FPE_FLTSUB, /* 53 - INV | DNML | UFL | STK */ FPE_FLTDIV, /* 54 - DZ | UFL | STK */ FPE_FLTSUB, /* 55 - INV | DZ | UFL | STK */ FPE_FLTDIV, /* 56 - DNML | DZ | UFL | STK */ FPE_FLTSUB, /* 57 - INV | DNML | DZ | UFL | STK */ FPE_FLTOVF, /* 58 - OFL | UFL | STK */ FPE_FLTSUB, /* 59 - INV | OFL | UFL | STK */ FPE_FLTUND, /* 5A - DNML | OFL | UFL | STK */ FPE_FLTSUB, /* 5B - INV | DNML | OFL | UFL | STK */ FPE_FLTDIV, /* 5C - DZ | OFL | UFL | STK */ FPE_FLTSUB, /* 5D - INV | DZ | OFL | UFL | STK */ FPE_FLTDIV, /* 5E - DNML | DZ | OFL | UFL | STK */ FPE_FLTSUB, /* 5F - INV | DNML | DZ | OFL | UFL | STK */ FPE_FLTRES, /* 60 - IMP | STK */ FPE_FLTSUB, /* 61 - INV | IMP | STK */ FPE_FLTUND, /* 62 - DNML | IMP | STK */ FPE_FLTSUB, /* 63 - INV | DNML | IMP | STK */ FPE_FLTDIV, /* 64 - DZ | IMP | STK */ FPE_FLTSUB, /* 65 - INV | DZ | IMP | STK */ FPE_FLTDIV, /* 66 - DNML | DZ | IMP | STK */ FPE_FLTSUB, /* 67 - INV | DNML | DZ | IMP | STK */ FPE_FLTOVF, /* 68 - OFL | IMP | STK */ FPE_FLTSUB, /* 69 - INV | OFL | IMP | STK */ FPE_FLTUND, /* 6A - DNML | OFL | IMP | STK */ FPE_FLTSUB, /* 6B - INV | DNML | OFL | IMP | STK */ FPE_FLTDIV, /* 6C - DZ | OFL | IMP | STK */ FPE_FLTSUB, /* 6D - INV | DZ | OFL | IMP | STK */ FPE_FLTDIV, /* 6E - DNML | DZ | OFL | IMP | STK */ FPE_FLTSUB, /* 6F - INV | DNML | DZ | OFL | IMP | STK */ FPE_FLTUND, /* 70 - UFL | IMP | STK */ FPE_FLTSUB, /* 71 - INV | UFL | IMP | STK */ FPE_FLTUND, /* 72 - DNML | UFL | IMP | STK */ FPE_FLTSUB, /* 73 - INV | DNML | UFL | IMP | STK */ FPE_FLTDIV, /* 74 - DZ | UFL | IMP | STK */ FPE_FLTSUB, /* 75 - INV | DZ | UFL | IMP | STK */ FPE_FLTDIV, /* 76 - DNML | DZ | UFL | IMP | STK */ FPE_FLTSUB, /* 77 - INV | DNML | DZ | UFL | IMP | STK */ FPE_FLTOVF, /* 78 - OFL | UFL | IMP | STK */ FPE_FLTSUB, /* 79 - INV | OFL | UFL | IMP | STK */ FPE_FLTUND, /* 7A - DNML | OFL | UFL | IMP | STK */ FPE_FLTSUB, /* 7B - INV | DNML | OFL | UFL | IMP | STK */ FPE_FLTDIV, /* 7C - DZ | OFL | UFL | IMP | STK */ FPE_FLTSUB, /* 7D - INV | DZ | OFL | UFL | IMP | STK */ FPE_FLTDIV, /* 7E - DNML | DZ | OFL | UFL | IMP | STK */ FPE_FLTSUB, /* 7F - INV | DNML | DZ | OFL | UFL | IMP | STK */ }; /* * Read the FP status and control words, then generate si_code value * for SIGFPE. The error code chosen will be one of the * FPE_... macros. It will be sent as the second argument to old * BSD-style signal handlers and as "siginfo_t->si_code" (second * argument) to SA_SIGINFO signal handlers. * * Some time ago, we cleared the x87 exceptions with FNCLEX there. * Clearing exceptions was necessary mainly to avoid IRQ13 bugs. The * usermode code which understands the FPU hardware enough to enable * the exceptions, can also handle clearing the exception state in the * handler. The only consequence of not clearing the exception is the * rethrow of the SIGFPE on return from the signal handler and * reexecution of the corresponding instruction. * * For XMM traps, the exceptions were never cleared. */ int npxtrap_x87(void) { u_short control, status; if (!hw_float) { printf( "npxtrap_x87: fpcurthread = %p, curthread = %p, hw_float = %d\n", PCPU_GET(fpcurthread), curthread, hw_float); panic("npxtrap from nowhere"); } critical_enter(); /* * Interrupt handling (for another interrupt) may have pushed the * state to memory. Fetch the relevant parts of the state from * wherever they are. */ if (PCPU_GET(fpcurthread) != curthread) { control = GET_FPU_CW(curthread); status = GET_FPU_SW(curthread); } else { fnstcw(&control); fnstsw(&status); } critical_exit(); return (fpetable[status & ((~control & 0x3f) | 0x40)]); } -#ifdef CPU_ENABLE_SSE int npxtrap_sse(void) { u_int mxcsr; if (!hw_float) { printf( "npxtrap_sse: fpcurthread = %p, curthread = %p, hw_float = %d\n", PCPU_GET(fpcurthread), curthread, hw_float); panic("npxtrap from nowhere"); } critical_enter(); if (PCPU_GET(fpcurthread) != curthread) mxcsr = curthread->td_pcb->pcb_save->sv_xmm.sv_env.en_mxcsr; else stmxcsr(&mxcsr); critical_exit(); return (fpetable[(mxcsr & (~mxcsr >> 7)) & 0x3f]); } -#endif /* * Implement device not available (DNA) exception * * It would be better to switch FP context here (if curthread != fpcurthread) * and not necessarily for every context switch, but it is too hard to * access foreign pcb's. */ static int err_count = 0; int npxdna(void) { if (!hw_float) return (0); critical_enter(); if (PCPU_GET(fpcurthread) == curthread) { printf("npxdna: fpcurthread == curthread %d times\n", ++err_count); stop_emulating(); critical_exit(); return (1); } if (PCPU_GET(fpcurthread) != NULL) { printf("npxdna: fpcurthread = %p (%d), curthread = %p (%d)\n", PCPU_GET(fpcurthread), PCPU_GET(fpcurthread)->td_proc->p_pid, curthread, curthread->td_proc->p_pid); panic("npxdna"); } stop_emulating(); /* * Record new context early in case frstor causes a trap. */ PCPU_SET(fpcurthread, curthread); -#ifdef CPU_ENABLE_SSE if (cpu_fxsr) fpu_clean_state(); -#endif if ((curpcb->pcb_flags & PCB_NPXINITDONE) == 0) { /* * This is the first time this thread has used the FPU or * the PCB doesn't contain a clean FPU state. Explicitly * load an initial state. * * We prefer to restore the state from the actual save * area in PCB instead of directly loading from * npx_initialstate, to ignite the XSAVEOPT * tracking engine. */ bcopy(npx_initialstate, curpcb->pcb_save, cpu_max_ext_state_size); fpurstor(curpcb->pcb_save); if (curpcb->pcb_initial_npxcw != __INITIAL_NPXCW__) fldcw(curpcb->pcb_initial_npxcw); curpcb->pcb_flags |= PCB_NPXINITDONE; if (PCB_USER_FPU(curpcb)) curpcb->pcb_flags |= PCB_NPXUSERINITDONE; } else { fpurstor(curpcb->pcb_save); } critical_exit(); return (1); } /* * Wrapper for fpusave() called from context switch routines. * * npxsave() must be called with interrupts disabled, so that it clears * fpcurthread atomically with saving the state. We require callers to do the * disabling, since most callers need to disable interrupts anyway to call * npxsave() atomically with checking fpcurthread. */ void npxsave(addr) union savefpu *addr; { stop_emulating(); -#ifdef CPU_ENABLE_SSE if (use_xsaveopt) xsaveopt((char *)addr, xsave_mask); else -#endif fpusave(addr); start_emulating(); PCPU_SET(fpcurthread, NULL); } /* * Unconditionally save the current co-processor state across suspend and * resume. */ void npxsuspend(union savefpu *addr) { register_t cr0; if (!hw_float) return; if (PCPU_GET(fpcurthread) == NULL) { bcopy(npx_initialstate, addr, cpu_max_ext_state_size); return; } cr0 = rcr0(); stop_emulating(); fpusave(addr); load_cr0(cr0); } void npxresume(union savefpu *addr) { register_t cr0; if (!hw_float) return; cr0 = rcr0(); npxinit(false); stop_emulating(); fpurstor(addr); load_cr0(cr0); } void npxdrop(void) { struct thread *td; /* * Discard pending exceptions in the !cpu_fxsr case so that unmasked * ones don't cause a panic on the next frstor. */ -#ifdef CPU_ENABLE_SSE if (!cpu_fxsr) -#endif fnclex(); td = PCPU_GET(fpcurthread); KASSERT(td == curthread, ("fpudrop: fpcurthread != curthread")); CRITICAL_ASSERT(td); PCPU_SET(fpcurthread, NULL); td->td_pcb->pcb_flags &= ~PCB_NPXINITDONE; start_emulating(); } /* * Get the user state of the FPU into pcb->pcb_user_save without * dropping ownership (if possible). It returns the FPU ownership * status. */ int npxgetregs(struct thread *td) { struct pcb *pcb; -#ifdef CPU_ENABLE_SSE uint64_t *xstate_bv, bit; char *sa; int max_ext_n, i; -#endif int owned; if (!hw_float) return (_MC_FPOWNED_NONE); pcb = td->td_pcb; if ((pcb->pcb_flags & PCB_NPXINITDONE) == 0) { bcopy(npx_initialstate, get_pcb_user_save_pcb(pcb), cpu_max_ext_state_size); SET_FPU_CW(get_pcb_user_save_pcb(pcb), pcb->pcb_initial_npxcw); npxuserinited(td); return (_MC_FPOWNED_PCB); } critical_enter(); if (td == PCPU_GET(fpcurthread)) { fpusave(get_pcb_user_save_pcb(pcb)); -#ifdef CPU_ENABLE_SSE if (!cpu_fxsr) -#endif /* * fnsave initializes the FPU and destroys whatever * context it contains. Make sure the FPU owner * starts with a clean state next time. */ npxdrop(); owned = _MC_FPOWNED_FPU; } else { owned = _MC_FPOWNED_PCB; } critical_exit(); -#ifdef CPU_ENABLE_SSE if (use_xsave) { /* * Handle partially saved state. */ sa = (char *)get_pcb_user_save_pcb(pcb); xstate_bv = (uint64_t *)(sa + sizeof(union savefpu) + offsetof(struct xstate_hdr, xstate_bv)); if (xsave_mask >> 32 != 0) max_ext_n = fls(xsave_mask >> 32) + 32; else max_ext_n = fls(xsave_mask); for (i = 0; i < max_ext_n; i++) { bit = 1ULL << i; if ((xsave_mask & bit) == 0 || (*xstate_bv & bit) != 0) continue; bcopy((char *)npx_initialstate + xsave_area_desc[i].offset, sa + xsave_area_desc[i].offset, xsave_area_desc[i].size); *xstate_bv |= bit; } } -#endif return (owned); } void npxuserinited(struct thread *td) { struct pcb *pcb; pcb = td->td_pcb; if (PCB_USER_FPU(pcb)) pcb->pcb_flags |= PCB_NPXINITDONE; pcb->pcb_flags |= PCB_NPXUSERINITDONE; } -#ifdef CPU_ENABLE_SSE int npxsetxstate(struct thread *td, char *xfpustate, size_t xfpustate_size) { struct xstate_hdr *hdr, *ehdr; size_t len, max_len; uint64_t bv; /* XXXKIB should we clear all extended state in xstate_bv instead ? */ if (xfpustate == NULL) return (0); if (!use_xsave) return (EOPNOTSUPP); len = xfpustate_size; if (len < sizeof(struct xstate_hdr)) return (EINVAL); max_len = cpu_max_ext_state_size - sizeof(union savefpu); if (len > max_len) return (EINVAL); ehdr = (struct xstate_hdr *)xfpustate; bv = ehdr->xstate_bv; /* * Avoid #gp. */ if (bv & ~xsave_mask) return (EINVAL); hdr = (struct xstate_hdr *)(get_pcb_user_save_td(td) + 1); hdr->xstate_bv = bv; bcopy(xfpustate + sizeof(struct xstate_hdr), (char *)(hdr + 1), len - sizeof(struct xstate_hdr)); return (0); } -#endif int npxsetregs(struct thread *td, union savefpu *addr, char *xfpustate, size_t xfpustate_size) { struct pcb *pcb; -#ifdef CPU_ENABLE_SSE int error; -#endif if (!hw_float) return (ENXIO); pcb = td->td_pcb; critical_enter(); if (td == PCPU_GET(fpcurthread) && PCB_USER_FPU(pcb)) { -#ifdef CPU_ENABLE_SSE error = npxsetxstate(td, xfpustate, xfpustate_size); if (error != 0) { critical_exit(); return (error); } if (!cpu_fxsr) -#endif fnclex(); /* As in npxdrop(). */ bcopy(addr, get_pcb_user_save_td(td), sizeof(*addr)); fpurstor(get_pcb_user_save_td(td)); critical_exit(); pcb->pcb_flags |= PCB_NPXUSERINITDONE | PCB_NPXINITDONE; } else { critical_exit(); -#ifdef CPU_ENABLE_SSE error = npxsetxstate(td, xfpustate, xfpustate_size); if (error != 0) return (error); -#endif bcopy(addr, get_pcb_user_save_td(td), sizeof(*addr)); npxuserinited(td); } return (0); } static void fpusave(addr) union savefpu *addr; { -#ifdef CPU_ENABLE_SSE if (use_xsave) xsave((char *)addr, xsave_mask); else if (cpu_fxsr) fxsave(addr); else -#endif fnsave(addr); } -#ifdef CPU_ENABLE_SSE static void npx_fill_fpregs_xmm1(struct savexmm *sv_xmm, struct save87 *sv_87) { struct env87 *penv_87; struct envxmm *penv_xmm; int i; penv_87 = &sv_87->sv_env; penv_xmm = &sv_xmm->sv_env; /* FPU control/status */ penv_87->en_cw = penv_xmm->en_cw; penv_87->en_sw = penv_xmm->en_sw; penv_87->en_fip = penv_xmm->en_fip; penv_87->en_fcs = penv_xmm->en_fcs; penv_87->en_opcode = penv_xmm->en_opcode; penv_87->en_foo = penv_xmm->en_foo; penv_87->en_fos = penv_xmm->en_fos; /* FPU registers and tags */ penv_87->en_tw = 0xffff; for (i = 0; i < 8; ++i) { sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; if ((penv_xmm->en_tw & (1 << i)) != 0) /* zero and special are set as valid */ penv_87->en_tw &= ~(3 << i); } } void npx_fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87) { bzero(sv_87, sizeof(*sv_87)); npx_fill_fpregs_xmm1(sv_xmm, sv_87); } void npx_set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm) { struct env87 *penv_87; struct envxmm *penv_xmm; int i; penv_87 = &sv_87->sv_env; penv_xmm = &sv_xmm->sv_env; /* FPU control/status */ penv_xmm->en_cw = penv_87->en_cw; penv_xmm->en_sw = penv_87->en_sw; penv_xmm->en_fip = penv_87->en_fip; penv_xmm->en_fcs = penv_87->en_fcs; penv_xmm->en_opcode = penv_87->en_opcode; penv_xmm->en_foo = penv_87->en_foo; penv_xmm->en_fos = penv_87->en_fos; /* FPU registers and tags */ penv_xmm->en_tw = 0; for (i = 0; i < 8; ++i) { sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; if ((penv_87->en_tw && (3 << i)) != (3 << i)) penv_xmm->en_tw |= 1 << i; } } -#endif /* CPU_ENABLE_SSE */ void npx_get_fsave(void *addr) { struct thread *td; union savefpu *sv; td = curthread; npxgetregs(td); sv = get_pcb_user_save_td(td); -#ifdef CPU_ENABLE_SSE if (cpu_fxsr) npx_fill_fpregs_xmm1(&sv->sv_xmm, addr); else -#endif bcopy(sv, addr, sizeof(struct env87) + sizeof(struct fpacc87[8])); } int npx_set_fsave(void *addr) { union savefpu sv; int error; bzero(&sv, sizeof(sv)); -#ifdef CPU_ENABLE_SSE if (cpu_fxsr) npx_set_fpregs_xmm(addr, &sv.sv_xmm); else -#endif bcopy(addr, &sv, sizeof(struct env87) + sizeof(struct fpacc87[8])); error = npxsetregs(curthread, &sv, NULL, 0); return (error); } -#ifdef CPU_ENABLE_SSE /* * On AuthenticAMD processors, the fxrstor instruction does not restore * the x87's stored last instruction pointer, last data pointer, and last * opcode values, except in the rare case in which the exception summary * (ES) bit in the x87 status word is set to 1. * * In order to avoid leaking this information across processes, we clean * these values by performing a dummy load before executing fxrstor(). */ static void fpu_clean_state(void) { static float dummy_variable = 0.0; u_short status; /* * Clear the ES bit in the x87 status word if it is currently * set, in order to avoid causing a fault in the upcoming load. */ fnstsw(&status); if (status & 0x80) fnclex(); /* * Load the dummy variable into the x87 stack. This mangles * the x87 stack, but we don't care since we're about to call * fxrstor() anyway. */ __asm __volatile("ffree %%st(7); flds %0" : : "m" (dummy_variable)); } -#endif /* CPU_ENABLE_SSE */ static void fpurstor(union savefpu *addr) { -#ifdef CPU_ENABLE_SSE if (use_xsave) xrstor((char *)addr, xsave_mask); else if (cpu_fxsr) fxrstor(addr); else -#endif frstor(addr); } #ifdef DEV_ISA /* * This sucks up the legacy ISA support assignments from PNPBIOS/ACPI. */ static struct isa_pnp_id npxisa_ids[] = { { 0x040cd041, "Legacy ISA coprocessor support" }, /* PNP0C04 */ { 0 } }; static int npxisa_probe(device_t dev) { int result; if ((result = ISA_PNP_PROBE(device_get_parent(dev), dev, npxisa_ids)) <= 0) { device_quiet(dev); } return(result); } static int npxisa_attach(device_t dev) { return (0); } static device_method_t npxisa_methods[] = { /* Device interface */ DEVMETHOD(device_probe, npxisa_probe), DEVMETHOD(device_attach, npxisa_attach), DEVMETHOD(device_detach, bus_generic_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, bus_generic_resume), { 0, 0 } }; static driver_t npxisa_driver = { "npxisa", npxisa_methods, 1, /* no softc */ }; static devclass_t npxisa_devclass; DRIVER_MODULE(npxisa, isa, npxisa_driver, npxisa_devclass, 0, 0); DRIVER_MODULE(npxisa, acpi, npxisa_driver, npxisa_devclass, 0, 0); #endif /* DEV_ISA */ static MALLOC_DEFINE(M_FPUKERN_CTX, "fpukern_ctx", "Kernel contexts for FPU state"); #define FPU_KERN_CTX_NPXINITDONE 0x01 #define FPU_KERN_CTX_DUMMY 0x02 #define FPU_KERN_CTX_INUSE 0x04 struct fpu_kern_ctx { union savefpu *prev; uint32_t flags; char hwstate1[]; }; struct fpu_kern_ctx * fpu_kern_alloc_ctx(u_int flags) { struct fpu_kern_ctx *res; size_t sz; sz = sizeof(struct fpu_kern_ctx) + XSAVE_AREA_ALIGN + cpu_max_ext_state_size; res = malloc(sz, M_FPUKERN_CTX, ((flags & FPU_KERN_NOWAIT) ? M_NOWAIT : M_WAITOK) | M_ZERO); return (res); } void fpu_kern_free_ctx(struct fpu_kern_ctx *ctx) { KASSERT((ctx->flags & FPU_KERN_CTX_INUSE) == 0, ("free'ing inuse ctx")); /* XXXKIB clear the memory ? */ free(ctx, M_FPUKERN_CTX); } static union savefpu * fpu_kern_ctx_savefpu(struct fpu_kern_ctx *ctx) { vm_offset_t p; p = (vm_offset_t)&ctx->hwstate1; p = roundup2(p, XSAVE_AREA_ALIGN); return ((union savefpu *)p); } int fpu_kern_enter(struct thread *td, struct fpu_kern_ctx *ctx, u_int flags) { struct pcb *pcb; KASSERT((ctx->flags & FPU_KERN_CTX_INUSE) == 0, ("using inuse ctx")); if ((flags & FPU_KERN_KTHR) != 0 && is_fpu_kern_thread(0)) { ctx->flags = FPU_KERN_CTX_DUMMY | FPU_KERN_CTX_INUSE; return (0); } pcb = td->td_pcb; KASSERT(!PCB_USER_FPU(pcb) || pcb->pcb_save == get_pcb_user_save_pcb(pcb), ("mangled pcb_save")); ctx->flags = FPU_KERN_CTX_INUSE; if ((pcb->pcb_flags & PCB_NPXINITDONE) != 0) ctx->flags |= FPU_KERN_CTX_NPXINITDONE; npxexit(td); ctx->prev = pcb->pcb_save; pcb->pcb_save = fpu_kern_ctx_savefpu(ctx); pcb->pcb_flags |= PCB_KERNNPX; pcb->pcb_flags &= ~PCB_NPXINITDONE; return (0); } int fpu_kern_leave(struct thread *td, struct fpu_kern_ctx *ctx) { struct pcb *pcb; KASSERT((ctx->flags & FPU_KERN_CTX_INUSE) != 0, ("leaving not inuse ctx")); ctx->flags &= ~FPU_KERN_CTX_INUSE; if (is_fpu_kern_thread(0) && (ctx->flags & FPU_KERN_CTX_DUMMY) != 0) return (0); pcb = td->td_pcb; critical_enter(); if (curthread == PCPU_GET(fpcurthread)) npxdrop(); critical_exit(); pcb->pcb_save = ctx->prev; if (pcb->pcb_save == get_pcb_user_save_pcb(pcb)) { if ((pcb->pcb_flags & PCB_NPXUSERINITDONE) != 0) pcb->pcb_flags |= PCB_NPXINITDONE; else pcb->pcb_flags &= ~PCB_NPXINITDONE; pcb->pcb_flags &= ~PCB_KERNNPX; } else { if ((ctx->flags & FPU_KERN_CTX_NPXINITDONE) != 0) pcb->pcb_flags |= PCB_NPXINITDONE; else pcb->pcb_flags &= ~PCB_NPXINITDONE; KASSERT(!PCB_USER_FPU(pcb), ("unpaired fpu_kern_leave")); } return (0); } int fpu_kern_thread(u_int flags) { KASSERT((curthread->td_pflags & TDP_KTHREAD) != 0, ("Only kthread may use fpu_kern_thread")); KASSERT(curpcb->pcb_save == get_pcb_user_save_pcb(curpcb), ("mangled pcb_save")); KASSERT(PCB_USER_FPU(curpcb), ("recursive call")); curpcb->pcb_flags |= PCB_KERNNPX; return (0); } int is_fpu_kern_thread(u_int flags) { if ((curthread->td_pflags & TDP_KTHREAD) == 0) return (0); return ((curpcb->pcb_flags & PCB_KERNNPX) != 0); } /* * FPU save area alloc/free/init utility routines */ union savefpu * fpu_save_area_alloc(void) { return (uma_zalloc(fpu_save_area_zone, 0)); } void fpu_save_area_free(union savefpu *fsa) { uma_zfree(fpu_save_area_zone, fsa); } void fpu_save_area_reset(union savefpu *fsa) { bcopy(npx_initialstate, fsa, cpu_max_ext_state_size); } Index: head/sys/i386/linux/linux_ptrace.c =================================================================== --- head/sys/i386/linux/linux_ptrace.c (revision 313153) +++ head/sys/i386/linux/linux_ptrace.c (revision 313154) @@ -1,486 +1,474 @@ /*- * Copyright (c) 2001 Alexander Kabaev * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_cpu.h" #include #include #include #include #include #include #include #include #include #include #include #include #include -#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) -#define CPU_ENABLE_SSE -#endif - /* * Linux ptrace requests numbers. Mostly identical to FreeBSD, * except for MD ones and PT_ATTACH/PT_DETACH. */ #define PTRACE_TRACEME 0 #define PTRACE_PEEKTEXT 1 #define PTRACE_PEEKDATA 2 #define PTRACE_PEEKUSR 3 #define PTRACE_POKETEXT 4 #define PTRACE_POKEDATA 5 #define PTRACE_POKEUSR 6 #define PTRACE_CONT 7 #define PTRACE_KILL 8 #define PTRACE_SINGLESTEP 9 #define PTRACE_ATTACH 16 #define PTRACE_DETACH 17 #define LINUX_PTRACE_SYSCALL 24 #define PTRACE_GETREGS 12 #define PTRACE_SETREGS 13 #define PTRACE_GETFPREGS 14 #define PTRACE_SETFPREGS 15 #define PTRACE_GETFPXREGS 18 #define PTRACE_SETFPXREGS 19 #define PTRACE_SETOPTIONS 21 /* * Linux keeps debug registers at the following * offset in the user struct */ #define LINUX_DBREG_OFFSET 252 #define LINUX_DBREG_SIZE (8*sizeof(l_int)) static __inline int map_signum(int signum) { signum = linux_to_bsd_signal(signum); return ((signum == SIGSTOP)? 0 : signum); } struct linux_pt_reg { l_long ebx; l_long ecx; l_long edx; l_long esi; l_long edi; l_long ebp; l_long eax; l_int xds; l_int xes; l_int xfs; l_int xgs; l_long orig_eax; l_long eip; l_int xcs; l_long eflags; l_long esp; l_int xss; }; /* * Translate i386 ptrace registers between Linux and FreeBSD formats. * The translation is pretty straighforward, for all registers, but * orig_eax on Linux side and r_trapno and r_err in FreeBSD */ static void map_regs_to_linux(struct reg *bsd_r, struct linux_pt_reg *linux_r) { linux_r->ebx = bsd_r->r_ebx; linux_r->ecx = bsd_r->r_ecx; linux_r->edx = bsd_r->r_edx; linux_r->esi = bsd_r->r_esi; linux_r->edi = bsd_r->r_edi; linux_r->ebp = bsd_r->r_ebp; linux_r->eax = bsd_r->r_eax; linux_r->xds = bsd_r->r_ds; linux_r->xes = bsd_r->r_es; linux_r->xfs = bsd_r->r_fs; linux_r->xgs = bsd_r->r_gs; linux_r->orig_eax = bsd_r->r_eax; linux_r->eip = bsd_r->r_eip; linux_r->xcs = bsd_r->r_cs; linux_r->eflags = bsd_r->r_eflags; linux_r->esp = bsd_r->r_esp; linux_r->xss = bsd_r->r_ss; } static void map_regs_from_linux(struct reg *bsd_r, struct linux_pt_reg *linux_r) { bsd_r->r_ebx = linux_r->ebx; bsd_r->r_ecx = linux_r->ecx; bsd_r->r_edx = linux_r->edx; bsd_r->r_esi = linux_r->esi; bsd_r->r_edi = linux_r->edi; bsd_r->r_ebp = linux_r->ebp; bsd_r->r_eax = linux_r->eax; bsd_r->r_ds = linux_r->xds; bsd_r->r_es = linux_r->xes; bsd_r->r_fs = linux_r->xfs; bsd_r->r_gs = linux_r->xgs; bsd_r->r_eip = linux_r->eip; bsd_r->r_cs = linux_r->xcs; bsd_r->r_eflags = linux_r->eflags; bsd_r->r_esp = linux_r->esp; bsd_r->r_ss = linux_r->xss; } struct linux_pt_fpreg { l_long cwd; l_long swd; l_long twd; l_long fip; l_long fcs; l_long foo; l_long fos; l_long st_space[2*10]; }; static void map_fpregs_to_linux(struct fpreg *bsd_r, struct linux_pt_fpreg *linux_r) { linux_r->cwd = bsd_r->fpr_env[0]; linux_r->swd = bsd_r->fpr_env[1]; linux_r->twd = bsd_r->fpr_env[2]; linux_r->fip = bsd_r->fpr_env[3]; linux_r->fcs = bsd_r->fpr_env[4]; linux_r->foo = bsd_r->fpr_env[5]; linux_r->fos = bsd_r->fpr_env[6]; bcopy(bsd_r->fpr_acc, linux_r->st_space, sizeof(linux_r->st_space)); } static void map_fpregs_from_linux(struct fpreg *bsd_r, struct linux_pt_fpreg *linux_r) { bsd_r->fpr_env[0] = linux_r->cwd; bsd_r->fpr_env[1] = linux_r->swd; bsd_r->fpr_env[2] = linux_r->twd; bsd_r->fpr_env[3] = linux_r->fip; bsd_r->fpr_env[4] = linux_r->fcs; bsd_r->fpr_env[5] = linux_r->foo; bsd_r->fpr_env[6] = linux_r->fos; bcopy(bsd_r->fpr_acc, linux_r->st_space, sizeof(bsd_r->fpr_acc)); } struct linux_pt_fpxreg { l_ushort cwd; l_ushort swd; l_ushort twd; l_ushort fop; l_long fip; l_long fcs; l_long foo; l_long fos; l_long mxcsr; l_long reserved; l_long st_space[32]; l_long xmm_space[32]; l_long padding[56]; }; -#ifdef CPU_ENABLE_SSE static int linux_proc_read_fpxregs(struct thread *td, struct linux_pt_fpxreg *fpxregs) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); if (cpu_fxsr == 0 || (td->td_proc->p_flag & P_INMEM) == 0) return (EIO); bcopy(&get_pcb_user_save_td(td)->sv_xmm, fpxregs, sizeof(*fpxregs)); return (0); } static int linux_proc_write_fpxregs(struct thread *td, struct linux_pt_fpxreg *fpxregs) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); if (cpu_fxsr == 0 || (td->td_proc->p_flag & P_INMEM) == 0) return (EIO); bcopy(fpxregs, &get_pcb_user_save_td(td)->sv_xmm, sizeof(*fpxregs)); return (0); } -#endif int linux_ptrace(struct thread *td, struct linux_ptrace_args *uap) { union { struct linux_pt_reg reg; struct linux_pt_fpreg fpreg; struct linux_pt_fpxreg fpxreg; } r; union { struct reg bsd_reg; struct fpreg bsd_fpreg; struct dbreg bsd_dbreg; } u; void *addr; pid_t pid; int error, req; error = 0; /* by default, just copy data intact */ req = uap->req; pid = (pid_t)uap->pid; addr = (void *)uap->addr; switch (req) { case PTRACE_TRACEME: case PTRACE_POKETEXT: case PTRACE_POKEDATA: case PTRACE_KILL: error = kern_ptrace(td, req, pid, addr, uap->data); break; case PTRACE_PEEKTEXT: case PTRACE_PEEKDATA: { /* need to preserve return value */ int rval = td->td_retval[0]; error = kern_ptrace(td, req, pid, addr, 0); if (error == 0) error = copyout(td->td_retval, (void *)uap->data, sizeof(l_int)); td->td_retval[0] = rval; break; } case PTRACE_DETACH: error = kern_ptrace(td, PT_DETACH, pid, (void *)1, map_signum(uap->data)); break; case PTRACE_SINGLESTEP: case PTRACE_CONT: error = kern_ptrace(td, req, pid, (void *)1, map_signum(uap->data)); break; case PTRACE_ATTACH: error = kern_ptrace(td, PT_ATTACH, pid, addr, uap->data); break; case PTRACE_GETREGS: /* Linux is using data where FreeBSD is using addr */ error = kern_ptrace(td, PT_GETREGS, pid, &u.bsd_reg, 0); if (error == 0) { map_regs_to_linux(&u.bsd_reg, &r.reg); error = copyout(&r.reg, (void *)uap->data, sizeof(r.reg)); } break; case PTRACE_SETREGS: /* Linux is using data where FreeBSD is using addr */ error = copyin((void *)uap->data, &r.reg, sizeof(r.reg)); if (error == 0) { map_regs_from_linux(&u.bsd_reg, &r.reg); error = kern_ptrace(td, PT_SETREGS, pid, &u.bsd_reg, 0); } break; case PTRACE_GETFPREGS: /* Linux is using data where FreeBSD is using addr */ error = kern_ptrace(td, PT_GETFPREGS, pid, &u.bsd_fpreg, 0); if (error == 0) { map_fpregs_to_linux(&u.bsd_fpreg, &r.fpreg); error = copyout(&r.fpreg, (void *)uap->data, sizeof(r.fpreg)); } break; case PTRACE_SETFPREGS: /* Linux is using data where FreeBSD is using addr */ error = copyin((void *)uap->data, &r.fpreg, sizeof(r.fpreg)); if (error == 0) { map_fpregs_from_linux(&u.bsd_fpreg, &r.fpreg); error = kern_ptrace(td, PT_SETFPREGS, pid, &u.bsd_fpreg, 0); } break; case PTRACE_SETFPXREGS: -#ifdef CPU_ENABLE_SSE error = copyin((void *)uap->data, &r.fpxreg, sizeof(r.fpxreg)); if (error) break; -#endif /* FALL THROUGH */ case PTRACE_GETFPXREGS: { -#ifdef CPU_ENABLE_SSE struct proc *p; struct thread *td2; if (sizeof(struct linux_pt_fpxreg) != sizeof(struct savexmm)) { static int once = 0; if (!once) { printf("linux: savexmm != linux_pt_fpxreg\n"); once = 1; } error = EIO; break; } if ((p = pfind(uap->pid)) == NULL) { error = ESRCH; break; } /* Exiting processes can't be debugged. */ if ((p->p_flag & P_WEXIT) != 0) { error = ESRCH; goto fail; } if ((error = p_candebug(td, p)) != 0) goto fail; /* System processes can't be debugged. */ if ((p->p_flag & P_SYSTEM) != 0) { error = EINVAL; goto fail; } /* not being traced... */ if ((p->p_flag & P_TRACED) == 0) { error = EPERM; goto fail; } /* not being traced by YOU */ if (p->p_pptr != td->td_proc) { error = EBUSY; goto fail; } /* not currently stopped */ if (!P_SHOULDSTOP(p) || (p->p_flag & P_WAITED) == 0) { error = EBUSY; goto fail; } if (req == PTRACE_GETFPXREGS) { _PHOLD(p); /* may block */ td2 = FIRST_THREAD_IN_PROC(p); error = linux_proc_read_fpxregs(td2, &r.fpxreg); _PRELE(p); PROC_UNLOCK(p); if (error == 0) error = copyout(&r.fpxreg, (void *)uap->data, sizeof(r.fpxreg)); } else { /* clear dangerous bits exactly as Linux does*/ r.fpxreg.mxcsr &= 0xffbf; _PHOLD(p); /* may block */ td2 = FIRST_THREAD_IN_PROC(p); error = linux_proc_write_fpxregs(td2, &r.fpxreg); _PRELE(p); PROC_UNLOCK(p); } break; fail: PROC_UNLOCK(p); -#else - error = EIO; -#endif break; } case PTRACE_PEEKUSR: case PTRACE_POKEUSR: { error = EIO; /* check addr for alignment */ if (uap->addr < 0 || uap->addr & (sizeof(l_int) - 1)) break; /* * Allow linux programs to access register values in * user struct. We simulate this through PT_GET/SETREGS * as necessary. */ if (uap->addr < sizeof(struct linux_pt_reg)) { error = kern_ptrace(td, PT_GETREGS, pid, &u.bsd_reg, 0); if (error != 0) break; map_regs_to_linux(&u.bsd_reg, &r.reg); if (req == PTRACE_PEEKUSR) { error = copyout((char *)&r.reg + uap->addr, (void *)uap->data, sizeof(l_int)); break; } *(l_int *)((char *)&r.reg + uap->addr) = (l_int)uap->data; map_regs_from_linux(&u.bsd_reg, &r.reg); error = kern_ptrace(td, PT_SETREGS, pid, &u.bsd_reg, 0); } /* * Simulate debug registers access */ if (uap->addr >= LINUX_DBREG_OFFSET && uap->addr <= LINUX_DBREG_OFFSET + LINUX_DBREG_SIZE) { error = kern_ptrace(td, PT_GETDBREGS, pid, &u.bsd_dbreg, 0); if (error != 0) break; uap->addr -= LINUX_DBREG_OFFSET; if (req == PTRACE_PEEKUSR) { error = copyout((char *)&u.bsd_dbreg + uap->addr, (void *)uap->data, sizeof(l_int)); break; } *(l_int *)((char *)&u.bsd_dbreg + uap->addr) = uap->data; error = kern_ptrace(td, PT_SETDBREGS, pid, &u.bsd_dbreg, 0); } break; } case LINUX_PTRACE_SYSCALL: /* fall through */ default: printf("linux: ptrace(%u, ...) not implemented\n", (unsigned int)uap->req); error = EINVAL; break; } return (error); } Index: head/sys/x86/acpica/acpi_wakeup.c =================================================================== --- head/sys/x86/acpica/acpi_wakeup.c (revision 313153) +++ head/sys/x86/acpica/acpi_wakeup.c (revision 313154) @@ -1,427 +1,424 @@ /*- * Copyright (c) 2001 Takanori Watanabe * Copyright (c) 2001-2012 Mitsuru IWASAKI * Copyright (c) 2003 Peter Wemm * Copyright (c) 2008-2012 Jung-uk Kim * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #if defined(__amd64__) #define DEV_APIC #else #include "opt_apic.h" #endif -#ifdef __i386__ -#include "opt_npx.h" -#endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include #include #endif #ifdef SMP #include #include #endif #include #include #include "acpi_wakecode.h" #include "acpi_wakedata.h" /* Make sure the code is less than a page and leave room for the stack. */ CTASSERT(sizeof(wakecode) < PAGE_SIZE - 1024); extern int acpi_resume_beep; extern int acpi_reset_video; #ifdef SMP extern struct susppcb **susppcbs; static cpuset_t suspcpus; #else static struct susppcb **susppcbs; #endif static void *acpi_alloc_wakeup_handler(void **); static void acpi_stop_beep(void *); #ifdef SMP static int acpi_wakeup_ap(struct acpi_softc *, int); static void acpi_wakeup_cpus(struct acpi_softc *); #endif #ifdef __amd64__ #define ACPI_WAKEPAGES 4 #else #define ACPI_WAKEPAGES 1 #endif #define WAKECODE_FIXUP(offset, type, val) do { \ type *addr; \ addr = (type *)(sc->acpi_wakeaddr + (offset)); \ *addr = val; \ } while (0) static void acpi_stop_beep(void *arg) { if (acpi_resume_beep != 0) timer_spkr_release(); } #ifdef SMP static int acpi_wakeup_ap(struct acpi_softc *sc, int cpu) { struct pcb *pcb; int vector = (sc->acpi_wakephys >> 12) & 0xff; int apic_id = cpu_apic_ids[cpu]; int ms; pcb = &susppcbs[cpu]->sp_pcb; WAKECODE_FIXUP(wakeup_pcb, struct pcb *, pcb); WAKECODE_FIXUP(wakeup_gdt, uint16_t, pcb->pcb_gdt.rd_limit); WAKECODE_FIXUP(wakeup_gdt + 2, uint64_t, pcb->pcb_gdt.rd_base); ipi_startup(apic_id, vector); /* Wait up to 5 seconds for it to resume. */ for (ms = 0; ms < 5000; ms++) { if (!CPU_ISSET(cpu, &suspended_cpus)) return (1); /* return SUCCESS */ DELAY(1000); } return (0); /* return FAILURE */ } #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) static void acpi_wakeup_cpus(struct acpi_softc *sc) { uint32_t mpbioswarmvec; int cpu; u_char mpbiosreason; /* save the current value of the warm-start vector */ mpbioswarmvec = *((uint32_t *)WARMBOOT_OFF); outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); /* setup a vector to our boot code */ *((volatile u_short *)WARMBOOT_OFF) = WARMBOOT_TARGET; *((volatile u_short *)WARMBOOT_SEG) = sc->acpi_wakephys >> 4; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ /* Wake up each AP. */ for (cpu = 1; cpu < mp_ncpus; cpu++) { if (!CPU_ISSET(cpu, &suspcpus)) continue; if (acpi_wakeup_ap(sc, cpu) == 0) { /* restore the warmstart vector */ *(uint32_t *)WARMBOOT_OFF = mpbioswarmvec; panic("acpi_wakeup: failed to resume AP #%d (PHY #%d)", cpu, cpu_apic_ids[cpu]); } } /* restore the warmstart vector */ *(uint32_t *)WARMBOOT_OFF = mpbioswarmvec; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); } #endif int acpi_sleep_machdep(struct acpi_softc *sc, int state) { ACPI_STATUS status; struct pcb *pcb; if (sc->acpi_wakeaddr == 0ul) return (-1); /* couldn't alloc wake memory */ #ifdef SMP suspcpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &suspcpus); #endif if (acpi_resume_beep != 0) timer_spkr_acquire(); AcpiSetFirmwareWakingVector(sc->acpi_wakephys, 0); intr_suspend(); pcb = &susppcbs[0]->sp_pcb; if (savectx(pcb)) { #ifdef __amd64__ fpususpend(susppcbs[0]->sp_fpususpend); -#elif defined(DEV_NPX) +#else npxsuspend(susppcbs[0]->sp_fpususpend); #endif #ifdef SMP if (!CPU_EMPTY(&suspcpus) && suspend_cpus(suspcpus) == 0) { device_printf(sc->acpi_dev, "Failed to suspend APs\n"); return (0); /* couldn't sleep */ } #endif WAKECODE_FIXUP(resume_beep, uint8_t, (acpi_resume_beep != 0)); WAKECODE_FIXUP(reset_video, uint8_t, (acpi_reset_video != 0)); #ifndef __amd64__ WAKECODE_FIXUP(wakeup_cr4, register_t, pcb->pcb_cr4); #endif WAKECODE_FIXUP(wakeup_pcb, struct pcb *, pcb); WAKECODE_FIXUP(wakeup_gdt, uint16_t, pcb->pcb_gdt.rd_limit); WAKECODE_FIXUP(wakeup_gdt + 2, uint64_t, pcb->pcb_gdt.rd_base); /* Call ACPICA to enter the desired sleep state */ if (state == ACPI_STATE_S4 && sc->acpi_s4bios) status = AcpiEnterSleepStateS4bios(); else status = AcpiEnterSleepState(state); if (ACPI_FAILURE(status)) { device_printf(sc->acpi_dev, "AcpiEnterSleepState failed - %s\n", AcpiFormatException(status)); return (0); /* couldn't sleep */ } for (;;) ia32_pause(); } else { #ifdef __amd64__ fpuresume(susppcbs[0]->sp_fpususpend); -#elif defined(DEV_NPX) +#else npxresume(susppcbs[0]->sp_fpususpend); #endif } return (1); /* wakeup successfully */ } int acpi_wakeup_machdep(struct acpi_softc *sc, int state, int sleep_result, int intr_enabled) { if (sleep_result == -1) return (sleep_result); if (!intr_enabled) { /* Wakeup MD procedures in interrupt disabled context */ if (sleep_result == 1) { pmap_init_pat(); initializecpu(); PCPU_SET(switchtime, 0); PCPU_SET(switchticks, ticks); #ifdef DEV_APIC lapic_xapic_mode(); #endif #ifdef SMP if (!CPU_EMPTY(&suspcpus)) acpi_wakeup_cpus(sc); #endif } #ifdef SMP if (!CPU_EMPTY(&suspcpus)) restart_cpus(suspcpus); #endif mca_resume(); #ifdef __amd64__ if (vmm_resume_p != NULL) vmm_resume_p(); #endif intr_resume(/*suspend_cancelled*/false); AcpiSetFirmwareWakingVector(0, 0); } else { /* Wakeup MD procedures in interrupt enabled context */ if (sleep_result == 1 && mem_range_softc.mr_op != NULL && mem_range_softc.mr_op->reinit != NULL) mem_range_softc.mr_op->reinit(&mem_range_softc); } return (sleep_result); } static void * acpi_alloc_wakeup_handler(void *wakepages[ACPI_WAKEPAGES]) { int i; memset(wakepages, 0, ACPI_WAKEPAGES * sizeof(*wakepages)); /* * Specify the region for our wakeup code. We want it in the low 1 MB * region, excluding real mode IVT (0-0x3ff), BDA (0x400-0x4ff), EBDA * (less than 128KB, below 0xa0000, must be excluded by SMAP and DSDT), * and ROM area (0xa0000 and above). The temporary page tables must be * page-aligned. */ for (i = 0; i < ACPI_WAKEPAGES; i++) { wakepages[i] = contigmalloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT, 0x500, 0xa0000, PAGE_SIZE, 0ul); if (wakepages[i] == NULL) { printf("%s: can't alloc wake memory\n", __func__); goto freepages; } } if (EVENTHANDLER_REGISTER(power_resume, acpi_stop_beep, NULL, EVENTHANDLER_PRI_LAST) == NULL) { printf("%s: can't register event handler\n", __func__); goto freepages; } susppcbs = malloc(mp_ncpus * sizeof(*susppcbs), M_DEVBUF, M_WAITOK); for (i = 0; i < mp_ncpus; i++) { susppcbs[i] = malloc(sizeof(**susppcbs), M_DEVBUF, M_WAITOK); susppcbs[i]->sp_fpususpend = alloc_fpusave(M_WAITOK); } return (wakepages); freepages: for (i = 0; i < ACPI_WAKEPAGES; i++) if (wakepages[i] != NULL) contigfree(wakepages[i], PAGE_SIZE, M_DEVBUF); return (NULL); } void acpi_install_wakeup_handler(struct acpi_softc *sc) { static void *wakeaddr; void *wakepages[ACPI_WAKEPAGES]; #ifdef __amd64__ uint64_t *pt4, *pt3, *pt2; vm_paddr_t pt4pa, pt3pa, pt2pa; int i; #endif if (wakeaddr != NULL) return; if (acpi_alloc_wakeup_handler(wakepages) == NULL) return; wakeaddr = wakepages[0]; sc->acpi_wakeaddr = (vm_offset_t)wakeaddr; sc->acpi_wakephys = vtophys(wakeaddr); #ifdef __amd64__ pt4 = wakepages[1]; pt3 = wakepages[2]; pt2 = wakepages[3]; pt4pa = vtophys(pt4); pt3pa = vtophys(pt3); pt2pa = vtophys(pt2); #endif bcopy(wakecode, (void *)sc->acpi_wakeaddr, sizeof(wakecode)); /* Patch GDT base address, ljmp targets. */ WAKECODE_FIXUP((bootgdtdesc + 2), uint32_t, sc->acpi_wakephys + bootgdt); WAKECODE_FIXUP((wakeup_sw32 + 2), uint32_t, sc->acpi_wakephys + wakeup_32); #ifdef __amd64__ WAKECODE_FIXUP((wakeup_sw64 + 1), uint32_t, sc->acpi_wakephys + wakeup_64); WAKECODE_FIXUP(wakeup_pagetables, uint32_t, pt4pa); #endif /* Save pointers to some global data. */ WAKECODE_FIXUP(wakeup_ret, void *, resumectx); #ifndef __amd64__ #if defined(PAE) || defined(PAE_TABLES) WAKECODE_FIXUP(wakeup_cr3, register_t, vtophys(kernel_pmap->pm_pdpt)); #else WAKECODE_FIXUP(wakeup_cr3, register_t, vtophys(kernel_pmap->pm_pdir)); #endif #else /* __amd64__ */ /* Create the initial 1GB replicated page tables */ for (i = 0; i < 512; i++) { /* * Each slot of the level 4 pages points * to the same level 3 page */ pt4[i] = (uint64_t)pt3pa; pt4[i] |= PG_V | PG_RW | PG_U; /* * Each slot of the level 3 pages points * to the same level 2 page */ pt3[i] = (uint64_t)pt2pa; pt3[i] |= PG_V | PG_RW | PG_U; /* The level 2 page slots are mapped with 2MB pages for 1GB. */ pt2[i] = i * (2 * 1024 * 1024); pt2[i] |= PG_V | PG_RW | PG_PS | PG_U; } #endif /* !__amd64__ */ if (bootverbose) device_printf(sc->acpi_dev, "wakeup code va %#jx pa %#jx\n", (uintmax_t)sc->acpi_wakeaddr, (uintmax_t)sc->acpi_wakephys); } Index: head/sys/x86/x86/cpu_machdep.c =================================================================== --- head/sys/x86/x86/cpu_machdep.c (revision 313153) +++ head/sys/x86/x86/cpu_machdep.c (revision 313154) @@ -1,571 +1,570 @@ /*- * Copyright (c) 2003 Peter Wemm. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_atpic.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_isa.h" #include "opt_kdb.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_mp_watchdog.h" #include "opt_platform.h" #ifdef __i386__ -#include "opt_npx.h" #include "opt_apic.h" #include "opt_xbox.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #define STATE_RUNNING 0x0 #define STATE_MWAIT 0x1 #define STATE_SLEEPING 0x2 /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* * Flush the D-cache for non-DMA I/O so that the I-cache can * be made coherent later. */ void cpu_flush_dcache(void *ptr, size_t len) { /* Not applicable */ } void acpi_cpu_c1(void) { __asm __volatile("sti; hlt"); } void acpi_cpu_idle_mwait(uint32_t mwait_hint) { int *state; /* * XXXKIB. Software coordination mode should be supported, * but all Intel CPUs provide hardware coordination. */ state = (int *)PCPU_PTR(monitorbuf); KASSERT(*state == STATE_SLEEPING, ("cpu_mwait_cx: wrong monitorbuf state")); *state = STATE_MWAIT; cpu_monitor(state, 0, 0); if (*state == STATE_MWAIT) cpu_mwait(MWAIT_INTRBREAK, mwait_hint); /* * We should exit on any event that interrupts mwait, because * that event might be a wanted interrupt. */ *state = STATE_RUNNING; } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { uint64_t tsc1, tsc2; uint64_t acnt, mcnt, perf; register_t reg; if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); #ifdef __i386__ if ((cpu_feature & CPUID_TSC) == 0) return (EOPNOTSUPP); #endif /* * If TSC is P-state invariant and APERF/MPERF MSRs do not exist, * DELAY(9) based logic fails. */ if (tsc_is_invariant && !tsc_perf_stat) return (EOPNOTSUPP); #ifdef SMP if (smp_cpus > 1) { /* Schedule ourselves on the indicated cpu. */ thread_lock(curthread); sched_bind(curthread, cpu_id); thread_unlock(curthread); } #endif /* Calibrate by measuring a short delay. */ reg = intr_disable(); if (tsc_is_invariant) { wrmsr(MSR_MPERF, 0); wrmsr(MSR_APERF, 0); tsc1 = rdtsc(); DELAY(1000); mcnt = rdmsr(MSR_MPERF); acnt = rdmsr(MSR_APERF); tsc2 = rdtsc(); intr_restore(reg); perf = 1000 * acnt / mcnt; *rate = (tsc2 - tsc1) * perf; } else { tsc1 = rdtsc(); DELAY(1000); tsc2 = rdtsc(); intr_restore(reg); *rate = (tsc2 - tsc1) * 1000; } #ifdef SMP if (smp_cpus > 1) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } #endif return (0); } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) halt(); } bool cpu_mwait_usable(void) { return ((cpu_feature2 & CPUID2_MON) != 0 && ((cpu_mon_mwait_flags & (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK)) == (CPUID5_MON_MWAIT_EXT | CPUID5_MWAIT_INTRBREAK))); } void (*cpu_idle_hook)(sbintime_t) = NULL; /* ACPI idle hook. */ static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */ static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */ SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RWTUN, &idle_mwait, 0, "Use MONITOR/MWAIT for short idle"); static void cpu_idle_acpi(sbintime_t sbt) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_SLEEPING; /* See comments in cpu_idle_hlt(). */ disable_intr(); if (sched_runnable()) enable_intr(); else if (cpu_idle_hook) cpu_idle_hook(sbt); else acpi_cpu_c1(); *state = STATE_RUNNING; } static void cpu_idle_hlt(sbintime_t sbt) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_SLEEPING; /* * Since we may be in a critical section from cpu_idle(), if * an interrupt fires during that critical section we may have * a pending preemption. If the CPU halts, then that thread * may not execute until a later interrupt awakens the CPU. * To handle this race, check for a runnable thread after * disabling interrupts and immediately return if one is * found. Also, we must absolutely guarentee that hlt is * the next instruction after sti. This ensures that any * interrupt that fires after the call to disable_intr() will * immediately awaken the CPU from hlt. Finally, please note * that on x86 this works fine because of interrupts enabled only * after the instruction following sti takes place, while IF is set * to 1 immediately, allowing hlt instruction to acknowledge the * interrupt. */ disable_intr(); if (sched_runnable()) enable_intr(); else acpi_cpu_c1(); *state = STATE_RUNNING; } static void cpu_idle_mwait(sbintime_t sbt) { int *state; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_MWAIT; /* See comments in cpu_idle_hlt(). */ disable_intr(); if (sched_runnable()) { enable_intr(); *state = STATE_RUNNING; return; } cpu_monitor(state, 0, 0); if (*state == STATE_MWAIT) __asm __volatile("sti; mwait" : : "a" (MWAIT_C1), "c" (0)); else enable_intr(); *state = STATE_RUNNING; } static void cpu_idle_spin(sbintime_t sbt) { int *state; int i; state = (int *)PCPU_PTR(monitorbuf); *state = STATE_RUNNING; /* * The sched_runnable() call is racy but as long as there is * a loop missing it one time will have just a little impact if any * (and it is much better than missing the check at all). */ for (i = 0; i < 1000; i++) { if (sched_runnable()) return; cpu_spinwait(); } } /* * C1E renders the local APIC timer dead, so we disable it by * reading the Interrupt Pending Message register and clearing * both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27). * * Reference: * "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors" * #32559 revision 3.00+ */ #define MSR_AMDK8_IPM 0xc0010055 #define AMDK8_SMIONCMPHALT (1ULL << 27) #define AMDK8_C1EONCMPHALT (1ULL << 28) #define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT) void cpu_probe_amdc1e(void) { /* * Detect the presence of C1E capability mostly on latest * dual-cores (or future) k8 family. */ if (cpu_vendor_id == CPU_VENDOR_AMD && (cpu_id & 0x00000f00) == 0x00000f00 && (cpu_id & 0x0fff0000) >= 0x00040000) { cpu_ident_amdc1e = 1; } } void (*cpu_idle_fn)(sbintime_t) = cpu_idle_acpi; void cpu_idle(int busy) { uint64_t msr; sbintime_t sbt = -1; CTR2(KTR_SPARE2, "cpu_idle(%d) at %d", busy, curcpu); #ifdef MP_WATCHDOG ap_watchdog(PCPU_GET(cpuid)); #endif /* If we are busy - try to use fast methods. */ if (busy) { if ((cpu_feature2 & CPUID2_MON) && idle_mwait) { cpu_idle_mwait(busy); goto out; } } /* If we have time - switch timers into idle mode. */ if (!busy) { critical_enter(); sbt = cpu_idleclock(); } /* Apply AMD APIC timer C1E workaround. */ if (cpu_ident_amdc1e && cpu_disable_c3_sleep) { msr = rdmsr(MSR_AMDK8_IPM); if (msr & AMDK8_CMPHALT) wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT); } /* Call main idle method. */ cpu_idle_fn(sbt); /* Switch timers back into active mode. */ if (!busy) { cpu_activeclock(); critical_exit(); } out: CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done", busy, curcpu); } int cpu_idle_wakeup(int cpu) { struct pcpu *pcpu; int *state; pcpu = pcpu_find(cpu); state = (int *)pcpu->pc_monitorbuf; /* * This doesn't need to be atomic since missing the race will * simply result in unnecessary IPIs. */ if (*state == STATE_SLEEPING) return (0); if (*state == STATE_MWAIT) *state = STATE_RUNNING; return (1); } /* * Ordered by speed/power consumption. */ struct { void *id_fn; char *id_name; } idle_tbl[] = { { cpu_idle_spin, "spin" }, { cpu_idle_mwait, "mwait" }, { cpu_idle_hlt, "hlt" }, #if !defined(__i386__) { cpu_idle_acpi, "acpi" }, #endif { NULL, NULL } }; static int idle_sysctl_available(SYSCTL_HANDLER_ARGS) { char *avail, *p; int error; int i; avail = malloc(256, M_TEMP, M_WAITOK); p = avail; for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (strstr(idle_tbl[i].id_name, "mwait") && (cpu_feature2 & CPUID2_MON) == 0) continue; #if !defined(__i386__) if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; #endif p += sprintf(p, "%s%s", p != avail ? ", " : "", idle_tbl[i].id_name); } error = sysctl_handle_string(oidp, avail, 0, req); free(avail, M_TEMP); return (error); } SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD, 0, 0, idle_sysctl_available, "A", "list of available idle functions"); static int idle_sysctl(SYSCTL_HANDLER_ARGS) { char buf[16]; int error; char *p; int i; p = "unknown"; for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (idle_tbl[i].id_fn == cpu_idle_fn) { p = idle_tbl[i].id_name; break; } } strncpy(buf, p, sizeof(buf)); error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); for (i = 0; idle_tbl[i].id_name != NULL; i++) { if (strstr(idle_tbl[i].id_name, "mwait") && (cpu_feature2 & CPUID2_MON) == 0) continue; #if !defined(__i386__) if (strcmp(idle_tbl[i].id_name, "acpi") == 0 && cpu_idle_hook == NULL) continue; #endif if (strcmp(idle_tbl[i].id_name, buf)) continue; cpu_idle_fn = idle_tbl[i].id_fn; return (0); } return (EINVAL); } SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, idle_sysctl, "A", "currently selected idle function"); static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RWTUN, &panic_on_nmi, 0, "Panic on NMI"); int nmi_is_broadcast = 1; SYSCTL_INT(_machdep, OID_AUTO, nmi_is_broadcast, CTLFLAG_RWTUN, &nmi_is_broadcast, 0, "Chipset NMI is broadcast"); #ifdef KDB int kdb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RWTUN, &kdb_on_nmi, 0, "Go to KDB on NMI"); #endif #ifdef DEV_ISA void nmi_call_kdb(u_int cpu, u_int type, struct trapframe *frame) { /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(frame->tf_err) == 0) { #ifdef KDB /* * NMI can be hooked up to a pushbutton for debugging. */ if (kdb_on_nmi) { printf("NMI/cpu%d ... going to debugger\n", cpu); kdb_trap(type, 0, frame); } #endif /* KDB */ } else if (panic_on_nmi) { panic("NMI indicates hardware failure"); } } #endif void nmi_handle_intr(u_int type, struct trapframe *frame) { #ifdef DEV_ISA #ifdef SMP if (nmi_is_broadcast) { nmi_call_kdb_smp(type, frame); return; } #endif nmi_call_kdb(PCPU_GET(cpuid), type, frame); #endif } Index: head/sys/x86/x86/identcpu.c =================================================================== --- head/sys/x86/x86/identcpu.c (revision 313153) +++ head/sys/x86/x86/identcpu.c (revision 313154) @@ -1,2420 +1,2414 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * Copyright (c) 1997 KATO Takenori. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Id: machdep.c,v 1.193 1996/06/18 01:22:04 bde Exp */ #include __FBSDID("$FreeBSD$"); #include "opt_cpu.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __i386__ -#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) -#define CPU_ENABLE_SSE -#endif - #define IDENTBLUE_CYRIX486 0 #define IDENTBLUE_IBMCPU 1 #define IDENTBLUE_CYRIXM2 2 static void identifycyrix(void); static void print_transmeta_info(void); #endif static u_int find_cpu_vendor_id(void); static void print_AMD_info(void); static void print_INTEL_info(void); static void print_INTEL_TLB(u_int data); static void print_hypervisor_info(void); static void print_svm_info(void); static void print_via_padlock_info(void); static void print_vmx_info(void); #ifdef __i386__ int cpu; /* Are we 386, 386sx, 486, etc? */ int cpu_class; #endif u_int cpu_feature; /* Feature flags */ u_int cpu_feature2; /* Feature flags */ u_int amd_feature; /* AMD feature flags */ u_int amd_feature2; /* AMD feature flags */ u_int amd_pminfo; /* AMD advanced power management info */ u_int via_feature_rng; /* VIA RNG features */ u_int via_feature_xcrypt; /* VIA ACE features */ u_int cpu_high; /* Highest arg to CPUID */ u_int cpu_exthigh; /* Highest arg to extended CPUID */ u_int cpu_id; /* Stepping ID */ u_int cpu_procinfo; /* HyperThreading Info / Brand Index / CLFUSH */ u_int cpu_procinfo2; /* Multicore info */ char cpu_vendor[20]; /* CPU Origin code */ u_int cpu_vendor_id; /* CPU vendor ID */ -#if defined(__amd64__) || defined(CPU_ENABLE_SSE) u_int cpu_fxsr; /* SSE enabled */ u_int cpu_mxcsr_mask; /* Valid bits in mxcsr */ -#endif u_int cpu_clflush_line_size = 32; u_int cpu_stdext_feature; u_int cpu_stdext_feature2; u_int cpu_max_ext_state_size; u_int cpu_mon_mwait_flags; /* MONITOR/MWAIT flags (CPUID.05H.ECX) */ u_int cpu_mon_min_size; /* MONITOR minimum range size, bytes */ u_int cpu_mon_max_size; /* MONITOR minimum range size, bytes */ u_int cpu_maxphyaddr; /* Max phys addr width in bits */ char machine[] = MACHINE; SYSCTL_UINT(_hw, OID_AUTO, via_feature_rng, CTLFLAG_RD, &via_feature_rng, 0, "VIA RNG feature available in CPU"); SYSCTL_UINT(_hw, OID_AUTO, via_feature_xcrypt, CTLFLAG_RD, &via_feature_xcrypt, 0, "VIA xcrypt feature available in CPU"); #ifdef __amd64__ #ifdef SCTL_MASK32 extern int adaptive_machine_arch; #endif static int sysctl_hw_machine(SYSCTL_HANDLER_ARGS) { #ifdef SCTL_MASK32 static const char machine32[] = "i386"; #endif int error; #ifdef SCTL_MASK32 if ((req->flags & SCTL_MASK32) != 0 && adaptive_machine_arch) error = SYSCTL_OUT(req, machine32, sizeof(machine32)); else #endif error = SYSCTL_OUT(req, machine, sizeof(machine)); return (error); } SYSCTL_PROC(_hw, HW_MACHINE, machine, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_hw_machine, "A", "Machine class"); #else SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, "Machine class"); #endif static char cpu_model[128]; SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD | CTLFLAG_MPSAFE, cpu_model, 0, "Machine model"); static int hw_clockrate; SYSCTL_INT(_hw, OID_AUTO, clockrate, CTLFLAG_RD, &hw_clockrate, 0, "CPU instruction clock rate"); u_int hv_high; char hv_vendor[16]; SYSCTL_STRING(_hw, OID_AUTO, hv_vendor, CTLFLAG_RD | CTLFLAG_MPSAFE, hv_vendor, 0, "Hypervisor vendor"); static eventhandler_tag tsc_post_tag; static char cpu_brand[48]; #ifdef __i386__ #define MAX_BRAND_INDEX 8 static const char *cpu_brandtable[MAX_BRAND_INDEX + 1] = { NULL, /* No brand */ "Intel Celeron", "Intel Pentium III", "Intel Pentium III Xeon", NULL, NULL, NULL, NULL, "Intel Pentium 4" }; static struct { char *cpu_name; int cpu_class; } cpus[] = { { "Intel 80286", CPUCLASS_286 }, /* CPU_286 */ { "i386SX", CPUCLASS_386 }, /* CPU_386SX */ { "i386DX", CPUCLASS_386 }, /* CPU_386 */ { "i486SX", CPUCLASS_486 }, /* CPU_486SX */ { "i486DX", CPUCLASS_486 }, /* CPU_486 */ { "Pentium", CPUCLASS_586 }, /* CPU_586 */ { "Cyrix 486", CPUCLASS_486 }, /* CPU_486DLC */ { "Pentium Pro", CPUCLASS_686 }, /* CPU_686 */ { "Cyrix 5x86", CPUCLASS_486 }, /* CPU_M1SC */ { "Cyrix 6x86", CPUCLASS_486 }, /* CPU_M1 */ { "Blue Lightning", CPUCLASS_486 }, /* CPU_BLUE */ { "Cyrix 6x86MX", CPUCLASS_686 }, /* CPU_M2 */ { "NexGen 586", CPUCLASS_386 }, /* CPU_NX586 (XXX) */ { "Cyrix 486S/DX", CPUCLASS_486 }, /* CPU_CY486DX */ { "Pentium II", CPUCLASS_686 }, /* CPU_PII */ { "Pentium III", CPUCLASS_686 }, /* CPU_PIII */ { "Pentium 4", CPUCLASS_686 }, /* CPU_P4 */ }; #endif static struct { char *vendor; u_int vendor_id; } cpu_vendors[] = { { INTEL_VENDOR_ID, CPU_VENDOR_INTEL }, /* GenuineIntel */ { AMD_VENDOR_ID, CPU_VENDOR_AMD }, /* AuthenticAMD */ { CENTAUR_VENDOR_ID, CPU_VENDOR_CENTAUR }, /* CentaurHauls */ #ifdef __i386__ { NSC_VENDOR_ID, CPU_VENDOR_NSC }, /* Geode by NSC */ { CYRIX_VENDOR_ID, CPU_VENDOR_CYRIX }, /* CyrixInstead */ { TRANSMETA_VENDOR_ID, CPU_VENDOR_TRANSMETA }, /* GenuineTMx86 */ { SIS_VENDOR_ID, CPU_VENDOR_SIS }, /* SiS SiS SiS */ { UMC_VENDOR_ID, CPU_VENDOR_UMC }, /* UMC UMC UMC */ { NEXGEN_VENDOR_ID, CPU_VENDOR_NEXGEN }, /* NexGenDriven */ { RISE_VENDOR_ID, CPU_VENDOR_RISE }, /* RiseRiseRise */ #if 0 /* XXX CPUID 8000_0000h and 8086_0000h, not 0000_0000h */ { "TransmetaCPU", CPU_VENDOR_TRANSMETA }, #endif #endif }; void printcpuinfo(void) { u_int regs[4], i; char *brand; printf("CPU: "); #ifdef __i386__ cpu_class = cpus[cpu].cpu_class; strncpy(cpu_model, cpus[cpu].cpu_name, sizeof (cpu_model)); #else strncpy(cpu_model, "Hammer", sizeof (cpu_model)); #endif /* Check for extended CPUID information and a processor name. */ if (cpu_exthigh >= 0x80000004) { brand = cpu_brand; for (i = 0x80000002; i < 0x80000005; i++) { do_cpuid(i, regs); memcpy(brand, regs, sizeof(regs)); brand += sizeof(regs); } } switch (cpu_vendor_id) { case CPU_VENDOR_INTEL: #ifdef __i386__ if ((cpu_id & 0xf00) > 0x300) { u_int brand_index; cpu_model[0] = '\0'; switch (cpu_id & 0x3000) { case 0x1000: strcpy(cpu_model, "Overdrive "); break; case 0x2000: strcpy(cpu_model, "Dual "); break; } switch (cpu_id & 0xf00) { case 0x400: strcat(cpu_model, "i486 "); /* Check the particular flavor of 486 */ switch (cpu_id & 0xf0) { case 0x00: case 0x10: strcat(cpu_model, "DX"); break; case 0x20: strcat(cpu_model, "SX"); break; case 0x30: strcat(cpu_model, "DX2"); break; case 0x40: strcat(cpu_model, "SL"); break; case 0x50: strcat(cpu_model, "SX2"); break; case 0x70: strcat(cpu_model, "DX2 Write-Back Enhanced"); break; case 0x80: strcat(cpu_model, "DX4"); break; } break; case 0x500: /* Check the particular flavor of 586 */ strcat(cpu_model, "Pentium"); switch (cpu_id & 0xf0) { case 0x00: strcat(cpu_model, " A-step"); break; case 0x10: strcat(cpu_model, "/P5"); break; case 0x20: strcat(cpu_model, "/P54C"); break; case 0x30: strcat(cpu_model, "/P24T"); break; case 0x40: strcat(cpu_model, "/P55C"); break; case 0x70: strcat(cpu_model, "/P54C"); break; case 0x80: strcat(cpu_model, "/P55C (quarter-micron)"); break; default: /* nothing */ break; } #if defined(I586_CPU) && !defined(NO_F00F_HACK) /* * XXX - If/when Intel fixes the bug, this * should also check the version of the * CPU, not just that it's a Pentium. */ has_f00f_bug = 1; #endif break; case 0x600: /* Check the particular flavor of 686 */ switch (cpu_id & 0xf0) { case 0x00: strcat(cpu_model, "Pentium Pro A-step"); break; case 0x10: strcat(cpu_model, "Pentium Pro"); break; case 0x30: case 0x50: case 0x60: strcat(cpu_model, "Pentium II/Pentium II Xeon/Celeron"); cpu = CPU_PII; break; case 0x70: case 0x80: case 0xa0: case 0xb0: strcat(cpu_model, "Pentium III/Pentium III Xeon/Celeron"); cpu = CPU_PIII; break; default: strcat(cpu_model, "Unknown 80686"); break; } break; case 0xf00: strcat(cpu_model, "Pentium 4"); cpu = CPU_P4; break; default: strcat(cpu_model, "unknown"); break; } /* * If we didn't get a brand name from the extended * CPUID, try to look it up in the brand table. */ if (cpu_high > 0 && *cpu_brand == '\0') { brand_index = cpu_procinfo & CPUID_BRAND_INDEX; if (brand_index <= MAX_BRAND_INDEX && cpu_brandtable[brand_index] != NULL) strcpy(cpu_brand, cpu_brandtable[brand_index]); } } #else /* Please make up your mind folks! */ strcat(cpu_model, "EM64T"); #endif break; case CPU_VENDOR_AMD: /* * Values taken from AMD Processor Recognition * http://www.amd.com/K6/k6docs/pdf/20734g.pdf * (also describes ``Features'' encodings. */ strcpy(cpu_model, "AMD "); #ifdef __i386__ switch (cpu_id & 0xFF0) { case 0x410: strcat(cpu_model, "Standard Am486DX"); break; case 0x430: strcat(cpu_model, "Enhanced Am486DX2 Write-Through"); break; case 0x470: strcat(cpu_model, "Enhanced Am486DX2 Write-Back"); break; case 0x480: strcat(cpu_model, "Enhanced Am486DX4/Am5x86 Write-Through"); break; case 0x490: strcat(cpu_model, "Enhanced Am486DX4/Am5x86 Write-Back"); break; case 0x4E0: strcat(cpu_model, "Am5x86 Write-Through"); break; case 0x4F0: strcat(cpu_model, "Am5x86 Write-Back"); break; case 0x500: strcat(cpu_model, "K5 model 0"); break; case 0x510: strcat(cpu_model, "K5 model 1"); break; case 0x520: strcat(cpu_model, "K5 PR166 (model 2)"); break; case 0x530: strcat(cpu_model, "K5 PR200 (model 3)"); break; case 0x560: strcat(cpu_model, "K6"); break; case 0x570: strcat(cpu_model, "K6 266 (model 1)"); break; case 0x580: strcat(cpu_model, "K6-2"); break; case 0x590: strcat(cpu_model, "K6-III"); break; case 0x5a0: strcat(cpu_model, "Geode LX"); break; default: strcat(cpu_model, "Unknown"); break; } #else if ((cpu_id & 0xf00) == 0xf00) strcat(cpu_model, "AMD64 Processor"); else strcat(cpu_model, "Unknown"); #endif break; #ifdef __i386__ case CPU_VENDOR_CYRIX: strcpy(cpu_model, "Cyrix "); switch (cpu_id & 0xff0) { case 0x440: strcat(cpu_model, "MediaGX"); break; case 0x520: strcat(cpu_model, "6x86"); break; case 0x540: cpu_class = CPUCLASS_586; strcat(cpu_model, "GXm"); break; case 0x600: strcat(cpu_model, "6x86MX"); break; default: /* * Even though CPU supports the cpuid * instruction, it can be disabled. * Therefore, this routine supports all Cyrix * CPUs. */ switch (cyrix_did & 0xf0) { case 0x00: switch (cyrix_did & 0x0f) { case 0x00: strcat(cpu_model, "486SLC"); break; case 0x01: strcat(cpu_model, "486DLC"); break; case 0x02: strcat(cpu_model, "486SLC2"); break; case 0x03: strcat(cpu_model, "486DLC2"); break; case 0x04: strcat(cpu_model, "486SRx"); break; case 0x05: strcat(cpu_model, "486DRx"); break; case 0x06: strcat(cpu_model, "486SRx2"); break; case 0x07: strcat(cpu_model, "486DRx2"); break; case 0x08: strcat(cpu_model, "486SRu"); break; case 0x09: strcat(cpu_model, "486DRu"); break; case 0x0a: strcat(cpu_model, "486SRu2"); break; case 0x0b: strcat(cpu_model, "486DRu2"); break; default: strcat(cpu_model, "Unknown"); break; } break; case 0x10: switch (cyrix_did & 0x0f) { case 0x00: strcat(cpu_model, "486S"); break; case 0x01: strcat(cpu_model, "486S2"); break; case 0x02: strcat(cpu_model, "486Se"); break; case 0x03: strcat(cpu_model, "486S2e"); break; case 0x0a: strcat(cpu_model, "486DX"); break; case 0x0b: strcat(cpu_model, "486DX2"); break; case 0x0f: strcat(cpu_model, "486DX4"); break; default: strcat(cpu_model, "Unknown"); break; } break; case 0x20: if ((cyrix_did & 0x0f) < 8) strcat(cpu_model, "6x86"); /* Where did you get it? */ else strcat(cpu_model, "5x86"); break; case 0x30: strcat(cpu_model, "6x86"); break; case 0x40: if ((cyrix_did & 0xf000) == 0x3000) { cpu_class = CPUCLASS_586; strcat(cpu_model, "GXm"); } else strcat(cpu_model, "MediaGX"); break; case 0x50: strcat(cpu_model, "6x86MX"); break; case 0xf0: switch (cyrix_did & 0x0f) { case 0x0d: strcat(cpu_model, "Overdrive CPU"); break; case 0x0e: strcpy(cpu_model, "Texas Instruments 486SXL"); break; case 0x0f: strcat(cpu_model, "486SLC/DLC"); break; default: strcat(cpu_model, "Unknown"); break; } break; default: strcat(cpu_model, "Unknown"); break; } break; } break; case CPU_VENDOR_RISE: strcpy(cpu_model, "Rise "); switch (cpu_id & 0xff0) { case 0x500: /* 6401 and 6441 (Kirin) */ case 0x520: /* 6510 (Lynx) */ strcat(cpu_model, "mP6"); break; default: strcat(cpu_model, "Unknown"); } break; #endif case CPU_VENDOR_CENTAUR: #ifdef __i386__ switch (cpu_id & 0xff0) { case 0x540: strcpy(cpu_model, "IDT WinChip C6"); break; case 0x580: strcpy(cpu_model, "IDT WinChip 2"); break; case 0x590: strcpy(cpu_model, "IDT WinChip 3"); break; case 0x660: strcpy(cpu_model, "VIA C3 Samuel"); break; case 0x670: if (cpu_id & 0x8) strcpy(cpu_model, "VIA C3 Ezra"); else strcpy(cpu_model, "VIA C3 Samuel 2"); break; case 0x680: strcpy(cpu_model, "VIA C3 Ezra-T"); break; case 0x690: strcpy(cpu_model, "VIA C3 Nehemiah"); break; case 0x6a0: case 0x6d0: strcpy(cpu_model, "VIA C7 Esther"); break; case 0x6f0: strcpy(cpu_model, "VIA Nano"); break; default: strcpy(cpu_model, "VIA/IDT Unknown"); } #else strcpy(cpu_model, "VIA "); if ((cpu_id & 0xff0) == 0x6f0) strcat(cpu_model, "Nano Processor"); else strcat(cpu_model, "Unknown"); #endif break; #ifdef __i386__ case CPU_VENDOR_IBM: strcpy(cpu_model, "Blue Lightning CPU"); break; case CPU_VENDOR_NSC: switch (cpu_id & 0xff0) { case 0x540: strcpy(cpu_model, "Geode SC1100"); cpu = CPU_GEODE1100; break; default: strcpy(cpu_model, "Geode/NSC unknown"); break; } break; #endif default: strcat(cpu_model, "Unknown"); break; } /* * Replace cpu_model with cpu_brand minus leading spaces if * we have one. */ brand = cpu_brand; while (*brand == ' ') ++brand; if (*brand != '\0') strcpy(cpu_model, brand); printf("%s (", cpu_model); if (tsc_freq != 0) { hw_clockrate = (tsc_freq + 5000) / 1000000; printf("%jd.%02d-MHz ", (intmax_t)(tsc_freq + 4999) / 1000000, (u_int)((tsc_freq + 4999) / 10000) % 100); } #ifdef __i386__ switch(cpu_class) { case CPUCLASS_286: printf("286"); break; case CPUCLASS_386: printf("386"); break; #if defined(I486_CPU) case CPUCLASS_486: printf("486"); break; #endif #if defined(I586_CPU) case CPUCLASS_586: printf("586"); break; #endif #if defined(I686_CPU) case CPUCLASS_686: printf("686"); break; #endif default: printf("Unknown"); /* will panic below... */ } #else printf("K8"); #endif printf("-class CPU)\n"); if (*cpu_vendor) printf(" Origin=\"%s\"", cpu_vendor); if (cpu_id) printf(" Id=0x%x", cpu_id); if (cpu_vendor_id == CPU_VENDOR_INTEL || cpu_vendor_id == CPU_VENDOR_AMD || cpu_vendor_id == CPU_VENDOR_CENTAUR || #ifdef __i386__ cpu_vendor_id == CPU_VENDOR_TRANSMETA || cpu_vendor_id == CPU_VENDOR_RISE || cpu_vendor_id == CPU_VENDOR_NSC || (cpu_vendor_id == CPU_VENDOR_CYRIX && ((cpu_id & 0xf00) > 0x500)) || #endif 0) { printf(" Family=0x%x", CPUID_TO_FAMILY(cpu_id)); printf(" Model=0x%x", CPUID_TO_MODEL(cpu_id)); printf(" Stepping=%u", cpu_id & CPUID_STEPPING); #ifdef __i386__ if (cpu_vendor_id == CPU_VENDOR_CYRIX) printf("\n DIR=0x%04x", cyrix_did); #endif /* * AMD CPUID Specification * http://support.amd.com/us/Embedded_TechDocs/25481.pdf * * Intel Processor Identification and CPUID Instruction * http://www.intel.com/assets/pdf/appnote/241618.pdf */ if (cpu_high > 0) { /* * Here we should probably set up flags indicating * whether or not various features are available. * The interesting ones are probably VME, PSE, PAE, * and PGE. The code already assumes without bothering * to check that all CPUs >= Pentium have a TSC and * MSRs. */ printf("\n Features=0x%b", cpu_feature, "\020" "\001FPU" /* Integral FPU */ "\002VME" /* Extended VM86 mode support */ "\003DE" /* Debugging Extensions (CR4.DE) */ "\004PSE" /* 4MByte page tables */ "\005TSC" /* Timestamp counter */ "\006MSR" /* Machine specific registers */ "\007PAE" /* Physical address extension */ "\010MCE" /* Machine Check support */ "\011CX8" /* CMPEXCH8 instruction */ "\012APIC" /* SMP local APIC */ "\013oldMTRR" /* Previous implementation of MTRR */ "\014SEP" /* Fast System Call */ "\015MTRR" /* Memory Type Range Registers */ "\016PGE" /* PG_G (global bit) support */ "\017MCA" /* Machine Check Architecture */ "\020CMOV" /* CMOV instruction */ "\021PAT" /* Page attributes table */ "\022PSE36" /* 36 bit address space support */ "\023PN" /* Processor Serial number */ "\024CLFLUSH" /* Has the CLFLUSH instruction */ "\025" "\026DTS" /* Debug Trace Store */ "\027ACPI" /* ACPI support */ "\030MMX" /* MMX instructions */ "\031FXSR" /* FXSAVE/FXRSTOR */ "\032SSE" /* Streaming SIMD Extensions */ "\033SSE2" /* Streaming SIMD Extensions #2 */ "\034SS" /* Self snoop */ "\035HTT" /* Hyperthreading (see EBX bit 16-23) */ "\036TM" /* Thermal Monitor clock slowdown */ "\037IA64" /* CPU can execute IA64 instructions */ "\040PBE" /* Pending Break Enable */ ); if (cpu_feature2 != 0) { printf("\n Features2=0x%b", cpu_feature2, "\020" "\001SSE3" /* SSE3 */ "\002PCLMULQDQ" /* Carry-Less Mul Quadword */ "\003DTES64" /* 64-bit Debug Trace */ "\004MON" /* MONITOR/MWAIT Instructions */ "\005DS_CPL" /* CPL Qualified Debug Store */ "\006VMX" /* Virtual Machine Extensions */ "\007SMX" /* Safer Mode Extensions */ "\010EST" /* Enhanced SpeedStep */ "\011TM2" /* Thermal Monitor 2 */ "\012SSSE3" /* SSSE3 */ "\013CNXT-ID" /* L1 context ID available */ "\014SDBG" /* IA32 silicon debug */ "\015FMA" /* Fused Multiply Add */ "\016CX16" /* CMPXCHG16B Instruction */ "\017xTPR" /* Send Task Priority Messages*/ "\020PDCM" /* Perf/Debug Capability MSR */ "\021" "\022PCID" /* Process-context Identifiers*/ "\023DCA" /* Direct Cache Access */ "\024SSE4.1" /* SSE 4.1 */ "\025SSE4.2" /* SSE 4.2 */ "\026x2APIC" /* xAPIC Extensions */ "\027MOVBE" /* MOVBE Instruction */ "\030POPCNT" /* POPCNT Instruction */ "\031TSCDLT" /* TSC-Deadline Timer */ "\032AESNI" /* AES Crypto */ "\033XSAVE" /* XSAVE/XRSTOR States */ "\034OSXSAVE" /* OS-Enabled State Management*/ "\035AVX" /* Advanced Vector Extensions */ "\036F16C" /* Half-precision conversions */ "\037RDRAND" /* RDRAND Instruction */ "\040HV" /* Hypervisor */ ); } if (amd_feature != 0) { printf("\n AMD Features=0x%b", amd_feature, "\020" /* in hex */ "\001" /* Same */ "\002" /* Same */ "\003" /* Same */ "\004" /* Same */ "\005" /* Same */ "\006" /* Same */ "\007" /* Same */ "\010" /* Same */ "\011" /* Same */ "\012" /* Same */ "\013" /* Undefined */ "\014SYSCALL" /* Have SYSCALL/SYSRET */ "\015" /* Same */ "\016" /* Same */ "\017" /* Same */ "\020" /* Same */ "\021" /* Same */ "\022" /* Same */ "\023" /* Reserved, unknown */ "\024MP" /* Multiprocessor Capable */ "\025NX" /* Has EFER.NXE, NX */ "\026" /* Undefined */ "\027MMX+" /* AMD MMX Extensions */ "\030" /* Same */ "\031" /* Same */ "\032FFXSR" /* Fast FXSAVE/FXRSTOR */ "\033Page1GB" /* 1-GB large page support */ "\034RDTSCP" /* RDTSCP */ "\035" /* Undefined */ "\036LM" /* 64 bit long mode */ "\0373DNow!+" /* AMD 3DNow! Extensions */ "\0403DNow!" /* AMD 3DNow! */ ); } if (amd_feature2 != 0) { printf("\n AMD Features2=0x%b", amd_feature2, "\020" "\001LAHF" /* LAHF/SAHF in long mode */ "\002CMP" /* CMP legacy */ "\003SVM" /* Secure Virtual Mode */ "\004ExtAPIC" /* Extended APIC register */ "\005CR8" /* CR8 in legacy mode */ "\006ABM" /* LZCNT instruction */ "\007SSE4A" /* SSE4A */ "\010MAS" /* Misaligned SSE mode */ "\011Prefetch" /* 3DNow! Prefetch/PrefetchW */ "\012OSVW" /* OS visible workaround */ "\013IBS" /* Instruction based sampling */ "\014XOP" /* XOP extended instructions */ "\015SKINIT" /* SKINIT/STGI */ "\016WDT" /* Watchdog timer */ "\017" "\020LWP" /* Lightweight Profiling */ "\021FMA4" /* 4-operand FMA instructions */ "\022TCE" /* Translation Cache Extension */ "\023" "\024NodeId" /* NodeId MSR support */ "\025" "\026TBM" /* Trailing Bit Manipulation */ "\027Topology" /* Topology Extensions */ "\030PCXC" /* Core perf count */ "\031PNXC" /* NB perf count */ "\032" "\033DBE" /* Data Breakpoint extension */ "\034PTSC" /* Performance TSC */ "\035PL2I" /* L2I perf count */ "\036" "\037" "\040" ); } if (cpu_stdext_feature != 0) { printf("\n Structured Extended Features=0x%b", cpu_stdext_feature, "\020" /* RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE */ "\001FSGSBASE" "\002TSCADJ" "\003SGX" /* Bit Manipulation Instructions */ "\004BMI1" /* Hardware Lock Elision */ "\005HLE" /* Advanced Vector Instructions 2 */ "\006AVX2" /* FDP_EXCPTN_ONLY */ "\007FDPEXC" /* Supervisor Mode Execution Prot. */ "\010SMEP" /* Bit Manipulation Instructions */ "\011BMI2" "\012ERMS" /* Invalidate Processor Context ID */ "\013INVPCID" /* Restricted Transactional Memory */ "\014RTM" "\015PQM" "\016NFPUSG" /* Intel Memory Protection Extensions */ "\017MPX" "\020PQE" /* AVX512 Foundation */ "\021AVX512F" "\022AVX512DQ" /* Enhanced NRBG */ "\023RDSEED" /* ADCX + ADOX */ "\024ADX" /* Supervisor Mode Access Prevention */ "\025SMAP" "\026AVX512IFMA" "\027PCOMMIT" "\030CLFLUSHOPT" "\031CLWB" "\032PROCTRACE" "\033AVX512PF" "\034AVX512ER" "\035AVX512CD" "\036SHA" "\037AVX512BW" ); } if (cpu_stdext_feature2 != 0) { printf("\n Structured Extended Features2=0x%b", cpu_stdext_feature2, "\020" "\001PREFETCHWT1" "\002AVX512VBMI" "\003UMIP" "\004PKU" "\005OSPKE" "\027RDPID" "\037SGXLC" ); } if ((cpu_feature2 & CPUID2_XSAVE) != 0) { cpuid_count(0xd, 0x1, regs); if (regs[0] != 0) { printf("\n XSAVE Features=0x%b", regs[0], "\020" "\001XSAVEOPT" "\002XSAVEC" "\003XINUSE" "\004XSAVES"); } } if (via_feature_rng != 0 || via_feature_xcrypt != 0) print_via_padlock_info(); if (cpu_feature2 & CPUID2_VMX) print_vmx_info(); if (amd_feature2 & AMDID2_SVM) print_svm_info(); if ((cpu_feature & CPUID_HTT) && cpu_vendor_id == CPU_VENDOR_AMD) cpu_feature &= ~CPUID_HTT; /* * If this CPU supports P-state invariant TSC then * mention the capability. */ if (tsc_is_invariant) { printf("\n TSC: P-state invariant"); if (tsc_perf_stat) printf(", performance statistics"); } } #ifdef __i386__ } else if (cpu_vendor_id == CPU_VENDOR_CYRIX) { printf(" DIR=0x%04x", cyrix_did); printf(" Stepping=%u", (cyrix_did & 0xf000) >> 12); printf(" Revision=%u", (cyrix_did & 0x0f00) >> 8); #ifndef CYRIX_CACHE_REALLY_WORKS if (cpu == CPU_M1 && (cyrix_did & 0xff00) < 0x1700) printf("\n CPU cache: write-through mode"); #endif #endif } /* Avoid ugly blank lines: only print newline when we have to. */ if (*cpu_vendor || cpu_id) printf("\n"); if (bootverbose) { if (cpu_vendor_id == CPU_VENDOR_AMD) print_AMD_info(); else if (cpu_vendor_id == CPU_VENDOR_INTEL) print_INTEL_info(); #ifdef __i386__ else if (cpu_vendor_id == CPU_VENDOR_TRANSMETA) print_transmeta_info(); #endif } print_hypervisor_info(); } #ifdef __i386__ void panicifcpuunsupported(void) { #if !defined(lint) #if !defined(I486_CPU) && !defined(I586_CPU) && !defined(I686_CPU) #error This kernel is not configured for one of the supported CPUs #endif #else /* lint */ #endif /* lint */ /* * Now that we have told the user what they have, * let them know if that machine type isn't configured. */ switch (cpu_class) { case CPUCLASS_286: /* a 286 should not make it this far, anyway */ case CPUCLASS_386: #if !defined(I486_CPU) case CPUCLASS_486: #endif #if !defined(I586_CPU) case CPUCLASS_586: #endif #if !defined(I686_CPU) case CPUCLASS_686: #endif panic("CPU class not configured"); default: break; } } static volatile u_int trap_by_rdmsr; /* * Special exception 6 handler. * The rdmsr instruction generates invalid opcodes fault on 486-class * Cyrix CPU. Stacked eip register points the rdmsr instruction in the * function identblue() when this handler is called. Stacked eip should * be advanced. */ inthand_t bluetrap6; #ifdef __GNUCLIKE_ASM __asm (" \n\ .text \n\ .p2align 2,0x90 \n\ .type " __XSTRING(CNAME(bluetrap6)) ",@function \n\ " __XSTRING(CNAME(bluetrap6)) ": \n\ ss \n\ movl $0xa8c1d," __XSTRING(CNAME(trap_by_rdmsr)) " \n\ addl $2, (%esp) /* rdmsr is a 2-byte instruction */ \n\ iret \n\ "); #endif /* * Special exception 13 handler. * Accessing non-existent MSR generates general protection fault. */ inthand_t bluetrap13; #ifdef __GNUCLIKE_ASM __asm (" \n\ .text \n\ .p2align 2,0x90 \n\ .type " __XSTRING(CNAME(bluetrap13)) ",@function \n\ " __XSTRING(CNAME(bluetrap13)) ": \n\ ss \n\ movl $0xa89c4," __XSTRING(CNAME(trap_by_rdmsr)) " \n\ popl %eax /* discard error code */ \n\ addl $2, (%esp) /* rdmsr is a 2-byte instruction */ \n\ iret \n\ "); #endif /* * Distinguish IBM Blue Lightning CPU from Cyrix CPUs that does not * support cpuid instruction. This function should be called after * loading interrupt descriptor table register. * * I don't like this method that handles fault, but I couldn't get * information for any other methods. Does blue giant know? */ static int identblue(void) { trap_by_rdmsr = 0; /* * Cyrix 486-class CPU does not support rdmsr instruction. * The rdmsr instruction generates invalid opcode fault, and exception * will be trapped by bluetrap6() on Cyrix 486-class CPU. The * bluetrap6() set the magic number to trap_by_rdmsr. */ setidt(IDT_UD, bluetrap6, SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* * Certain BIOS disables cpuid instruction of Cyrix 6x86MX CPU. * In this case, rdmsr generates general protection fault, and * exception will be trapped by bluetrap13(). */ setidt(IDT_GP, bluetrap13, SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); rdmsr(0x1002); /* Cyrix CPU generates fault. */ if (trap_by_rdmsr == 0xa8c1d) return IDENTBLUE_CYRIX486; else if (trap_by_rdmsr == 0xa89c4) return IDENTBLUE_CYRIXM2; return IDENTBLUE_IBMCPU; } /* * identifycyrix() set lower 16 bits of cyrix_did as follows: * * F E D C B A 9 8 7 6 5 4 3 2 1 0 * +-------+-------+---------------+ * | SID | RID | Device ID | * | (DIR 1) | (DIR 0) | * +-------+-------+---------------+ */ static void identifycyrix(void) { register_t saveintr; int ccr2_test = 0, dir_test = 0; u_char ccr2, ccr3; saveintr = intr_disable(); ccr2 = read_cyrix_reg(CCR2); write_cyrix_reg(CCR2, ccr2 ^ CCR2_LOCK_NW); read_cyrix_reg(CCR2); if (read_cyrix_reg(CCR2) != ccr2) ccr2_test = 1; write_cyrix_reg(CCR2, ccr2); ccr3 = read_cyrix_reg(CCR3); write_cyrix_reg(CCR3, ccr3 ^ CCR3_MAPEN3); read_cyrix_reg(CCR3); if (read_cyrix_reg(CCR3) != ccr3) dir_test = 1; /* CPU supports DIRs. */ write_cyrix_reg(CCR3, ccr3); if (dir_test) { /* Device ID registers are available. */ cyrix_did = read_cyrix_reg(DIR1) << 8; cyrix_did += read_cyrix_reg(DIR0); } else if (ccr2_test) cyrix_did = 0x0010; /* 486S A-step */ else cyrix_did = 0x00ff; /* Old 486SLC/DLC and TI486SXLC/SXL */ intr_restore(saveintr); } #endif /* Update TSC freq with the value indicated by the caller. */ static void tsc_freq_changed(void *arg __unused, const struct cf_level *level, int status) { /* If there was an error during the transition, don't do anything. */ if (status != 0) return; /* Total setting for this level gives the new frequency in MHz. */ hw_clockrate = level->total_set.freq; } static void hook_tsc_freq(void *arg __unused) { if (tsc_is_invariant) return; tsc_post_tag = EVENTHANDLER_REGISTER(cpufreq_post_change, tsc_freq_changed, NULL, EVENTHANDLER_PRI_ANY); } SYSINIT(hook_tsc_freq, SI_SUB_CONFIGURE, SI_ORDER_ANY, hook_tsc_freq, NULL); static const char *const vm_bnames[] = { "QEMU", /* QEMU */ "Plex86", /* Plex86 */ "Bochs", /* Bochs */ "Xen", /* Xen */ "BHYVE", /* bhyve */ "Seabios", /* KVM */ NULL }; static const char *const vm_pnames[] = { "VMware Virtual Platform", /* VMWare VM */ "Virtual Machine", /* Microsoft VirtualPC */ "VirtualBox", /* Sun xVM VirtualBox */ "Parallels Virtual Platform", /* Parallels VM */ "KVM", /* KVM */ NULL }; static void identify_hypervisor(void) { u_int regs[4]; char *p; int i; /* * [RFC] CPUID usage for interaction between Hypervisors and Linux. * http://lkml.org/lkml/2008/10/1/246 * * KB1009458: Mechanisms to determine if software is running in * a VMware virtual machine * http://kb.vmware.com/kb/1009458 */ if (cpu_feature2 & CPUID2_HV) { vm_guest = VM_GUEST_VM; do_cpuid(0x40000000, regs); if (regs[0] >= 0x40000000) { hv_high = regs[0]; ((u_int *)&hv_vendor)[0] = regs[1]; ((u_int *)&hv_vendor)[1] = regs[2]; ((u_int *)&hv_vendor)[2] = regs[3]; hv_vendor[12] = '\0'; if (strcmp(hv_vendor, "VMwareVMware") == 0) vm_guest = VM_GUEST_VMWARE; else if (strcmp(hv_vendor, "Microsoft Hv") == 0) vm_guest = VM_GUEST_HV; else if (strcmp(hv_vendor, "KVMKVMKVM") == 0) vm_guest = VM_GUEST_KVM; } return; } /* * Examine SMBIOS strings for older hypervisors. */ p = kern_getenv("smbios.system.serial"); if (p != NULL) { if (strncmp(p, "VMware-", 7) == 0 || strncmp(p, "VMW", 3) == 0) { vmware_hvcall(VMW_HVCMD_GETVERSION, regs); if (regs[1] == VMW_HVMAGIC) { vm_guest = VM_GUEST_VMWARE; freeenv(p); return; } } freeenv(p); } /* * XXX: Some of these entries may not be needed since they were * added to FreeBSD before the checks above. */ p = kern_getenv("smbios.bios.vendor"); if (p != NULL) { for (i = 0; vm_bnames[i] != NULL; i++) if (strcmp(p, vm_bnames[i]) == 0) { vm_guest = VM_GUEST_VM; freeenv(p); return; } freeenv(p); } p = kern_getenv("smbios.system.product"); if (p != NULL) { for (i = 0; vm_pnames[i] != NULL; i++) if (strcmp(p, vm_pnames[i]) == 0) { vm_guest = VM_GUEST_VM; freeenv(p); return; } freeenv(p); } } bool fix_cpuid(void) { uint64_t msr; /* * Clear "Limit CPUID Maxval" bit and return true if the caller should * get the largest standard CPUID function number again if it is set * from BIOS. It is necessary for probing correct CPU topology later * and for the correct operation of the AVX-aware userspace. */ if (cpu_vendor_id == CPU_VENDOR_INTEL && ((CPUID_TO_FAMILY(cpu_id) == 0xf && CPUID_TO_MODEL(cpu_id) >= 0x3) || (CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) >= 0xe))) { msr = rdmsr(MSR_IA32_MISC_ENABLE); if ((msr & IA32_MISC_EN_LIMCPUID) != 0) { msr &= ~IA32_MISC_EN_LIMCPUID; wrmsr(MSR_IA32_MISC_ENABLE, msr); return (true); } } /* * Re-enable AMD Topology Extension that could be disabled by BIOS * on some notebook processors. Without the extension it's really * hard to determine the correct CPU cache topology. * See BIOS and Kernel Developer’s Guide (BKDG) for AMD Family 15h * Models 60h-6Fh Processors, Publication # 50742. */ if (cpu_vendor_id == CPU_VENDOR_AMD && CPUID_TO_FAMILY(cpu_id) == 0x15) { msr = rdmsr(MSR_EXTFEATURES); if ((msr & ((uint64_t)1 << 54)) == 0) { msr |= (uint64_t)1 << 54; wrmsr(MSR_EXTFEATURES, msr); return (true); } } return (false); } /* * Final stage of CPU identification. */ #ifdef __i386__ void finishidentcpu(void) #else void identify_cpu(void) #endif { u_int regs[4], cpu_stdext_disable; #ifdef __i386__ u_char ccr3; #endif #ifdef __amd64__ do_cpuid(0, regs); cpu_high = regs[0]; ((u_int *)&cpu_vendor)[0] = regs[1]; ((u_int *)&cpu_vendor)[1] = regs[3]; ((u_int *)&cpu_vendor)[2] = regs[2]; cpu_vendor[12] = '\0'; do_cpuid(1, regs); cpu_id = regs[0]; cpu_procinfo = regs[1]; cpu_feature = regs[3]; cpu_feature2 = regs[2]; #endif identify_hypervisor(); cpu_vendor_id = find_cpu_vendor_id(); if (fix_cpuid()) { do_cpuid(0, regs); cpu_high = regs[0]; } if (cpu_high >= 5 && (cpu_feature2 & CPUID2_MON) != 0) { do_cpuid(5, regs); cpu_mon_mwait_flags = regs[2]; cpu_mon_min_size = regs[0] & CPUID5_MON_MIN_SIZE; cpu_mon_max_size = regs[1] & CPUID5_MON_MAX_SIZE; } if (cpu_high >= 7) { cpuid_count(7, 0, regs); cpu_stdext_feature = regs[1]; /* * Some hypervisors fail to filter out unsupported * extended features. For now, disable the * extensions, activation of which requires setting a * bit in CR4, and which VM monitors do not support. */ if (cpu_feature2 & CPUID2_HV) { cpu_stdext_disable = CPUID_STDEXT_FSGSBASE | CPUID_STDEXT_SMEP; } else cpu_stdext_disable = 0; TUNABLE_INT_FETCH("hw.cpu_stdext_disable", &cpu_stdext_disable); cpu_stdext_feature &= ~cpu_stdext_disable; cpu_stdext_feature2 = regs[2]; } #ifdef __i386__ if (cpu_high > 0 && (cpu_vendor_id == CPU_VENDOR_INTEL || cpu_vendor_id == CPU_VENDOR_AMD || cpu_vendor_id == CPU_VENDOR_TRANSMETA || cpu_vendor_id == CPU_VENDOR_CENTAUR || cpu_vendor_id == CPU_VENDOR_NSC)) { do_cpuid(0x80000000, regs); if (regs[0] >= 0x80000000) cpu_exthigh = regs[0]; } #else if (cpu_vendor_id == CPU_VENDOR_INTEL || cpu_vendor_id == CPU_VENDOR_AMD || cpu_vendor_id == CPU_VENDOR_CENTAUR) { do_cpuid(0x80000000, regs); cpu_exthigh = regs[0]; } #endif if (cpu_exthigh >= 0x80000001) { do_cpuid(0x80000001, regs); amd_feature = regs[3] & ~(cpu_feature & 0x0183f3ff); amd_feature2 = regs[2]; } if (cpu_exthigh >= 0x80000007) { do_cpuid(0x80000007, regs); amd_pminfo = regs[3]; } if (cpu_exthigh >= 0x80000008) { do_cpuid(0x80000008, regs); cpu_maxphyaddr = regs[0] & 0xff; cpu_procinfo2 = regs[2]; } else { cpu_maxphyaddr = (cpu_feature & CPUID_PAE) != 0 ? 36 : 32; } #ifdef __i386__ if (cpu_vendor_id == CPU_VENDOR_CYRIX) { if (cpu == CPU_486) { /* * These conditions are equivalent to: * - CPU does not support cpuid instruction. * - Cyrix/IBM CPU is detected. */ if (identblue() == IDENTBLUE_IBMCPU) { strcpy(cpu_vendor, "IBM"); cpu_vendor_id = CPU_VENDOR_IBM; cpu = CPU_BLUE; return; } } switch (cpu_id & 0xf00) { case 0x600: /* * Cyrix's datasheet does not describe DIRs. * Therefor, I assume it does not have them * and use the result of the cpuid instruction. * XXX they seem to have it for now at least. -Peter */ identifycyrix(); cpu = CPU_M2; break; default: identifycyrix(); /* * This routine contains a trick. * Don't check (cpu_id & 0x00f0) == 0x50 to detect M2, now. */ switch (cyrix_did & 0x00f0) { case 0x00: case 0xf0: cpu = CPU_486DLC; break; case 0x10: cpu = CPU_CY486DX; break; case 0x20: if ((cyrix_did & 0x000f) < 8) cpu = CPU_M1; else cpu = CPU_M1SC; break; case 0x30: cpu = CPU_M1; break; case 0x40: /* MediaGX CPU */ cpu = CPU_M1SC; break; default: /* M2 and later CPUs are treated as M2. */ cpu = CPU_M2; /* * enable cpuid instruction. */ ccr3 = read_cyrix_reg(CCR3); write_cyrix_reg(CCR3, CCR3_MAPEN0); write_cyrix_reg(CCR4, read_cyrix_reg(CCR4) | CCR4_CPUID); write_cyrix_reg(CCR3, ccr3); do_cpuid(0, regs); cpu_high = regs[0]; /* eax */ do_cpuid(1, regs); cpu_id = regs[0]; /* eax */ cpu_feature = regs[3]; /* edx */ break; } } } else if (cpu == CPU_486 && *cpu_vendor == '\0') { /* * There are BlueLightning CPUs that do not change * undefined flags by dividing 5 by 2. In this case, * the CPU identification routine in locore.s leaves * cpu_vendor null string and puts CPU_486 into the * cpu. */ if (identblue() == IDENTBLUE_IBMCPU) { strcpy(cpu_vendor, "IBM"); cpu_vendor_id = CPU_VENDOR_IBM; cpu = CPU_BLUE; return; } } #endif } static u_int find_cpu_vendor_id(void) { int i; for (i = 0; i < nitems(cpu_vendors); i++) if (strcmp(cpu_vendor, cpu_vendors[i].vendor) == 0) return (cpu_vendors[i].vendor_id); return (0); } static void print_AMD_assoc(int i) { if (i == 255) printf(", fully associative\n"); else printf(", %d-way associative\n", i); } static void print_AMD_l2_assoc(int i) { switch (i & 0x0f) { case 0: printf(", disabled/not present\n"); break; case 1: printf(", direct mapped\n"); break; case 2: printf(", 2-way associative\n"); break; case 4: printf(", 4-way associative\n"); break; case 6: printf(", 8-way associative\n"); break; case 8: printf(", 16-way associative\n"); break; case 15: printf(", fully associative\n"); break; default: printf(", reserved configuration\n"); break; } } static void print_AMD_info(void) { #ifdef __i386__ uint64_t amd_whcr; #endif u_int regs[4]; if (cpu_exthigh >= 0x80000005) { do_cpuid(0x80000005, regs); printf("L1 2MB data TLB: %d entries", (regs[0] >> 16) & 0xff); print_AMD_assoc(regs[0] >> 24); printf("L1 2MB instruction TLB: %d entries", regs[0] & 0xff); print_AMD_assoc((regs[0] >> 8) & 0xff); printf("L1 4KB data TLB: %d entries", (regs[1] >> 16) & 0xff); print_AMD_assoc(regs[1] >> 24); printf("L1 4KB instruction TLB: %d entries", regs[1] & 0xff); print_AMD_assoc((regs[1] >> 8) & 0xff); printf("L1 data cache: %d kbytes", regs[2] >> 24); printf(", %d bytes/line", regs[2] & 0xff); printf(", %d lines/tag", (regs[2] >> 8) & 0xff); print_AMD_assoc((regs[2] >> 16) & 0xff); printf("L1 instruction cache: %d kbytes", regs[3] >> 24); printf(", %d bytes/line", regs[3] & 0xff); printf(", %d lines/tag", (regs[3] >> 8) & 0xff); print_AMD_assoc((regs[3] >> 16) & 0xff); } if (cpu_exthigh >= 0x80000006) { do_cpuid(0x80000006, regs); if ((regs[0] >> 16) != 0) { printf("L2 2MB data TLB: %d entries", (regs[0] >> 16) & 0xfff); print_AMD_l2_assoc(regs[0] >> 28); printf("L2 2MB instruction TLB: %d entries", regs[0] & 0xfff); print_AMD_l2_assoc((regs[0] >> 28) & 0xf); } else { printf("L2 2MB unified TLB: %d entries", regs[0] & 0xfff); print_AMD_l2_assoc((regs[0] >> 28) & 0xf); } if ((regs[1] >> 16) != 0) { printf("L2 4KB data TLB: %d entries", (regs[1] >> 16) & 0xfff); print_AMD_l2_assoc(regs[1] >> 28); printf("L2 4KB instruction TLB: %d entries", (regs[1] >> 16) & 0xfff); print_AMD_l2_assoc((regs[1] >> 28) & 0xf); } else { printf("L2 4KB unified TLB: %d entries", (regs[1] >> 16) & 0xfff); print_AMD_l2_assoc((regs[1] >> 28) & 0xf); } printf("L2 unified cache: %d kbytes", regs[2] >> 16); printf(", %d bytes/line", regs[2] & 0xff); printf(", %d lines/tag", (regs[2] >> 8) & 0x0f); print_AMD_l2_assoc((regs[2] >> 12) & 0x0f); } #ifdef __i386__ if (((cpu_id & 0xf00) == 0x500) && (((cpu_id & 0x0f0) > 0x80) || (((cpu_id & 0x0f0) == 0x80) && (cpu_id & 0x00f) > 0x07))) { /* K6-2(new core [Stepping 8-F]), K6-III or later */ amd_whcr = rdmsr(0xc0000082); if (!(amd_whcr & (0x3ff << 22))) { printf("Write Allocate Disable\n"); } else { printf("Write Allocate Enable Limit: %dM bytes\n", (u_int32_t)((amd_whcr & (0x3ff << 22)) >> 22) * 4); printf("Write Allocate 15-16M bytes: %s\n", (amd_whcr & (1 << 16)) ? "Enable" : "Disable"); } } else if (((cpu_id & 0xf00) == 0x500) && ((cpu_id & 0x0f0) > 0x50)) { /* K6, K6-2(old core) */ amd_whcr = rdmsr(0xc0000082); if (!(amd_whcr & (0x7f << 1))) { printf("Write Allocate Disable\n"); } else { printf("Write Allocate Enable Limit: %dM bytes\n", (u_int32_t)((amd_whcr & (0x7f << 1)) >> 1) * 4); printf("Write Allocate 15-16M bytes: %s\n", (amd_whcr & 0x0001) ? "Enable" : "Disable"); printf("Hardware Write Allocate Control: %s\n", (amd_whcr & 0x0100) ? "Enable" : "Disable"); } } #endif /* * Opteron Rev E shows a bug as in very rare occasions a read memory * barrier is not performed as expected if it is followed by a * non-atomic read-modify-write instruction. * As long as that bug pops up very rarely (intensive machine usage * on other operating systems generally generates one unexplainable * crash any 2 months) and as long as a model specific fix would be * impractical at this stage, print out a warning string if the broken * model and family are identified. */ if (CPUID_TO_FAMILY(cpu_id) == 0xf && CPUID_TO_MODEL(cpu_id) >= 0x20 && CPUID_TO_MODEL(cpu_id) <= 0x3f) printf("WARNING: This architecture revision has known SMP " "hardware bugs which may cause random instability\n"); } static void print_INTEL_info(void) { u_int regs[4]; u_int rounds, regnum; u_int nwaycode, nway; if (cpu_high >= 2) { rounds = 0; do { do_cpuid(0x2, regs); if (rounds == 0 && (rounds = (regs[0] & 0xff)) == 0) break; /* we have a buggy CPU */ for (regnum = 0; regnum <= 3; ++regnum) { if (regs[regnum] & (1<<31)) continue; if (regnum != 0) print_INTEL_TLB(regs[regnum] & 0xff); print_INTEL_TLB((regs[regnum] >> 8) & 0xff); print_INTEL_TLB((regs[regnum] >> 16) & 0xff); print_INTEL_TLB((regs[regnum] >> 24) & 0xff); } } while (--rounds > 0); } if (cpu_exthigh >= 0x80000006) { do_cpuid(0x80000006, regs); nwaycode = (regs[2] >> 12) & 0x0f; if (nwaycode >= 0x02 && nwaycode <= 0x08) nway = 1 << (nwaycode / 2); else nway = 0; printf("L2 cache: %u kbytes, %u-way associative, %u bytes/line\n", (regs[2] >> 16) & 0xffff, nway, regs[2] & 0xff); } } static void print_INTEL_TLB(u_int data) { switch (data) { case 0x0: case 0x40: default: break; case 0x1: printf("Instruction TLB: 4 KB pages, 4-way set associative, 32 entries\n"); break; case 0x2: printf("Instruction TLB: 4 MB pages, fully associative, 2 entries\n"); break; case 0x3: printf("Data TLB: 4 KB pages, 4-way set associative, 64 entries\n"); break; case 0x4: printf("Data TLB: 4 MB Pages, 4-way set associative, 8 entries\n"); break; case 0x6: printf("1st-level instruction cache: 8 KB, 4-way set associative, 32 byte line size\n"); break; case 0x8: printf("1st-level instruction cache: 16 KB, 4-way set associative, 32 byte line size\n"); break; case 0x9: printf("1st-level instruction cache: 32 KB, 4-way set associative, 64 byte line size\n"); break; case 0xa: printf("1st-level data cache: 8 KB, 2-way set associative, 32 byte line size\n"); break; case 0xb: printf("Instruction TLB: 4 MByte pages, 4-way set associative, 4 entries\n"); break; case 0xc: printf("1st-level data cache: 16 KB, 4-way set associative, 32 byte line size\n"); break; case 0xd: printf("1st-level data cache: 16 KBytes, 4-way set associative, 64 byte line size"); break; case 0xe: printf("1st-level data cache: 24 KBytes, 6-way set associative, 64 byte line size\n"); break; case 0x1d: printf("2nd-level cache: 128 KBytes, 2-way set associative, 64 byte line size\n"); break; case 0x21: printf("2nd-level cache: 256 KBytes, 8-way set associative, 64 byte line size\n"); break; case 0x22: printf("3rd-level cache: 512 KB, 4-way set associative, sectored cache, 64 byte line size\n"); break; case 0x23: printf("3rd-level cache: 1 MB, 8-way set associative, sectored cache, 64 byte line size\n"); break; case 0x24: printf("2nd-level cache: 1 MBytes, 16-way set associative, 64 byte line size\n"); break; case 0x25: printf("3rd-level cache: 2 MB, 8-way set associative, sectored cache, 64 byte line size\n"); break; case 0x29: printf("3rd-level cache: 4 MB, 8-way set associative, sectored cache, 64 byte line size\n"); break; case 0x2c: printf("1st-level data cache: 32 KB, 8-way set associative, 64 byte line size\n"); break; case 0x30: printf("1st-level instruction cache: 32 KB, 8-way set associative, 64 byte line size\n"); break; case 0x39: /* De-listed in SDM rev. 54 */ printf("2nd-level cache: 128 KB, 4-way set associative, sectored cache, 64 byte line size\n"); break; case 0x3b: /* De-listed in SDM rev. 54 */ printf("2nd-level cache: 128 KB, 2-way set associative, sectored cache, 64 byte line size\n"); break; case 0x3c: /* De-listed in SDM rev. 54 */ printf("2nd-level cache: 256 KB, 4-way set associative, sectored cache, 64 byte line size\n"); break; case 0x41: printf("2nd-level cache: 128 KB, 4-way set associative, 32 byte line size\n"); break; case 0x42: printf("2nd-level cache: 256 KB, 4-way set associative, 32 byte line size\n"); break; case 0x43: printf("2nd-level cache: 512 KB, 4-way set associative, 32 byte line size\n"); break; case 0x44: printf("2nd-level cache: 1 MB, 4-way set associative, 32 byte line size\n"); break; case 0x45: printf("2nd-level cache: 2 MB, 4-way set associative, 32 byte line size\n"); break; case 0x46: printf("3rd-level cache: 4 MB, 4-way set associative, 64 byte line size\n"); break; case 0x47: printf("3rd-level cache: 8 MB, 8-way set associative, 64 byte line size\n"); break; case 0x48: printf("2nd-level cache: 3MByte, 12-way set associative, 64 byte line size\n"); break; case 0x49: if (CPUID_TO_FAMILY(cpu_id) == 0xf && CPUID_TO_MODEL(cpu_id) == 0x6) printf("3rd-level cache: 4MB, 16-way set associative, 64-byte line size\n"); else printf("2nd-level cache: 4 MByte, 16-way set associative, 64 byte line size"); break; case 0x4a: printf("3rd-level cache: 6MByte, 12-way set associative, 64 byte line size\n"); break; case 0x4b: printf("3rd-level cache: 8MByte, 16-way set associative, 64 byte line size\n"); break; case 0x4c: printf("3rd-level cache: 12MByte, 12-way set associative, 64 byte line size\n"); break; case 0x4d: printf("3rd-level cache: 16MByte, 16-way set associative, 64 byte line size\n"); break; case 0x4e: printf("2nd-level cache: 6MByte, 24-way set associative, 64 byte line size\n"); break; case 0x4f: printf("Instruction TLB: 4 KByte pages, 32 entries\n"); break; case 0x50: printf("Instruction TLB: 4 KB, 2 MB or 4 MB pages, fully associative, 64 entries\n"); break; case 0x51: printf("Instruction TLB: 4 KB, 2 MB or 4 MB pages, fully associative, 128 entries\n"); break; case 0x52: printf("Instruction TLB: 4 KB, 2 MB or 4 MB pages, fully associative, 256 entries\n"); break; case 0x55: printf("Instruction TLB: 2-MByte or 4-MByte pages, fully associative, 7 entries\n"); break; case 0x56: printf("Data TLB0: 4 MByte pages, 4-way set associative, 16 entries\n"); break; case 0x57: printf("Data TLB0: 4 KByte pages, 4-way associative, 16 entries\n"); break; case 0x59: printf("Data TLB0: 4 KByte pages, fully associative, 16 entries\n"); break; case 0x5a: printf("Data TLB0: 2-MByte or 4 MByte pages, 4-way set associative, 32 entries\n"); break; case 0x5b: printf("Data TLB: 4 KB or 4 MB pages, fully associative, 64 entries\n"); break; case 0x5c: printf("Data TLB: 4 KB or 4 MB pages, fully associative, 128 entries\n"); break; case 0x5d: printf("Data TLB: 4 KB or 4 MB pages, fully associative, 256 entries\n"); break; case 0x60: printf("1st-level data cache: 16 KB, 8-way set associative, sectored cache, 64 byte line size\n"); break; case 0x61: printf("Instruction TLB: 4 KByte pages, fully associative, 48 entries\n"); break; case 0x63: printf("Data TLB: 2 MByte or 4 MByte pages, 4-way set associative, 32 entries and a separate array with 1 GByte pages, 4-way set associative, 4 entries\n"); break; case 0x64: printf("Data TLB: 4 KBytes pages, 4-way set associative, 512 entries\n"); break; case 0x66: printf("1st-level data cache: 8 KB, 4-way set associative, sectored cache, 64 byte line size\n"); break; case 0x67: printf("1st-level data cache: 16 KB, 4-way set associative, sectored cache, 64 byte line size\n"); break; case 0x68: printf("1st-level data cache: 32 KB, 4 way set associative, sectored cache, 64 byte line size\n"); break; case 0x6a: printf("uTLB: 4KByte pages, 8-way set associative, 64 entries\n"); break; case 0x6b: printf("DTLB: 4KByte pages, 8-way set associative, 256 entries\n"); break; case 0x6c: printf("DTLB: 2M/4M pages, 8-way set associative, 128 entries\n"); break; case 0x6d: printf("DTLB: 1 GByte pages, fully associative, 16 entries\n"); break; case 0x70: printf("Trace cache: 12K-uops, 8-way set associative\n"); break; case 0x71: printf("Trace cache: 16K-uops, 8-way set associative\n"); break; case 0x72: printf("Trace cache: 32K-uops, 8-way set associative\n"); break; case 0x76: printf("Instruction TLB: 2M/4M pages, fully associative, 8 entries\n"); break; case 0x78: printf("2nd-level cache: 1 MB, 4-way set associative, 64-byte line size\n"); break; case 0x79: printf("2nd-level cache: 128 KB, 8-way set associative, sectored cache, 64 byte line size\n"); break; case 0x7a: printf("2nd-level cache: 256 KB, 8-way set associative, sectored cache, 64 byte line size\n"); break; case 0x7b: printf("2nd-level cache: 512 KB, 8-way set associative, sectored cache, 64 byte line size\n"); break; case 0x7c: printf("2nd-level cache: 1 MB, 8-way set associative, sectored cache, 64 byte line size\n"); break; case 0x7d: printf("2nd-level cache: 2-MB, 8-way set associative, 64-byte line size\n"); break; case 0x7f: printf("2nd-level cache: 512-KB, 2-way set associative, 64-byte line size\n"); break; case 0x80: printf("2nd-level cache: 512 KByte, 8-way set associative, 64-byte line size\n"); break; case 0x82: printf("2nd-level cache: 256 KB, 8-way set associative, 32 byte line size\n"); break; case 0x83: printf("2nd-level cache: 512 KB, 8-way set associative, 32 byte line size\n"); break; case 0x84: printf("2nd-level cache: 1 MB, 8-way set associative, 32 byte line size\n"); break; case 0x85: printf("2nd-level cache: 2 MB, 8-way set associative, 32 byte line size\n"); break; case 0x86: printf("2nd-level cache: 512 KB, 4-way set associative, 64 byte line size\n"); break; case 0x87: printf("2nd-level cache: 1 MB, 8-way set associative, 64 byte line size\n"); break; case 0xa0: printf("DTLB: 4k pages, fully associative, 32 entries\n"); break; case 0xb0: printf("Instruction TLB: 4 KB Pages, 4-way set associative, 128 entries\n"); break; case 0xb1: printf("Instruction TLB: 2M pages, 4-way, 8 entries or 4M pages, 4-way, 4 entries\n"); break; case 0xb2: printf("Instruction TLB: 4KByte pages, 4-way set associative, 64 entries\n"); break; case 0xb3: printf("Data TLB: 4 KB Pages, 4-way set associative, 128 entries\n"); break; case 0xb4: printf("Data TLB1: 4 KByte pages, 4-way associative, 256 entries\n"); break; case 0xb5: printf("Instruction TLB: 4KByte pages, 8-way set associative, 64 entries\n"); break; case 0xb6: printf("Instruction TLB: 4KByte pages, 8-way set associative, 128 entries\n"); break; case 0xba: printf("Data TLB1: 4 KByte pages, 4-way associative, 64 entries\n"); break; case 0xc0: printf("Data TLB: 4 KByte and 4 MByte pages, 4-way associative, 8 entries\n"); break; case 0xc1: printf("Shared 2nd-Level TLB: 4 KByte/2MByte pages, 8-way associative, 1024 entries\n"); break; case 0xc2: printf("DTLB: 4 KByte/2 MByte pages, 4-way associative, 16 entries\n"); break; case 0xc3: printf("Shared 2nd-Level TLB: 4 KByte /2 MByte pages, 6-way associative, 1536 entries. Also 1GBbyte pages, 4-way, 16 entries\n"); break; case 0xc4: printf("DTLB: 2M/4M Byte pages, 4-way associative, 32 entries\n"); break; case 0xca: printf("Shared 2nd-Level TLB: 4 KByte pages, 4-way associative, 512 entries\n"); break; case 0xd0: printf("3rd-level cache: 512 KByte, 4-way set associative, 64 byte line size\n"); break; case 0xd1: printf("3rd-level cache: 1 MByte, 4-way set associative, 64 byte line size\n"); break; case 0xd2: printf("3rd-level cache: 2 MByte, 4-way set associative, 64 byte line size\n"); break; case 0xd6: printf("3rd-level cache: 1 MByte, 8-way set associative, 64 byte line size\n"); break; case 0xd7: printf("3rd-level cache: 2 MByte, 8-way set associative, 64 byte line size\n"); break; case 0xd8: printf("3rd-level cache: 4 MByte, 8-way set associative, 64 byte line size\n"); break; case 0xdc: printf("3rd-level cache: 1.5 MByte, 12-way set associative, 64 byte line size\n"); break; case 0xdd: printf("3rd-level cache: 3 MByte, 12-way set associative, 64 byte line size\n"); break; case 0xde: printf("3rd-level cache: 6 MByte, 12-way set associative, 64 byte line size\n"); break; case 0xe2: printf("3rd-level cache: 2 MByte, 16-way set associative, 64 byte line size\n"); break; case 0xe3: printf("3rd-level cache: 4 MByte, 16-way set associative, 64 byte line size\n"); break; case 0xe4: printf("3rd-level cache: 8 MByte, 16-way set associative, 64 byte line size\n"); break; case 0xea: printf("3rd-level cache: 12MByte, 24-way set associative, 64 byte line size\n"); break; case 0xeb: printf("3rd-level cache: 18MByte, 24-way set associative, 64 byte line size\n"); break; case 0xec: printf("3rd-level cache: 24MByte, 24-way set associative, 64 byte line size\n"); break; case 0xf0: printf("64-Byte prefetching\n"); break; case 0xf1: printf("128-Byte prefetching\n"); break; } } static void print_svm_info(void) { u_int features, regs[4]; uint64_t msr; int comma; printf("\n SVM: "); do_cpuid(0x8000000A, regs); features = regs[3]; msr = rdmsr(MSR_VM_CR); if ((msr & VM_CR_SVMDIS) == VM_CR_SVMDIS) printf("(disabled in BIOS) "); if (!bootverbose) { comma = 0; if (features & (1 << 0)) { printf("%sNP", comma ? "," : ""); comma = 1; } if (features & (1 << 3)) { printf("%sNRIP", comma ? "," : ""); comma = 1; } if (features & (1 << 5)) { printf("%sVClean", comma ? "," : ""); comma = 1; } if (features & (1 << 6)) { printf("%sAFlush", comma ? "," : ""); comma = 1; } if (features & (1 << 7)) { printf("%sDAssist", comma ? "," : ""); comma = 1; } printf("%sNAsids=%d", comma ? "," : "", regs[1]); return; } printf("Features=0x%b", features, "\020" "\001NP" /* Nested paging */ "\002LbrVirt" /* LBR virtualization */ "\003SVML" /* SVM lock */ "\004NRIPS" /* NRIP save */ "\005TscRateMsr" /* MSR based TSC rate control */ "\006VmcbClean" /* VMCB clean bits */ "\007FlushByAsid" /* Flush by ASID */ "\010DecodeAssist" /* Decode assist */ "\011" "\012" "\013PauseFilter" /* PAUSE intercept filter */ "\014" "\015PauseFilterThreshold" /* PAUSE filter threshold */ "\016AVIC" /* virtual interrupt controller */ ); printf("\nRevision=%d, ASIDs=%d", regs[0] & 0xff, regs[1]); } #ifdef __i386__ static void print_transmeta_info(void) { u_int regs[4], nreg = 0; do_cpuid(0x80860000, regs); nreg = regs[0]; if (nreg >= 0x80860001) { do_cpuid(0x80860001, regs); printf(" Processor revision %u.%u.%u.%u\n", (regs[1] >> 24) & 0xff, (regs[1] >> 16) & 0xff, (regs[1] >> 8) & 0xff, regs[1] & 0xff); } if (nreg >= 0x80860002) { do_cpuid(0x80860002, regs); printf(" Code Morphing Software revision %u.%u.%u-%u-%u\n", (regs[1] >> 24) & 0xff, (regs[1] >> 16) & 0xff, (regs[1] >> 8) & 0xff, regs[1] & 0xff, regs[2]); } if (nreg >= 0x80860006) { char info[65]; do_cpuid(0x80860003, (u_int*) &info[0]); do_cpuid(0x80860004, (u_int*) &info[16]); do_cpuid(0x80860005, (u_int*) &info[32]); do_cpuid(0x80860006, (u_int*) &info[48]); info[64] = 0; printf(" %s\n", info); } } #endif static void print_via_padlock_info(void) { u_int regs[4]; do_cpuid(0xc0000001, regs); printf("\n VIA Padlock Features=0x%b", regs[3], "\020" "\003RNG" /* RNG */ "\007AES" /* ACE */ "\011AES-CTR" /* ACE2 */ "\013SHA1,SHA256" /* PHE */ "\015RSA" /* PMM */ ); } static uint32_t vmx_settable(uint64_t basic, int msr, int true_msr) { uint64_t val; if (basic & (1ULL << 55)) val = rdmsr(true_msr); else val = rdmsr(msr); /* Just report the controls that can be set to 1. */ return (val >> 32); } static void print_vmx_info(void) { uint64_t basic, msr; uint32_t entry, exit, mask, pin, proc, proc2; int comma; printf("\n VT-x: "); msr = rdmsr(MSR_IA32_FEATURE_CONTROL); if (!(msr & IA32_FEATURE_CONTROL_VMX_EN)) printf("(disabled in BIOS) "); basic = rdmsr(MSR_VMX_BASIC); pin = vmx_settable(basic, MSR_VMX_PINBASED_CTLS, MSR_VMX_TRUE_PINBASED_CTLS); proc = vmx_settable(basic, MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS); if (proc & PROCBASED_SECONDARY_CONTROLS) proc2 = vmx_settable(basic, MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2); else proc2 = 0; exit = vmx_settable(basic, MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS); entry = vmx_settable(basic, MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS); if (!bootverbose) { comma = 0; if (exit & VM_EXIT_SAVE_PAT && exit & VM_EXIT_LOAD_PAT && entry & VM_ENTRY_LOAD_PAT) { printf("%sPAT", comma ? "," : ""); comma = 1; } if (proc & PROCBASED_HLT_EXITING) { printf("%sHLT", comma ? "," : ""); comma = 1; } if (proc & PROCBASED_MTF) { printf("%sMTF", comma ? "," : ""); comma = 1; } if (proc & PROCBASED_PAUSE_EXITING) { printf("%sPAUSE", comma ? "," : ""); comma = 1; } if (proc2 & PROCBASED2_ENABLE_EPT) { printf("%sEPT", comma ? "," : ""); comma = 1; } if (proc2 & PROCBASED2_UNRESTRICTED_GUEST) { printf("%sUG", comma ? "," : ""); comma = 1; } if (proc2 & PROCBASED2_ENABLE_VPID) { printf("%sVPID", comma ? "," : ""); comma = 1; } if (proc & PROCBASED_USE_TPR_SHADOW && proc2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES && proc2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE && proc2 & PROCBASED2_APIC_REGISTER_VIRTUALIZATION && proc2 & PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY) { printf("%sVID", comma ? "," : ""); comma = 1; if (pin & PINBASED_POSTED_INTERRUPT) printf(",PostIntr"); } return; } mask = basic >> 32; printf("Basic Features=0x%b", mask, "\020" "\02132PA" /* 32-bit physical addresses */ "\022SMM" /* SMM dual-monitor */ "\027INS/OUTS" /* VM-exit info for INS and OUTS */ "\030TRUE" /* TRUE_CTLS MSRs */ ); printf("\n Pin-Based Controls=0x%b", pin, "\020" "\001ExtINT" /* External-interrupt exiting */ "\004NMI" /* NMI exiting */ "\006VNMI" /* Virtual NMIs */ "\007PreTmr" /* Activate VMX-preemption timer */ "\010PostIntr" /* Process posted interrupts */ ); printf("\n Primary Processor Controls=0x%b", proc, "\020" "\003INTWIN" /* Interrupt-window exiting */ "\004TSCOff" /* Use TSC offsetting */ "\010HLT" /* HLT exiting */ "\012INVLPG" /* INVLPG exiting */ "\013MWAIT" /* MWAIT exiting */ "\014RDPMC" /* RDPMC exiting */ "\015RDTSC" /* RDTSC exiting */ "\020CR3-LD" /* CR3-load exiting */ "\021CR3-ST" /* CR3-store exiting */ "\024CR8-LD" /* CR8-load exiting */ "\025CR8-ST" /* CR8-store exiting */ "\026TPR" /* Use TPR shadow */ "\027NMIWIN" /* NMI-window exiting */ "\030MOV-DR" /* MOV-DR exiting */ "\031IO" /* Unconditional I/O exiting */ "\032IOmap" /* Use I/O bitmaps */ "\034MTF" /* Monitor trap flag */ "\035MSRmap" /* Use MSR bitmaps */ "\036MONITOR" /* MONITOR exiting */ "\037PAUSE" /* PAUSE exiting */ ); if (proc & PROCBASED_SECONDARY_CONTROLS) printf("\n Secondary Processor Controls=0x%b", proc2, "\020" "\001APIC" /* Virtualize APIC accesses */ "\002EPT" /* Enable EPT */ "\003DT" /* Descriptor-table exiting */ "\004RDTSCP" /* Enable RDTSCP */ "\005x2APIC" /* Virtualize x2APIC mode */ "\006VPID" /* Enable VPID */ "\007WBINVD" /* WBINVD exiting */ "\010UG" /* Unrestricted guest */ "\011APIC-reg" /* APIC-register virtualization */ "\012VID" /* Virtual-interrupt delivery */ "\013PAUSE-loop" /* PAUSE-loop exiting */ "\014RDRAND" /* RDRAND exiting */ "\015INVPCID" /* Enable INVPCID */ "\016VMFUNC" /* Enable VM functions */ "\017VMCS" /* VMCS shadowing */ "\020EPT#VE" /* EPT-violation #VE */ "\021XSAVES" /* Enable XSAVES/XRSTORS */ ); printf("\n Exit Controls=0x%b", mask, "\020" "\003DR" /* Save debug controls */ /* Ignore Host address-space size */ "\015PERF" /* Load MSR_PERF_GLOBAL_CTRL */ "\020AckInt" /* Acknowledge interrupt on exit */ "\023PAT-SV" /* Save MSR_PAT */ "\024PAT-LD" /* Load MSR_PAT */ "\025EFER-SV" /* Save MSR_EFER */ "\026EFER-LD" /* Load MSR_EFER */ "\027PTMR-SV" /* Save VMX-preemption timer value */ ); printf("\n Entry Controls=0x%b", mask, "\020" "\003DR" /* Save debug controls */ /* Ignore IA-32e mode guest */ /* Ignore Entry to SMM */ /* Ignore Deactivate dual-monitor treatment */ "\016PERF" /* Load MSR_PERF_GLOBAL_CTRL */ "\017PAT" /* Load MSR_PAT */ "\020EFER" /* Load MSR_EFER */ ); if (proc & PROCBASED_SECONDARY_CONTROLS && (proc2 & (PROCBASED2_ENABLE_EPT | PROCBASED2_ENABLE_VPID)) != 0) { msr = rdmsr(MSR_VMX_EPT_VPID_CAP); mask = msr; printf("\n EPT Features=0x%b", mask, "\020" "\001XO" /* Execute-only translations */ "\007PW4" /* Page-walk length of 4 */ "\011UC" /* EPT paging-structure mem can be UC */ "\017WB" /* EPT paging-structure mem can be WB */ "\0212M" /* EPT PDE can map a 2-Mbyte page */ "\0221G" /* EPT PDPTE can map a 1-Gbyte page */ "\025INVEPT" /* INVEPT is supported */ "\026AD" /* Accessed and dirty flags for EPT */ "\032single" /* INVEPT single-context type */ "\033all" /* INVEPT all-context type */ ); mask = msr >> 32; printf("\n VPID Features=0x%b", mask, "\020" "\001INVVPID" /* INVVPID is supported */ "\011individual" /* INVVPID individual-address type */ "\012single" /* INVVPID single-context type */ "\013all" /* INVVPID all-context type */ /* INVVPID single-context-retaining-globals type */ "\014single-globals" ); } } static void print_hypervisor_info(void) { if (*hv_vendor) printf("Hypervisor: Origin = \"%s\"\n", hv_vendor); }