diff --git a/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c b/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c
--- a/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c
+++ b/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c
@@ -65,9 +65,24 @@
 #define MAX_FRAGMENTS (VCHIQ_NUM_CURRENT_BULKS * 2)
 
+/*
+ * XXXMDC
+ * Do this less ad-hoc-y -- e.g.
+ * https://github.com/raspberrypi/linux/commit/c683db8860a80562a2bb5b451d77b3e471d24f36
+ */
+#if defined(__aarch64__)
+int g_cache_line_size = 64;
+#else
 int g_cache_line_size = 32;
+#endif
 static int g_fragment_size;
 
+unsigned int g_long_bulk_space = 0;
+#define VM_PAGE_TO_VC_BULK_PAGE(x) (			\
+	g_long_bulk_space ? VM_PAGE_TO_PHYS(x)		\
+	    : PHYS_TO_VCBUS(VM_PAGE_TO_PHYS(x))		\
+)
+
 typedef struct vchiq_2835_state_struct {
 	int inited;
 	VCHIQ_ARM_STATE_T arm_state;
@@ -113,6 +128,59 @@
 	*addr = PHYS_TO_VCBUS(segs[0].ds_addr);
 }
 
+#if defined(__aarch64__) /* See comment in free_pagelist */
+/*
+ * Invalidate the cache lines covering [offset, offset + count) within a
+ * single physical page, via a temporary quick mapping.
+ */
+static int
+invalidate_cachelines_in_range_of_ppage(vm_page_t p, size_t offset,
+    size_t count)
+{
+	if (offset + count > PAGE_SIZE)
+		return (EINVAL);
+	uint8_t *dst = (uint8_t *)pmap_quick_enter_page(p);
+	if (dst == NULL)
+		return (ENOMEM);
+	cpu_dcache_inv_range((vm_offset_t)dst + offset, count);
+	pmap_quick_remove_page((vm_offset_t)dst);
+	return (0);
+}
+
+/* XXXMDC bulk instead of loading and invalidating single pages? */
+static void
+invalidate_cachelines_in_range_of_ppage_seq(vm_page_t *p, size_t start,
+    size_t count)
+{
+	if (start >= PAGE_SIZE)
+		goto invalid_input;
+
+#define _NEXT_AT(x, _m) (((x) + ((_m) - 1)) & ~((_m) - 1)) /* power-of-two _m */
+	/* Begin at the first cache-line boundary at or after 'start'. */
+	size_t offset = _NEXT_AT(start, g_cache_line_size);
+#undef _NEXT_AT
+	count = (offset < start + count) ? count - (offset - start) : 0;
+	if (offset >= PAGE_SIZE) {
+		/* Rounding up crossed into the next page. */
+		p++;
+		offset -= PAGE_SIZE;
+	}
+	for (size_t done = 0; done < count;
+	    p++, done += PAGE_SIZE - offset, offset = 0) {
+		size_t in_page = PAGE_SIZE - offset;
+		size_t todo = (count - done > in_page) ? in_page : count - done;
+		int e = invalidate_cachelines_in_range_of_ppage(*p, offset,
+		    todo);
+		if (e != 0)
+			goto problem_in_loop;
+	}
+	return;
+
+problem_in_loop:
+invalid_input:
+	WARN_ON(1);
+	return;
+}
+#endif
+
 static int
 copyout_page(vm_page_t p, size_t offset, void *kaddr, size_t size)
 {
@@ -171,7 +239,7 @@
 		goto failed_load;
 	}
 
-	WARN_ON(((int)g_slot_mem & (PAGE_SIZE - 1)) != 0);
+	WARN_ON(((size_t)g_slot_mem & (PAGE_SIZE - 1)) != 0);
 
 	vchiq_slot_zero = vchiq_init_slots(g_slot_mem, g_slot_mem_size);
 	if (!vchiq_slot_zero) {
@@ -204,8 +272,8 @@
 	bcm_mbox_write(BCM2835_MBOX_CHAN_VCHIQ, (unsigned int)g_slot_phys);
 
 	vchiq_log_info(vchiq_arm_log_level,
-	    "vchiq_init - done (slots %x, phys %x)",
-	    (unsigned int)vchiq_slot_zero, g_slot_phys);
+	    "vchiq_init - done (slots %zx, phys %zx)",
+	    (size_t)vchiq_slot_zero, g_slot_phys);
 
 	vchiq_call_connected_callbacks();
 
@@ -393,13 +461,14 @@
 ** from increased speed as a result.
 */
+
 static int
 create_pagelist(char __user *buf, size_t count, unsigned short type,
     struct proc *p, BULKINFO_T *bi)
 {
 	PAGELIST_T *pagelist;
 	vm_page_t* pages;
-	unsigned long *addrs;
+	uint32_t *addrs;
 	unsigned int num_pages, i;
 	vm_offset_t offset;
 	int pagelist_size;
@@ -436,7 +505,7 @@
 	err = bus_dmamem_alloc(bi->pagelist_dma_tag, (void **)&pagelist,
 	    BUS_DMA_COHERENT | BUS_DMA_WAITOK, &bi->pagelist_dma_map);
-	if (err) {
+	if (err || !pagelist) {
 		vchiq_log_error(vchiq_core_log_level, "Unable to allocate pagelist memory");
 		err = -ENOMEM;
 		goto failed_alloc;
@@ -449,16 +518,14 @@
 	if (err) {
 		vchiq_log_error(vchiq_core_log_level, "cannot load DMA map for pagelist memory");
 		err = -ENOMEM;
+		bi->pagelist = pagelist;
 		goto failed_load;
 	}
 
 	vchiq_log_trace(vchiq_arm_log_level,
-	    "create_pagelist - %x (%d bytes @%p)", (unsigned int)pagelist, count, buf);
+	    "create_pagelist - %zx (%zu bytes @%p)", (size_t)pagelist, count, buf);
 
-	if (!pagelist)
-		return -ENOMEM;
-
-	addrs = pagelist->addrs;
+	addrs = (uint32_t *)pagelist->addrs;
 	pages = (vm_page_t*)(addrs + num_pages);
 
 	actual_pages = vm_fault_quick_hold_pages(&p->p_vmspace->vm_map,
@@ -467,8 +534,9 @@
 
 	if (actual_pages != num_pages) {
 		vm_page_unhold_pages(pages, actual_pages);
-		free(pagelist, M_VCPAGELIST);
-		return (-ENOMEM);
+		err = -ENOMEM;
+		bi->pagelist = pagelist;
+		goto failed_hold;
 	}
 
 	pagelist->length = count;
@@ -477,27 +545,28 @@
 
 	/* Group the pages into runs of contiguous pages */
 
-	base_addr = (void *)PHYS_TO_VCBUS(VM_PAGE_TO_PHYS(pages[0]));
+	size_t run_ceil = g_long_bulk_space ? 0x100 : PAGE_SIZE;
+	unsigned int pg_addr_rshift = g_long_bulk_space ? 4 : 0;
+	base_addr = (void *)VM_PAGE_TO_VC_BULK_PAGE(pages[0]);
 	next_addr = base_addr + PAGE_SIZE;
 	addridx = 0;
 	run = 0;
-
+/* Pack a run's base address (aligned down to run_ceil) with its length. */
+#define _PG_BLOCK(base, run) \
+	(((((size_t)(base)) >> pg_addr_rshift) & ~(run_ceil - 1)) + (run))
 	for (i = 1; i < num_pages; i++) {
-		addr = (void *)PHYS_TO_VCBUS(VM_PAGE_TO_PHYS(pages[i]));
-		if ((addr == next_addr) && (run < (PAGE_SIZE - 1))) {
+		addr = (void *)VM_PAGE_TO_VC_BULK_PAGE(pages[i]);
+		if ((addr == next_addr) && (run < run_ceil - 1)) {
 			next_addr += PAGE_SIZE;
 			run++;
 		} else {
-			addrs[addridx] = (unsigned long)base_addr + run;
-			addridx++;
+			addrs[addridx++] = (uint32_t)_PG_BLOCK(base_addr, run);
 			base_addr = addr;
 			next_addr = addr + PAGE_SIZE;
 			run = 0;
 		}
 	}
-
-	addrs[addridx] = (unsigned long)base_addr + run;
-	addridx++;
+	addrs[addridx++] = (uint32_t)_PG_BLOCK(base_addr, run);
+#undef _PG_BLOCK
 
 	/* Partial cache lines (fragments) require special measures */
 	if ((type == PAGELIST_READ) &&
@@ -519,12 +588,24 @@
 		g_free_fragments = *(char **) g_free_fragments;
 		up(&g_free_fragments_mutex);
 		pagelist->type =
-		   PAGELIST_READ_WITH_FRAGMENTS +
-		   (fragments - g_fragments_base)/g_fragment_size;
+		    PAGELIST_READ_WITH_FRAGMENTS +
+		    (fragments - g_fragments_base) / g_fragment_size;
+#if defined(__aarch64__)
+		bus_dmamap_sync(bcm_slots_dma_tag, bcm_slots_dma_map,
+		    BUS_DMASYNC_PREREAD);
+#endif
 	}
 
+#if defined(__aarch64__)
+	if (type == PAGELIST_READ)
+		cpu_dcache_wbinv_range((vm_offset_t)buf, count);
+	else
+		cpu_dcache_wb_range((vm_offset_t)buf, count);
+	dsb(sy);
+#else
 	pa = pmap_extract(PCPU_GET(curpmap), (vm_offset_t)buf);
 	dcache_wbinv_poc((vm_offset_t)buf, pa, count);
+#endif
 
 	bus_dmamap_sync(bi->pagelist_dma_tag, bi->pagelist_dma_map,
 	    BUS_DMASYNC_PREWRITE);
@@ -532,6 +613,8 @@
 
 	return 0;
 
+failed_hold:
+	bus_dmamap_unload(bi->pagelist_dma_tag, bi->pagelist_dma_map);
 failed_load:
 	bus_dmamem_free(bi->pagelist_dma_tag, bi->pagelist,
 	    bi->pagelist_dma_map);
 failed_alloc:
@@ -550,7 +633,7 @@
 	pagelist = bi->pagelist;
 
 	vchiq_log_trace(vchiq_arm_log_level,
-	    "free_pagelist - %x, %d (%lu bytes @%p)", (unsigned int)pagelist, actual, pagelist->length, bi->buf);
+	    "free_pagelist - %zx, %d (%lu bytes @%p)", (size_t)pagelist, actual, pagelist->length, bi->buf);
 
 	num_pages =
 	    (pagelist->length + pagelist->offset + PAGE_SIZE - 1) /
@@ -558,6 +641,27 @@
 
 	pages = (vm_page_t*)(pagelist->addrs + num_pages);
 
+#if defined(__aarch64__)
+	/*
+	 * On arm64, even if userland keeps its end of the bargain and does
+	 * NOT touch the buffers while VC owns them, merely reading memory
+	 * near the pagelist after the invalidation in create_pagelist can
+	 * cause the CPU to speculatively load (and mark valid) cache lines
+	 * covering pages inside the pagelist, so those lines must be
+	 * invalidated again before the data is handed back.
+	 *
+	 * The functional test triggers exactly this pattern and does not
+	 * pass without the extra invalidation.
+	 *
+	 * XXXMDC might it be enough to invalidate a couple of pages at
+	 * the ends of the page list?
+	 */
+	if (pagelist->type >= PAGELIST_READ && actual > 0)
+		invalidate_cachelines_in_range_of_ppage_seq(pages,
+		    pagelist->offset, actual);
+#endif
+
 	/* Deal with any partial cache lines (fragments) */
 	if (pagelist->type >= PAGELIST_READ_WITH_FRAGMENTS) {
 		char *fragments = g_fragments_base +
@@ -594,13 +698,18 @@
 		up(&g_free_fragments_sema);
 	}
 
-	for (i = 0; i < num_pages; i++) {
-		if (pagelist->type != PAGELIST_WRITE) {
+	if (pagelist->type != PAGELIST_WRITE) {
+		for (i = 0; i < num_pages; i++) {
 			vm_page_dirty(pages[i]);
 			pagelist_page_free(pages[i]);
 		}
 	}
 
+#if defined(__aarch64__)
+	/* XXXMDC necessary? */
+	dsb(sy);
+#endif
+
 	bus_dmamap_unload(bi->pagelist_dma_tag, bi->pagelist_dma_map);
 	bus_dmamem_free(bi->pagelist_dma_tag, bi->pagelist,
 	    bi->pagelist_dma_map);
 	bus_dma_tag_destroy(bi->pagelist_dma_tag);
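
Note (not part of the patch): the addrs[] packing used by create_pagelist above stores each run of physically contiguous pages as a single 32-bit entry: the run's base address, aligned down to run_ceil, plus the count of extra pages in the low bits. With g_long_bulk_space set, addresses are pre-shifted right by 4 so physical addresses above 4 GiB still fit in 32 bits, at the cost of shrinking the run field to 8 bits (run_ceil = 0x100). The standalone userland sketch below re-implements that arithmetic for illustration only; pg_block() and the sample address are made-up stand-ins, not driver code.

/*
 * Sketch of the pagelist entry encoding, mirroring _PG_BLOCK in the patch.
 * Build with: cc -o pg_block pg_block.c
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

static uint32_t
pg_block(uint64_t base, unsigned int run, int long_bulk_space)
{
	/* 8-bit run field and 4-bit address pre-shift in "long" mode. */
	uint64_t run_ceil = long_bulk_space ? 0x100 : PAGE_SIZE;
	unsigned int rshift = long_bulk_space ? 4 : 0;

	return ((uint32_t)(((base >> rshift) & ~(run_ceil - 1)) + run));
}

int
main(void)
{
	/* A run of 3 contiguous pages (run = 2 extra pages). */
	uint64_t base = 0x12340000;	/* illustrative page-aligned address */

	printf("classic: 0x%08x\n", pg_block(base, 2, 0));
	/* 0x12340002: page base in the high bits, run in the low 12 bits */
	printf("long:    0x%08x\n", pg_block(base, 2, 1));
	/* 0x01234002: base >> 4 in the high bits, run in the low 8 bits */
	return (0);
}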