diff options
Diffstat (limited to 'meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3427-drm-amdkfd-Add-topology-support-for-dGPUs.patch')
-rw-r--r-- | meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3427-drm-amdkfd-Add-topology-support-for-dGPUs.patch | 1084 |
1 files changed, 1084 insertions, 0 deletions
diff --git a/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3427-drm-amdkfd-Add-topology-support-for-dGPUs.patch b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3427-drm-amdkfd-Add-topology-support-for-dGPUs.patch new file mode 100644 index 00000000..5ee73845 --- /dev/null +++ b/meta-amd-bsp/recipes-kernel/linux/linux-yocto-4.14.71/3427-drm-amdkfd-Add-topology-support-for-dGPUs.patch @@ -0,0 +1,1084 @@ +From 4ac9bcd5b8a8a88456ed04bd9562120ac80f2a68 Mon Sep 17 00:00:00 2001 +From: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com> +Date: Fri, 8 Dec 2017 23:08:59 -0500 +Subject: [PATCH 3427/4131] drm/amdkfd: Add topology support for dGPUs + +Generate and parse VCRAT tables for dGPUs in kfd_topology_add_device. + +Some information that isn't available in the CRAT table is patched +into the topology after parsing. + +HSA_CAP_DOORBELL_TYPE_1_0 is dependent on the ASIC feature +CP_HQD_PQ_CONTROL.SLOT_BASED_WPTR, which was not introduced in VI +until Carrizo. Report HSA_CAP_DOORBELL_TYPE_PRE_1_0 on Tonga ASICs. + +v2: Added #include <linux/pci.h> to kfd_crat.c to make it compile + +Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com> +Signed-off-by: Ben Goz <ben.goz@amd.com> +Signed-off-by: Amber Lin <Amber.Lin@amd.com> +Signed-off-by: Jay Cornwall <Jay.Cornwall@amd.com> +Signed-off-by: Kent Russell <kent.russell@amd.com> +Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com> +Acked-by: Oded Gabbay <oded.gabbay@gmail.com> +Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com> +--- + drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 596 +++++++++++++++++++++++++++++- + drivers/gpu/drm/amd/amdkfd/kfd_crat.h | 5 +- + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 + + drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 188 ++++++++-- + drivers/gpu/drm/amd/amdkfd/kfd_topology.h | 8 +- + 5 files changed, 748 insertions(+), 51 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +index c8afbf8..d00061b 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +@@ -19,11 +19,120 @@ + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ ++ ++#include <linux/pci.h> + #include <linux/acpi.h> ++#include <linux/amd-iommu.h> + #include "kfd_crat.h" + #include "kfd_priv.h" + #include "kfd_topology.h" + ++/* GPU Processor ID base for dGPUs for which VCRAT needs to be created. ++ * GPU processor ID are expressed with Bit[31]=1. ++ * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs ++ * used in the CRAT. ++ */ ++static uint32_t gpu_processor_id_low = 0x80001000; ++ ++/* Return the next available gpu_processor_id and increment it for next GPU ++ * @total_cu_count - Total CUs present in the GPU including ones ++ * masked off ++ */ ++static inline unsigned int get_and_inc_gpu_processor_id( ++ unsigned int total_cu_count) ++{ ++ int current_id = gpu_processor_id_low; ++ ++ gpu_processor_id_low += total_cu_count; ++ return current_id; ++} ++ ++/* Static table to describe GPU Cache information */ ++struct kfd_gpu_cache_info { ++ uint32_t cache_size; ++ uint32_t cache_level; ++ uint32_t flags; ++ /* Indicates how many Compute Units share this cache ++ * Value = 1 indicates the cache is not shared ++ */ ++ uint32_t num_cu_shared; ++}; ++ ++static struct kfd_gpu_cache_info kaveri_cache_info[] = { ++ { ++ /* TCP L1 Cache per CU */ ++ .cache_size = 16, ++ .cache_level = 1, ++ .flags = (CRAT_CACHE_FLAGS_ENABLED | ++ CRAT_CACHE_FLAGS_DATA_CACHE | ++ CRAT_CACHE_FLAGS_SIMD_CACHE), ++ .num_cu_shared = 1, ++ ++ }, ++ { ++ /* Scalar L1 Instruction Cache (in SQC module) per bank */ ++ .cache_size = 16, ++ .cache_level = 1, ++ .flags = (CRAT_CACHE_FLAGS_ENABLED | ++ CRAT_CACHE_FLAGS_INST_CACHE | ++ CRAT_CACHE_FLAGS_SIMD_CACHE), ++ .num_cu_shared = 2, ++ }, ++ { ++ /* Scalar L1 Data Cache (in SQC module) per bank */ ++ .cache_size = 8, ++ .cache_level = 1, ++ .flags = (CRAT_CACHE_FLAGS_ENABLED | ++ CRAT_CACHE_FLAGS_DATA_CACHE | ++ CRAT_CACHE_FLAGS_SIMD_CACHE), ++ .num_cu_shared = 2, ++ }, ++ ++ /* TODO: Add L2 Cache information */ ++}; ++ ++ ++static struct kfd_gpu_cache_info carrizo_cache_info[] = { ++ { ++ /* TCP L1 Cache per CU */ ++ .cache_size = 16, ++ .cache_level = 1, ++ .flags = (CRAT_CACHE_FLAGS_ENABLED | ++ CRAT_CACHE_FLAGS_DATA_CACHE | ++ CRAT_CACHE_FLAGS_SIMD_CACHE), ++ .num_cu_shared = 1, ++ }, ++ { ++ /* Scalar L1 Instruction Cache (in SQC module) per bank */ ++ .cache_size = 8, ++ .cache_level = 1, ++ .flags = (CRAT_CACHE_FLAGS_ENABLED | ++ CRAT_CACHE_FLAGS_INST_CACHE | ++ CRAT_CACHE_FLAGS_SIMD_CACHE), ++ .num_cu_shared = 4, ++ }, ++ { ++ /* Scalar L1 Data Cache (in SQC module) per bank. */ ++ .cache_size = 4, ++ .cache_level = 1, ++ .flags = (CRAT_CACHE_FLAGS_ENABLED | ++ CRAT_CACHE_FLAGS_DATA_CACHE | ++ CRAT_CACHE_FLAGS_SIMD_CACHE), ++ .num_cu_shared = 4, ++ }, ++ ++ /* TODO: Add L2 Cache information */ ++}; ++ ++/* NOTE: In future if more information is added to struct kfd_gpu_cache_info ++ * the following ASICs may need a separate table. ++ */ ++#define hawaii_cache_info kaveri_cache_info ++#define tonga_cache_info carrizo_cache_info ++#define fiji_cache_info carrizo_cache_info ++#define polaris10_cache_info carrizo_cache_info ++#define polaris11_cache_info carrizo_cache_info ++ + static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, + struct crat_subtype_computeunit *cu) + { +@@ -44,7 +153,7 @@ static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, + dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; + dev->node_props.max_waves_per_simd = cu->max_waves_simd; + dev->node_props.wave_front_size = cu->wave_front_size; +- dev->node_props.array_count = cu->num_arrays; ++ dev->node_props.array_count = cu->array_count; + dev->node_props.cu_per_simd_array = cu->num_cu_per_array; + dev->node_props.simd_per_cu = cu->num_simd_per_cu; + dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; +@@ -94,9 +203,16 @@ static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem, + if (!props) + return -ENOMEM; + +- if (dev->node_props.cpu_cores_count == 0) +- props->heap_type = HSA_MEM_HEAP_TYPE_FB_PRIVATE; +- else ++ /* We're on GPU node */ ++ if (dev->node_props.cpu_cores_count == 0) { ++ /* APU */ ++ if (mem->visibility_type == 0) ++ props->heap_type = ++ HSA_MEM_HEAP_TYPE_FB_PRIVATE; ++ /* dGPU */ ++ else ++ props->heap_type = mem->visibility_type; ++ } else + props->heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; + + if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) +@@ -128,13 +244,29 @@ static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, + struct kfd_cache_properties *props; + struct kfd_topology_device *dev; + uint32_t id; ++ uint32_t total_num_of_cu; + + id = cache->processor_id_low; + + pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id); +- list_for_each_entry(dev, device_list, list) +- if (id == dev->node_props.cpu_core_id_base || +- id == dev->node_props.simd_id_base) { ++ list_for_each_entry(dev, device_list, list) { ++ total_num_of_cu = (dev->node_props.array_count * ++ dev->node_props.cu_per_simd_array); ++ ++ /* Cache infomration in CRAT doesn't have proximity_domain ++ * information as it is associated with a CPU core or GPU ++ * Compute Unit. So map the cache using CPU core Id or SIMD ++ * (GPU) ID. ++ * TODO: This works because currently we can safely assume that ++ * Compute Units are parsed before caches are parsed. In ++ * future, remove this dependency ++ */ ++ if ((id >= dev->node_props.cpu_core_id_base && ++ id <= dev->node_props.cpu_core_id_base + ++ dev->node_props.cpu_cores_count) || ++ (id >= dev->node_props.simd_id_base && ++ id < dev->node_props.simd_id_base + ++ total_num_of_cu)) { + props = kfd_alloc_struct(props); + if (!props) + return -ENOMEM; +@@ -146,6 +278,8 @@ static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, + props->cachelines_per_tag = cache->lines_per_tag; + props->cache_assoc = cache->associativity; + props->cache_latency = cache->cache_latency; ++ memcpy(props->sibling_map, cache->sibling_map, ++ sizeof(props->sibling_map)); + + if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) + props->cache_type |= HSA_CACHE_TYPE_DATA; +@@ -162,6 +296,7 @@ static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, + + break; + } ++ } + + return 0; + } +@@ -172,8 +307,8 @@ static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, + static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink, + struct list_head *device_list) + { +- struct kfd_iolink_properties *props; +- struct kfd_topology_device *dev; ++ struct kfd_iolink_properties *props = NULL, *props2; ++ struct kfd_topology_device *dev, *cpu_dev; + uint32_t id_from; + uint32_t id_to; + +@@ -192,11 +327,12 @@ static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink, + props->node_to = id_to; + props->ver_maj = iolink->version_major; + props->ver_min = iolink->version_minor; ++ props->iolink_type = iolink->io_interface_type; + +- /* +- * weight factor (derived from CDIR), currently always 1 +- */ +- props->weight = 1; ++ if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) ++ props->weight = 20; ++ else ++ props->weight = node_distance(id_from, id_to); + + props->min_latency = iolink->minimum_latency; + props->max_latency = iolink->maximum_latency; +@@ -208,11 +344,29 @@ static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink, + dev->io_link_count++; + dev->node_props.io_links_count++; + list_add_tail(&props->list, &dev->io_link_props); +- + break; + } + } + ++ /* CPU topology is created before GPUs are detected, so CPU->GPU ++ * links are not built at that time. If a PCIe type is discovered, it ++ * means a GPU is detected and we are adding GPU->CPU to the topology. ++ * At this time, also add the corresponded CPU->GPU link. ++ */ ++ if (props && props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) { ++ cpu_dev = kfd_topology_device_by_proximity_domain(id_to); ++ if (!cpu_dev) ++ return -ENODEV; ++ /* same everything but the other direction */ ++ props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL); ++ props2->node_from = id_to; ++ props2->node_to = id_from; ++ props2->kobj = NULL; ++ cpu_dev->io_link_count++; ++ cpu_dev->node_props.io_links_count++; ++ list_add_tail(&props2->list, &cpu_dev->io_link_props); ++ } ++ + return 0; + } + +@@ -338,6 +492,176 @@ int kfd_parse_crat_table(void *crat_image, struct list_head *device_list, + return ret; + } + ++/* Helper function. See kfd_fill_gpu_cache_info for parameter description */ ++static int fill_in_pcache(struct crat_subtype_cache *pcache, ++ struct kfd_gpu_cache_info *pcache_info, ++ struct kfd_cu_info *cu_info, ++ int mem_available, ++ int cu_bitmask, ++ int cache_type, unsigned int cu_processor_id, ++ int cu_block) ++{ ++ unsigned int cu_sibling_map_mask; ++ int first_active_cu; ++ ++ /* First check if enough memory is available */ ++ if (sizeof(struct crat_subtype_cache) > mem_available) ++ return -ENOMEM; ++ ++ cu_sibling_map_mask = cu_bitmask; ++ cu_sibling_map_mask >>= cu_block; ++ cu_sibling_map_mask &= ++ ((1 << pcache_info[cache_type].num_cu_shared) - 1); ++ first_active_cu = ffs(cu_sibling_map_mask); ++ ++ /* CU could be inactive. In case of shared cache find the first active ++ * CU. and incase of non-shared cache check if the CU is inactive. If ++ * inactive active skip it ++ */ ++ if (first_active_cu) { ++ memset(pcache, 0, sizeof(struct crat_subtype_cache)); ++ pcache->type = CRAT_SUBTYPE_CACHE_AFFINITY; ++ pcache->length = sizeof(struct crat_subtype_cache); ++ pcache->flags = pcache_info[cache_type].flags; ++ pcache->processor_id_low = cu_processor_id ++ + (first_active_cu - 1); ++ pcache->cache_level = pcache_info[cache_type].cache_level; ++ pcache->cache_size = pcache_info[cache_type].cache_size; ++ ++ /* Sibling map is w.r.t processor_id_low, so shift out ++ * inactive CU ++ */ ++ cu_sibling_map_mask = ++ cu_sibling_map_mask >> (first_active_cu - 1); ++ ++ pcache->sibling_map[0] = (uint8_t)(cu_sibling_map_mask & 0xFF); ++ pcache->sibling_map[1] = ++ (uint8_t)((cu_sibling_map_mask >> 8) & 0xFF); ++ pcache->sibling_map[2] = ++ (uint8_t)((cu_sibling_map_mask >> 16) & 0xFF); ++ pcache->sibling_map[3] = ++ (uint8_t)((cu_sibling_map_mask >> 24) & 0xFF); ++ return 0; ++ } ++ return 1; ++} ++ ++/* kfd_fill_gpu_cache_info - Fill GPU cache info using kfd_gpu_cache_info ++ * tables ++ * ++ * @kdev - [IN] GPU device ++ * @gpu_processor_id - [IN] GPU processor ID to which these caches ++ * associate ++ * @available_size - [IN] Amount of memory available in pcache ++ * @cu_info - [IN] Compute Unit info obtained from KGD ++ * @pcache - [OUT] memory into which cache data is to be filled in. ++ * @size_filled - [OUT] amount of data used up in pcache. ++ * @num_of_entries - [OUT] number of caches added ++ */ ++static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, ++ int gpu_processor_id, ++ int available_size, ++ struct kfd_cu_info *cu_info, ++ struct crat_subtype_cache *pcache, ++ int *size_filled, ++ int *num_of_entries) ++{ ++ struct kfd_gpu_cache_info *pcache_info; ++ int num_of_cache_types = 0; ++ int i, j, k; ++ int ct = 0; ++ int mem_available = available_size; ++ unsigned int cu_processor_id; ++ int ret; ++ ++ switch (kdev->device_info->asic_family) { ++ case CHIP_KAVERI: ++ pcache_info = kaveri_cache_info; ++ num_of_cache_types = ARRAY_SIZE(kaveri_cache_info); ++ break; ++ case CHIP_HAWAII: ++ pcache_info = hawaii_cache_info; ++ num_of_cache_types = ARRAY_SIZE(hawaii_cache_info); ++ break; ++ case CHIP_CARRIZO: ++ pcache_info = carrizo_cache_info; ++ num_of_cache_types = ARRAY_SIZE(carrizo_cache_info); ++ break; ++ case CHIP_TONGA: ++ pcache_info = tonga_cache_info; ++ num_of_cache_types = ARRAY_SIZE(tonga_cache_info); ++ break; ++ case CHIP_FIJI: ++ pcache_info = fiji_cache_info; ++ num_of_cache_types = ARRAY_SIZE(fiji_cache_info); ++ break; ++ case CHIP_POLARIS10: ++ pcache_info = polaris10_cache_info; ++ num_of_cache_types = ARRAY_SIZE(polaris10_cache_info); ++ break; ++ case CHIP_POLARIS11: ++ pcache_info = polaris11_cache_info; ++ num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ *size_filled = 0; ++ *num_of_entries = 0; ++ ++ /* For each type of cache listed in the kfd_gpu_cache_info table, ++ * go through all available Compute Units. ++ * The [i,j,k] loop will ++ * if kfd_gpu_cache_info.num_cu_shared = 1 ++ * will parse through all available CU ++ * If (kfd_gpu_cache_info.num_cu_shared != 1) ++ * then it will consider only one CU from ++ * the shared unit ++ */ ++ ++ for (ct = 0; ct < num_of_cache_types; ct++) { ++ cu_processor_id = gpu_processor_id; ++ for (i = 0; i < cu_info->num_shader_engines; i++) { ++ for (j = 0; j < cu_info->num_shader_arrays_per_engine; ++ j++) { ++ for (k = 0; k < cu_info->num_cu_per_sh; ++ k += pcache_info[ct].num_cu_shared) { ++ ++ ret = fill_in_pcache(pcache, ++ pcache_info, ++ cu_info, ++ mem_available, ++ cu_info->cu_bitmap[i][j], ++ ct, ++ cu_processor_id, ++ k); ++ ++ if (ret < 0) ++ break; ++ ++ if (!ret) { ++ pcache++; ++ (*num_of_entries)++; ++ mem_available -= ++ sizeof(*pcache); ++ (*size_filled) += ++ sizeof(*pcache); ++ } ++ ++ /* Move to next CU block */ ++ cu_processor_id += ++ pcache_info[ct].num_cu_shared; ++ } ++ } ++ } ++ } ++ ++ pr_debug("Added [%d] GPU cache entries\n", *num_of_entries); ++ ++ return 0; ++} ++ + /* + * kfd_create_crat_image_acpi - Allocates memory for CRAT image and + * copies CRAT from ACPI (if available). +@@ -624,6 +948,239 @@ static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) + return 0; + } + ++static int kfd_fill_gpu_memory_affinity(int *avail_size, ++ struct kfd_dev *kdev, uint8_t type, uint64_t size, ++ struct crat_subtype_memory *sub_type_hdr, ++ uint32_t proximity_domain, ++ const struct kfd_local_mem_info *local_mem_info) ++{ ++ *avail_size -= sizeof(struct crat_subtype_memory); ++ if (*avail_size < 0) ++ return -ENOMEM; ++ ++ memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); ++ sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; ++ sub_type_hdr->length = sizeof(struct crat_subtype_memory); ++ sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; ++ ++ sub_type_hdr->proximity_domain = proximity_domain; ++ ++ pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n", ++ type, size); ++ ++ sub_type_hdr->length_low = lower_32_bits(size); ++ sub_type_hdr->length_high = upper_32_bits(size); ++ ++ sub_type_hdr->width = local_mem_info->vram_width; ++ sub_type_hdr->visibility_type = type; ++ ++ return 0; ++} ++ ++/* kfd_fill_gpu_direct_io_link - Fill in direct io link from GPU ++ * to its NUMA node ++ * @avail_size: Available size in the memory ++ * @kdev - [IN] GPU device ++ * @sub_type_hdr: Memory into which io link info will be filled in ++ * @proximity_domain - proximity domain of the GPU node ++ * ++ * Return 0 if successful else return -ve value ++ */ ++static int kfd_fill_gpu_direct_io_link(int *avail_size, ++ struct kfd_dev *kdev, ++ struct crat_subtype_iolink *sub_type_hdr, ++ uint32_t proximity_domain) ++{ ++ *avail_size -= sizeof(struct crat_subtype_iolink); ++ if (*avail_size < 0) ++ return -ENOMEM; ++ ++ memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink)); ++ ++ /* Fill in subtype header data */ ++ sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY; ++ sub_type_hdr->length = sizeof(struct crat_subtype_iolink); ++ sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED; ++ ++ /* Fill in IOLINK subtype. ++ * TODO: Fill-in other fields of iolink subtype ++ */ ++ sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS; ++ sub_type_hdr->proximity_domain_from = proximity_domain; ++#ifdef CONFIG_NUMA ++ if (kdev->pdev->dev.numa_node == NUMA_NO_NODE) ++ sub_type_hdr->proximity_domain_to = 0; ++ else ++ sub_type_hdr->proximity_domain_to = kdev->pdev->dev.numa_node; ++#else ++ sub_type_hdr->proximity_domain_to = 0; ++#endif ++ return 0; ++} ++ ++/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for CPU ++ * ++ * @pcrat_image: Fill in VCRAT for GPU ++ * @size: [IN] allocated size of crat_image. ++ * [OUT] actual size of data filled in crat_image ++ */ ++static int kfd_create_vcrat_image_gpu(void *pcrat_image, ++ size_t *size, struct kfd_dev *kdev, ++ uint32_t proximity_domain) ++{ ++ struct crat_header *crat_table = (struct crat_header *)pcrat_image; ++ struct crat_subtype_generic *sub_type_hdr; ++ struct crat_subtype_computeunit *cu; ++ struct kfd_cu_info cu_info; ++ struct amd_iommu_device_info iommu_info; ++ int avail_size = *size; ++ uint32_t total_num_of_cu; ++ int num_of_cache_entries = 0; ++ int cache_mem_filled = 0; ++ int ret = 0; ++ const u32 required_iommu_flags = AMD_IOMMU_DEVICE_FLAG_ATS_SUP | ++ AMD_IOMMU_DEVICE_FLAG_PRI_SUP | ++ AMD_IOMMU_DEVICE_FLAG_PASID_SUP; ++ struct kfd_local_mem_info local_mem_info; ++ ++ if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU) ++ return -EINVAL; ++ ++ /* Fill the CRAT Header. ++ * Modify length and total_entries as subunits are added. ++ */ ++ avail_size -= sizeof(struct crat_header); ++ if (avail_size < 0) ++ return -ENOMEM; ++ ++ memset(crat_table, 0, sizeof(struct crat_header)); ++ ++ memcpy(&crat_table->signature, CRAT_SIGNATURE, ++ sizeof(crat_table->signature)); ++ /* Change length as we add more subtypes*/ ++ crat_table->length = sizeof(struct crat_header); ++ crat_table->num_domains = 1; ++ crat_table->total_entries = 0; ++ ++ /* Fill in Subtype: Compute Unit ++ * First fill in the sub type header and then sub type data ++ */ ++ avail_size -= sizeof(struct crat_subtype_computeunit); ++ if (avail_size < 0) ++ return -ENOMEM; ++ ++ sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1); ++ memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); ++ ++ sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; ++ sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); ++ sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; ++ ++ /* Fill CU subtype data */ ++ cu = (struct crat_subtype_computeunit *)sub_type_hdr; ++ cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT; ++ cu->proximity_domain = proximity_domain; ++ ++ kdev->kfd2kgd->get_cu_info(kdev->kgd, &cu_info); ++ cu->num_simd_per_cu = cu_info.simd_per_cu; ++ cu->num_simd_cores = cu_info.simd_per_cu * cu_info.cu_active_number; ++ cu->max_waves_simd = cu_info.max_waves_per_simd; ++ ++ cu->wave_front_size = cu_info.wave_front_size; ++ cu->array_count = cu_info.num_shader_arrays_per_engine * ++ cu_info.num_shader_engines; ++ total_num_of_cu = (cu->array_count * cu_info.num_cu_per_sh); ++ cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu); ++ cu->num_cu_per_array = cu_info.num_cu_per_sh; ++ cu->max_slots_scatch_cu = cu_info.max_scratch_slots_per_cu; ++ cu->num_banks = cu_info.num_shader_engines; ++ cu->lds_size_in_kb = cu_info.lds_size; ++ ++ cu->hsa_capability = 0; ++ ++ /* Check if this node supports IOMMU. During parsing this flag will ++ * translate to HSA_CAP_ATS_PRESENT ++ */ ++ iommu_info.flags = 0; ++ if (amd_iommu_device_info(kdev->pdev, &iommu_info) == 0) { ++ if ((iommu_info.flags & required_iommu_flags) == ++ required_iommu_flags) ++ cu->hsa_capability |= CRAT_CU_FLAGS_IOMMU_PRESENT; ++ } ++ ++ crat_table->length += sub_type_hdr->length; ++ crat_table->total_entries++; ++ ++ /* Fill in Subtype: Memory. Only on systems with large BAR (no ++ * private FB), report memory as public. On other systems ++ * report the total FB size (public+private) as a single ++ * private heap. ++ */ ++ kdev->kfd2kgd->get_local_mem_info(kdev->kgd, &local_mem_info); ++ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + ++ sub_type_hdr->length); ++ ++ if (local_mem_info.local_mem_size_private == 0) ++ ret = kfd_fill_gpu_memory_affinity(&avail_size, ++ kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC, ++ local_mem_info.local_mem_size_public, ++ (struct crat_subtype_memory *)sub_type_hdr, ++ proximity_domain, ++ &local_mem_info); ++ else ++ ret = kfd_fill_gpu_memory_affinity(&avail_size, ++ kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE, ++ local_mem_info.local_mem_size_public + ++ local_mem_info.local_mem_size_private, ++ (struct crat_subtype_memory *)sub_type_hdr, ++ proximity_domain, ++ &local_mem_info); ++ if (ret < 0) ++ return ret; ++ ++ crat_table->length += sizeof(struct crat_subtype_memory); ++ crat_table->total_entries++; ++ ++ /* TODO: Fill in cache information. This information is NOT readily ++ * available in KGD ++ */ ++ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + ++ sub_type_hdr->length); ++ ret = kfd_fill_gpu_cache_info(kdev, cu->processor_id_low, ++ avail_size, ++ &cu_info, ++ (struct crat_subtype_cache *)sub_type_hdr, ++ &cache_mem_filled, ++ &num_of_cache_entries); ++ ++ if (ret < 0) ++ return ret; ++ ++ crat_table->length += cache_mem_filled; ++ crat_table->total_entries += num_of_cache_entries; ++ avail_size -= cache_mem_filled; ++ ++ /* Fill in Subtype: IO_LINKS ++ * Only direct links are added here which is Link from GPU to ++ * to its NUMA node. Indirect links are added by userspace. ++ */ ++ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + ++ cache_mem_filled); ++ ret = kfd_fill_gpu_direct_io_link(&avail_size, kdev, ++ (struct crat_subtype_iolink *)sub_type_hdr, proximity_domain); ++ ++ if (ret < 0) ++ return ret; ++ ++ crat_table->length += sub_type_hdr->length; ++ crat_table->total_entries++; ++ ++ *size = crat_table->length; ++ pr_info("Virtual CRAT table created for GPU\n"); ++ ++ return ret; ++} ++ + /* kfd_create_crat_image_virtual - Allocates memory for CRAT image and + * creates a Virtual CRAT (VCRAT) image + * +@@ -667,9 +1224,14 @@ int kfd_create_crat_image_virtual(void **crat_image, size_t *size, + ret = kfd_create_vcrat_image_cpu(pcrat_image, size); + break; + case COMPUTE_UNIT_GPU: +- /* TODO: */ +- ret = -EINVAL; +- pr_err("VCRAT not implemented for dGPU\n"); ++ if (!kdev) ++ return -EINVAL; ++ pcrat_image = kmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL); ++ if (!pcrat_image) ++ return -ENOMEM; ++ *size = VCRAT_SIZE_FOR_GPU; ++ ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev, ++ proximity_domain); + break; + case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU): + /* TODO: */ +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +index 1711ab6..b5cd182 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.h +@@ -109,7 +109,7 @@ struct crat_subtype_computeunit { + uint8_t wave_front_size; + uint8_t num_banks; + uint16_t micro_engine_id; +- uint8_t num_arrays; ++ uint8_t array_count; + uint8_t num_cu_per_array; + uint8_t num_simd_per_cu; + uint8_t max_slots_scatch_cu; +@@ -137,7 +137,8 @@ struct crat_subtype_memory { + uint32_t length_low; + uint32_t length_high; + uint32_t width; +- uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH]; ++ uint8_t visibility_type; /* for virtual (dGPU) CRAT */ ++ uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH - 1]; + }; + + /* +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +index aeee9d4..f0327c2 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +@@ -668,6 +668,8 @@ int kfd_topology_init(void); + void kfd_topology_shutdown(void); + int kfd_topology_add_device(struct kfd_dev *gpu); + int kfd_topology_remove_device(struct kfd_dev *gpu); ++struct kfd_topology_device *kfd_topology_device_by_proximity_domain( ++ uint32_t proximity_domain); + struct kfd_dev *kfd_device_by_id(uint32_t gpu_id); + struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev); + int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev); +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +index 9aa6004..7fe7ee0 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +@@ -43,6 +43,25 @@ static struct kfd_system_properties sys_props; + static DECLARE_RWSEM(topology_lock); + static atomic_t topology_crat_proximity_domain; + ++struct kfd_topology_device *kfd_topology_device_by_proximity_domain( ++ uint32_t proximity_domain) ++{ ++ struct kfd_topology_device *top_dev; ++ struct kfd_topology_device *device = NULL; ++ ++ down_read(&topology_lock); ++ ++ list_for_each_entry(top_dev, &topology_device_list, list) ++ if (top_dev->proximity_domain == proximity_domain) { ++ device = top_dev; ++ break; ++ } ++ ++ up_read(&topology_lock); ++ ++ return device; ++} ++ + struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) + { + struct kfd_topology_device *top_dev; +@@ -79,6 +98,7 @@ struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) + return device; + } + ++/* Called with write topology_lock acquired */ + static void kfd_release_topology_device(struct kfd_topology_device *dev) + { + struct kfd_mem_properties *mem; +@@ -394,8 +414,7 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, + } + + sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", +- dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz( +- dev->gpu->kgd)); ++ dev->node_props.max_engine_clk_fcompute); + + sysfs_show_64bit_prop(buffer, "local_mem_size", + (unsigned long long int) 0); +@@ -597,6 +616,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, + return 0; + } + ++/* Called with write topology lock acquired */ + static int kfd_build_sysfs_node_tree(void) + { + struct kfd_topology_device *dev; +@@ -613,6 +633,7 @@ static int kfd_build_sysfs_node_tree(void) + return 0; + } + ++/* Called with write topology lock acquired */ + static void kfd_remove_sysfs_node_tree(void) + { + struct kfd_topology_device *dev; +@@ -908,19 +929,26 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) + + return hashout; + } +- ++/* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If ++ * the GPU device is not already present in the topology device ++ * list then return NULL. This means a new topology device has to ++ * be created for this GPU. ++ * TODO: Rather than assiging @gpu to first topology device withtout ++ * gpu attached, it will better to have more stringent check. ++ */ + static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) + { + struct kfd_topology_device *dev; + struct kfd_topology_device *out_dev = NULL; + ++ down_write(&topology_lock); + list_for_each_entry(dev, &topology_device_list, list) + if (!dev->gpu && (dev->node_props.simd_count > 0)) { + dev->gpu = gpu; + out_dev = dev; + break; + } +- ++ up_write(&topology_lock); + return out_dev; + } + +@@ -932,6 +960,45 @@ static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival) + */ + } + ++/* kfd_fill_mem_clk_max_info - Since CRAT doesn't have memory clock info, ++ * patch this after CRAT parsing. ++ */ ++static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev) ++{ ++ struct kfd_mem_properties *mem; ++ struct kfd_local_mem_info local_mem_info; ++ ++ if (!dev) ++ return; ++ ++ /* Currently, amdgpu driver (amdgpu_mc) deals only with GPUs with ++ * single bank of VRAM local memory. ++ * for dGPUs - VCRAT reports only one bank of Local Memory ++ * for APUs - If CRAT from ACPI reports more than one bank, then ++ * all the banks will report the same mem_clk_max information ++ */ ++ dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd, ++ &local_mem_info); ++ ++ list_for_each_entry(mem, &dev->mem_props, list) ++ mem->mem_clk_max = local_mem_info.mem_clk_max; ++} ++ ++static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev) ++{ ++ struct kfd_iolink_properties *link; ++ ++ if (!dev || !dev->gpu) ++ return; ++ ++ /* GPU only creates direck links so apply flags setting to all */ ++ if (dev->gpu->device_info->asic_family == CHIP_HAWAII) ++ list_for_each_entry(link, &dev->io_link_props, list) ++ link->flags = CRAT_IOLINK_FLAGS_ENABLED | ++ CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT | ++ CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT; ++} ++ + int kfd_topology_add_device(struct kfd_dev *gpu) + { + uint32_t gpu_id; +@@ -939,6 +1006,9 @@ int kfd_topology_add_device(struct kfd_dev *gpu) + struct kfd_cu_info cu_info; + int res = 0; + struct list_head temp_topology_device_list; ++ void *crat_image = NULL; ++ size_t image_size = 0; ++ int proximity_domain; + + INIT_LIST_HEAD(&temp_topology_device_list); + +@@ -946,27 +1016,33 @@ int kfd_topology_add_device(struct kfd_dev *gpu) + + pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id); + +- /* +- * Try to assign the GPU to existing topology device (generated from +- * CRAT table ++ proximity_domain = atomic_inc_return(&topology_crat_proximity_domain); ++ ++ /* Check to see if this gpu device exists in the topology_device_list. ++ * If so, assign the gpu to that device, ++ * else create a Virtual CRAT for this gpu device and then parse that ++ * CRAT to create a new topology device. Once created assign the gpu to ++ * that topology device + */ + dev = kfd_assign_gpu(gpu); + if (!dev) { +- pr_info("GPU was not found in the current topology. Extending.\n"); +- kfd_debug_print_topology(); +- dev = kfd_create_topology_device(&temp_topology_device_list); +- if (!dev) { +- res = -ENOMEM; ++ res = kfd_create_crat_image_virtual(&crat_image, &image_size, ++ COMPUTE_UNIT_GPU, gpu, ++ proximity_domain); ++ if (res) { ++ pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n", ++ gpu_id); ++ return res; ++ } ++ res = kfd_parse_crat_table(crat_image, ++ &temp_topology_device_list, ++ proximity_domain); ++ if (res) { ++ pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n", ++ gpu_id); + goto err; + } + +- dev->gpu = gpu; +- +- /* +- * TODO: Make a call to retrieve topology information from the +- * GPU vBIOS +- */ +- + down_write(&topology_lock); + kfd_topology_update_device_list(&temp_topology_device_list, + &topology_device_list); +@@ -974,34 +1050,86 @@ int kfd_topology_add_device(struct kfd_dev *gpu) + /* Update the SYSFS tree, since we added another topology + * device + */ +- if (kfd_topology_update_sysfs() < 0) +- kfd_topology_release_sysfs(); +- ++ res = kfd_topology_update_sysfs(); + up_write(&topology_lock); + ++ if (!res) ++ sys_props.generation_count++; ++ else ++ pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n", ++ gpu_id, res); ++ dev = kfd_assign_gpu(gpu); ++ if (WARN_ON(!dev)) { ++ res = -ENODEV; ++ goto err; ++ } + } + + dev->gpu_id = gpu_id; + gpu->id = gpu_id; ++ ++ /* TODO: Move the following lines to function ++ * kfd_add_non_crat_information ++ */ ++ ++ /* Fill-in additional information that is not available in CRAT but ++ * needed for the topology ++ */ ++ + dev->gpu->kfd2kgd->get_cu_info(dev->gpu->kgd, &cu_info); +- dev->node_props.simd_count = dev->node_props.simd_per_cu * +- cu_info.cu_active_number; ++ dev->node_props.simd_arrays_per_engine = ++ cu_info.num_shader_arrays_per_engine; ++ + dev->node_props.vendor_id = gpu->pdev->vendor; + dev->node_props.device_id = gpu->pdev->device; + dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number, + gpu->pdev->devfn); +- /* +- * TODO: Retrieve max engine clock values from KGD +- */ +- +- if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) { +- dev->node_props.capability |= HSA_CAP_DOORBELL_PACKET_TYPE; ++ dev->node_props.max_engine_clk_fcompute = ++ dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd); ++ dev->node_props.max_engine_clk_ccompute = ++ cpufreq_quick_get_max(0) / 1000; ++ ++ kfd_fill_mem_clk_max_info(dev); ++ kfd_fill_iolink_non_crat_info(dev); ++ ++ switch (dev->gpu->device_info->asic_family) { ++ case CHIP_KAVERI: ++ case CHIP_HAWAII: ++ case CHIP_TONGA: ++ dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_PRE_1_0 << ++ HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & ++ HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); ++ break; ++ case CHIP_CARRIZO: ++ case CHIP_FIJI: ++ case CHIP_POLARIS10: ++ case CHIP_POLARIS11: + pr_debug("Adding doorbell packet type capability\n"); ++ dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 << ++ HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & ++ HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); ++ break; ++ default: ++ WARN(1, "Unexpected ASIC family %u", ++ dev->gpu->device_info->asic_family); + } + ++ /* Fix errors in CZ CRAT. ++ * simd_count: Carrizo CRAT reports wrong simd_count, probably ++ * because it doesn't consider masked out CUs ++ * capability flag: Carrizo CRAT doesn't report IOMMU ++ * flags. TODO: Fix this. ++ */ ++ if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) ++ dev->node_props.simd_count = ++ cu_info.simd_per_cu * cu_info.cu_active_number; ++ ++ kfd_debug_print_topology(); ++ + if (!res) + kfd_notify_gpu_change(gpu_id, 1); + err: ++ kfd_destroy_crat_image(crat_image); + return res; + } + +diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +index 8668189..55de56f 100644 +--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.h ++++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.h +@@ -39,8 +39,12 @@ + #define HSA_CAP_WATCH_POINTS_SUPPORTED 0x00000080 + #define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 + #define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 +-#define HSA_CAP_RESERVED 0xfffff000 +-#define HSA_CAP_DOORBELL_PACKET_TYPE 0x00001000 ++#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK 0x00003000 ++#define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT 12 ++#define HSA_CAP_RESERVED 0xffffc000 ++ ++#define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0 ++#define HSA_CAP_DOORBELL_TYPE_1_0 0x1 + + struct kfd_node_properties { + uint32_t cpu_cores_count; +-- +2.7.4 + |