aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/misc/habanalabs/gaudi/gaudi.c
diff options
context:
space:
mode:
authorYuri Nudelman <ynudelman@habana.ai>2021-06-06 10:28:51 +0300
committerOded Gabbay <ogabbay@kernel.org>2021-08-29 09:47:46 +0300
commit938b793fdede518e10c67dc1dcfba1804faf98a6 (patch)
tree640575a63c1b0c204cdbacab5f2d0bb1c4cce3d7 /drivers/misc/habanalabs/gaudi/gaudi.c
parente79e745b208b3c709cc5e689e587d04a4c0fe1bb (diff)
downloadlinux-938b793fdede518e10c67dc1dcfba1804faf98a6.tar.gz
habanalabs: expose state dump
To improve the user's ability to debug the case where a workload that is part of executing training/inference of a topology is getting stuck, we need to add a 'core dump' each time a CS times-out. The 'core dump' shall contain all relevant Sync Manager information and corresponding fence values. The most recent dumps shall be accessible via debugfs, under 'state_dump' node. Reading from the node will provide the oldest dump available. Writing an integer value X will discard X dumps, starting with the oldest one, i.e. subsequent read will now return newer dumps. Signed-off-by: Yuri Nudelman <ynudelman@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Diffstat (limited to 'drivers/misc/habanalabs/gaudi/gaudi.c')
-rw-r--r--drivers/misc/habanalabs/gaudi/gaudi.c24
1 files changed, 23 insertions, 1 deletions
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index aa8a0ca5aca2..7f90f637d7f4 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -348,6 +348,8 @@ static enum hl_queue_type gaudi_queue_type[GAUDI_QUEUE_ID_SIZE] = {
QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_9_3 */
};
+static s64 gaudi_state_dump_specs_props[SP_MAX] = {0};
+
struct ecc_info_extract_params {
u64 block_address;
u32 num_memories;
@@ -8977,6 +8979,25 @@ static int gaudi_map_pll_idx_to_fw_idx(u32 pll_idx)
}
}
+static int gaudi_gen_sync_to_engine_map(struct hl_device *hdev,
+ struct hl_sync_to_engine_map *map)
+{
+ /* Not implemented */
+ return 0;
+}
+
+
+static struct hl_state_dump_specs_funcs gaudi_state_dump_funcs = {
+ .gen_sync_to_engine_map = gaudi_gen_sync_to_engine_map,
+};
+
+static void gaudi_state_dump_init(struct hl_device *hdev)
+{
+ /* Not implemented */
+ hdev->state_dump_specs.props = gaudi_state_dump_specs_props;
+ hdev->state_dump_specs.funcs = gaudi_state_dump_funcs;
+}
+
static const struct hl_asic_funcs gaudi_funcs = {
.early_init = gaudi_early_init,
.early_fini = gaudi_early_fini,
@@ -9062,7 +9083,8 @@ static const struct hl_asic_funcs gaudi_funcs = {
.enable_events_from_fw = gaudi_enable_events_from_fw,
.map_pll_idx_to_fw_idx = gaudi_map_pll_idx_to_fw_idx,
.init_firmware_loader = gaudi_init_firmware_loader,
- .init_cpu_scrambler_dram = gaudi_init_scrambler_hbm
+ .init_cpu_scrambler_dram = gaudi_init_scrambler_hbm,
+ .state_dump_init = gaudi_state_dump_init
};
/**