diff options
author | Yuri Nudelman <ynudelman@habana.ai> | 2021-06-06 10:28:51 +0300 |
---|---|---|
committer | Oded Gabbay <ogabbay@kernel.org> | 2021-08-29 09:47:46 +0300 |
commit | 938b793fdede518e10c67dc1dcfba1804faf98a6 (patch) | |
tree | 640575a63c1b0c204cdbacab5f2d0bb1c4cce3d7 /drivers/misc/habanalabs/gaudi/gaudi.c | |
parent | e79e745b208b3c709cc5e689e587d04a4c0fe1bb (diff) | |
download | linux-938b793fdede518e10c67dc1dcfba1804faf98a6.tar.gz |
habanalabs: expose state dump
To improve the user's ability to debug the case where a workload that
is part of executing training/inference of a topology is getting stuck,
we need to add a 'core dump' each time a CS times-out. The 'core dump'
shall contain all relevant Sync Manager information and corresponding
fence values.
The most recent dumps shall be accessible via debugfs, under
'state_dump' node. Reading from the node will provide the oldest dump
available. Writing an integer value X will discard X dumps, starting
with the oldest one, i.e. subsequent read will now return newer
dumps.
Signed-off-by: Yuri Nudelman <ynudelman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Diffstat (limited to 'drivers/misc/habanalabs/gaudi/gaudi.c')
-rw-r--r-- | drivers/misc/habanalabs/gaudi/gaudi.c | 24 |
1 files changed, 23 insertions, 1 deletions
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index aa8a0ca5aca2..7f90f637d7f4 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -348,6 +348,8 @@ static enum hl_queue_type gaudi_queue_type[GAUDI_QUEUE_ID_SIZE] = { QUEUE_TYPE_INT, /* GAUDI_QUEUE_ID_NIC_9_3 */ }; +static s64 gaudi_state_dump_specs_props[SP_MAX] = {0}; + struct ecc_info_extract_params { u64 block_address; u32 num_memories; @@ -8977,6 +8979,25 @@ static int gaudi_map_pll_idx_to_fw_idx(u32 pll_idx) } } +static int gaudi_gen_sync_to_engine_map(struct hl_device *hdev, + struct hl_sync_to_engine_map *map) +{ + /* Not implemented */ + return 0; +} + + +static struct hl_state_dump_specs_funcs gaudi_state_dump_funcs = { + .gen_sync_to_engine_map = gaudi_gen_sync_to_engine_map, +}; + +static void gaudi_state_dump_init(struct hl_device *hdev) +{ + /* Not implemented */ + hdev->state_dump_specs.props = gaudi_state_dump_specs_props; + hdev->state_dump_specs.funcs = gaudi_state_dump_funcs; +} + static const struct hl_asic_funcs gaudi_funcs = { .early_init = gaudi_early_init, .early_fini = gaudi_early_fini, @@ -9062,7 +9083,8 @@ static const struct hl_asic_funcs gaudi_funcs = { .enable_events_from_fw = gaudi_enable_events_from_fw, .map_pll_idx_to_fw_idx = gaudi_map_pll_idx_to_fw_idx, .init_firmware_loader = gaudi_init_firmware_loader, - .init_cpu_scrambler_dram = gaudi_init_scrambler_hbm + .init_cpu_scrambler_dram = gaudi_init_scrambler_hbm, + .state_dump_init = gaudi_state_dump_init }; /** |