Skills/mY Technutz

kernel dump 로 생성된 vmcore 에서 NIC device name 확인하기

mirr 2021. 4. 29. 23:25

가끔 덤프 분석을 하다 보면 어이없게도, 물리적 문제로 덤프가 생성된 경우 그 문제가 발생한

물리적 장치를 직접 찾아서 지목해 달라고 요구받는 경우가 있다.

이번 시간에는 vmcore 를 통해 문제가 있는 PCI device 의 위치와 이름을 확인하는 방법을 공유해 본다.

crash7latest> sys
      KERNEL: /share/vmlinux/64/5.4.17-2036.100.6.1.el7uek.x86_64/vmlinux
    DUMPFILE: vmcore  [PARTIAL DUMP]
        CPUS: 96
        DATE: Wed Mar 10 16:38:47 2021
      UPTIME: 22:58:32
LOAD AVERAGE: 1.02, 1.01, 0.96
       TASKS: 2285
    NODENAME: **********02
     RELEASE: 5.4.17-2036.100.6.1.el7uek.x86_64
     VERSION: #2 SMP Thu Oct 29 17:04:48 PDT 2020
     MACHINE: x86_64  (3300 Mhz)
      MEMORY: 754.7 GB
       PANIC: "Oops: 0000 [#1] SMP NOPTI" (check log for details)
crash7latest> bt
PID: 0      TASK: ffff8910419ddd00  CPU: 43  COMMAND: "swapper/43"
 #0 [ffffc900195c4a30] machine_kexec at ffffffff81075371
 #1 [ffffc900195c4a90] __crash_kexec at ffffffff8115dd72
 #2 [ffffc900195c4b60] crash_kexec at ffffffff8115f0cc
 #3 [ffffc900195c4b80] oops_end at ffffffff81036a7b
 #4 [ffffc900195c4ba8] no_context at ffffffff81085817
 #5 [ffffc900195c4c18] __bad_area_nosemaphore at ffffffff81085bf0
 #6 [ffffc900195c4c70] bad_area_nosemaphore at ffffffff81085d86
 #7 [ffffc900195c4c80] __do_page_fault at ffffffff81086b59
 #8 [ffffc900195c4ce8] do_page_fault at ffffffff81086d66
 #9 [ffffc900195c4d20] page_fault at ffffffff81a02a9d
    [exception RIP: bnxt_tx_int+234]
    RIP: ffffffffc01b800a  RSP: ffffc900195c4dd0  RFLAGS: 00010246
    RAX: ffff893e7f93d000  RBX: ffff8933aa234980  RCX: 0000000080200001
    RDX: 000000008020001a  RSI: 0000000080200019  RDI: ffff88830210b3c0
    RBP: ffffc900195c4e20   R8: 0000000000000000   R9: 0000000000000001
    R10: 0000000000000001  R11: 0000000000000000  R12: ffff88cc3ffdd7a8
    R13: 0000000000000000  R14: 00000000000000c9  R15: ffffc90064f7a900
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
#10 [ffffc900195c4e28] __bnxt_poll_work_done at ffffffffc01b7981 [bnxt_en]
#11 [ffffc900195c4e48] bnxt_poll at ffffffffc01bc61a [bnxt_en]
#12 [ffffc900195c4ec8] net_rx_action at ffffffff8180c999
#13 [ffffc900195c4f48] __softirqentry_text_start at ffffffff81c000e1
#14 [ffffc900195c4fb0] irq_exit at ffffffff810ac5eb
#15 [ffffc900195c4fc8] do_IRQ at ffffffff81a03e9a
--- <IRQ stack> ---
#16 [ffffc90018b4fd88] ret_from_intr at ffffffff81a00c8f
    [exception RIP: cpuidle_enter_state+191]
    RIP: ffffffff8177b6cf  RSP: ffffc90018b4fe38  RFLAGS: 00000246
    RAX: ffff893e9e1ead40  RBX: 000000000000002b  RCX: 000000000000001f
    RDX: 0000000000000000  RSI: 0000000000000002  RDI: 0000000000000000
    RBP: ffffc90018b4fe78   R8: 0000000000000002   R9: ffffffe649b40387
    R10: 0000000000000018  R11: 071c71c71c71c71c  R12: ffffe8ffff9c8000
    R13: ffffffff82540440  R14: 0000000000000002  R15: ffffffff82540518
    ORIG_RAX: ffffffffffffffd0  CS: 0010  SS: 0018
#17 [ffffc90018b4fe80] cpuidle_enter at ffffffff8177ba6e
#18 [ffffc90018b4fea8] call_cpuidle at ffffffff810e3f43
#19 [ffffc90018b4feb8] do_idle at ffffffff810e41fb
#20 [ffffc90018b4ff10] cpu_startup_entry at ffffffff810e442d
#21 [ffffc90018b4ff28] start_secondary at ffffffff81069089
#22 [ffffc90018b4ff50] secondary_startup_64 at ffffffff810000e6

Exception RIP 의 bnxt_tx_int 함수와 backtrace 에 표시된 [bnxt_en] 을 통해, 문제가 발생한 장치는 bnxt_en 모듈을 이용하고 있음을 알 수 있다.

RBX 레지스터의 값(ffff8933aa234980)을 주소로 하여 bnxt 구조체의 내용을 살펴보자 :

crash7latest> struct bnxt ffff8933aa234980 | more
struct bnxt {
  bar0 = 0xffffc9001aab0000,
  bar1 = 0xffffc9001ea00000,
  bar2 = 0xffffc9001a896000,
  reg_base = 0,
  chip_num = 5847,
  chip_rev = 1 '\001',
  dev = 0xffff8933aa234000,
  pdev = 0xffff893e7f93d000,
  intr_sem = {
    counter = 0
  },
  flags = 337768440,
  edev = 0xffff88d01c1b2380,
  ulp_probe = 0xffffffffc01d3660 <bnxt_ulp_probe>,
  bnapi = 0xffff88caedbe8000,
  rx_ring = 0xffff88ceb9464000,
  tx_ring = 0xffff88cc3ffdd000,
  tx_ring_map = 0xffff88dd7f70f9e0,
  gro_func = 0xffffffffc01b40d0 <bnxt_gro_func_5731x>,
  rx_skb_func = 0xffffffffc01ba610 <bnxt_rx_skb>,

pdev 멤버는 PCI device 정보를 담고 있는 pci_dev 구조체의 주소를 갖고 있다.

상세한 설명은 생략한다. 해당 구조체에 대해서는 직접 찾아보기 바란다.

pci_dev 구조체의 내용을 확인하자.

crash7latest> struct pci_dev 0xffff893e7f93d000 | more
struct pci_dev {
  bus_list = {
    next = 0xffff88e041b83028,
    prev = 0xffff893e7f93b000
  },
  bus = 0xffff88e041b83000,
  subordinate = 0x0,
  sysdata = 0xffff88e041240ab8,
  procent = 0xffff890bfea54780,
  slot = 0xffff88e041b89500,
..
 
  dev = {
    kobj = {
      name = 0xffff88dd7fddbe10 "0003:41:00.1",
      entry = {
        next = 0xffff88dd7f9a9008,
        prev = 0xffff88dd7f9af808
      },

dev 멤버 안에 있는 kobj 의 name 에서 PCI 버스 위치("0003:41:00.1")를 확인할 수 있다.

사실 버스 위치를 확인하려고 한 게 아니라, OS 에서 인식된 device name 을 찾고자 한 것이므로,

net_device 구조체를 확인해야 한다.

앞서 살펴본 bnxt 구조체의 멤버 중 dev 의 값(0xffff8933aa234000)이 net_device 구조체의 주소이므로, 이를 통해 확인할 수 있다.

crash7latest>  struct net_device 0xffff8933aa234000 | more
struct net_device {
  name = "enP3s2095f1\000\000\000\000",
  name_hlist = {
    next = 0x0,
    pprev = 0xffff888100faf320
  },
  ifalias = 0x0,
  mem_end = 0,
  mem_start = 0,
  base_addr = 0,
  irq = 0,
  state = 3,

"enP3s2095f1" 이라는 device name 을 확인할 수 있다.

2020.04.06 - [Skills/mY Technutz] - eBPF vmcore Analysis