본문 바로가기

Skills/mY Technutz

kernel dump 로 생성된 vmcore 에서 NIC device name 확인하기

가끔 덤프 분석을 하다 보면, 어이없게도 하드웨어(물리적) 문제로 덤프가 생성된 경우에 문제가 발생한 물리적 장치를 직접 찾아서 지목해 달라고 요구받는 경우가 있다.

이번 시간에는 vmcore 를 통해 문제가 있는 PCI device 의 위치와 이름을 확인하는 방법을 공유해 본다.

crash7latest> sys
      KERNEL: /share/vmlinux/64/5.4.17-2036.100.6.1.el7uek.x86_64/vmlinux
    DUMPFILE: vmcore  [PARTIAL DUMP]
        CPUS: 96
        DATE: Wed Mar 10 16:38:47 2021
      UPTIME: 22:58:32
LOAD AVERAGE: 1.02, 1.01, 0.96
       TASKS: 2285
    NODENAME: **********02
     RELEASE: 5.4.17-2036.100.6.1.el7uek.x86_64
     VERSION: #2 SMP Thu Oct 29 17:04:48 PDT 2020
     MACHINE: x86_64  (3300 Mhz)
      MEMORY: 754.7 GB
       PANIC: "Oops: 0000 [#1] SMP NOPTI" (check log for details)
crash7latest> bt
PID: 0      TASK: ffff8910419ddd00  CPU: 43  COMMAND: "swapper/43"
 #0 [ffffc900195c4a30] machine_kexec at ffffffff81075371
 #1 [ffffc900195c4a90] __crash_kexec at ffffffff8115dd72
 #2 [ffffc900195c4b60] crash_kexec at ffffffff8115f0cc
 #3 [ffffc900195c4b80] oops_end at ffffffff81036a7b
 #4 [ffffc900195c4ba8] no_context at ffffffff81085817
 #5 [ffffc900195c4c18] __bad_area_nosemaphore at ffffffff81085bf0
 #6 [ffffc900195c4c70] bad_area_nosemaphore at ffffffff81085d86
 #7 [ffffc900195c4c80] __do_page_fault at ffffffff81086b59
 #8 [ffffc900195c4ce8] do_page_fault at ffffffff81086d66
 #9 [ffffc900195c4d20] page_fault at ffffffff81a02a9d
    [exception RIP: bnxt_tx_int+234]
    RIP: ffffffffc01b800a  RSP: ffffc900195c4dd0  RFLAGS: 00010246
    RAX: ffff893e7f93d000  RBX: ffff8933aa234980  RCX: 0000000080200001
    RDX: 000000008020001a  RSI: 0000000080200019  RDI: ffff88830210b3c0
    RBP: ffffc900195c4e20   R8: 0000000000000000   R9: 0000000000000001
    R10: 0000000000000001  R11: 0000000000000000  R12: ffff88cc3ffdd7a8
    R13: 0000000000000000  R14: 00000000000000c9  R15: ffffc90064f7a900
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
#10 [ffffc900195c4e28] __bnxt_poll_work_done at ffffffffc01b7981 [bnxt_en]
#11 [ffffc900195c4e48] bnxt_poll at ffffffffc01bc61a [bnxt_en]
#12 [ffffc900195c4ec8] net_rx_action at ffffffff8180c999
#13 [ffffc900195c4f48] __softirqentry_text_start at ffffffff81c000e1
#14 [ffffc900195c4fb0] irq_exit at ffffffff810ac5eb
#15 [ffffc900195c4fc8] do_IRQ at ffffffff81a03e9a
--- <IRQ stack> ---
#16 [ffffc90018b4fd88] ret_from_intr at ffffffff81a00c8f
    [exception RIP: cpuidle_enter_state+191]
    RIP: ffffffff8177b6cf  RSP: ffffc90018b4fe38  RFLAGS: 00000246
    RAX: ffff893e9e1ead40  RBX: 000000000000002b  RCX: 000000000000001f
    RDX: 0000000000000000  RSI: 0000000000000002  RDI: 0000000000000000
    RBP: ffffc90018b4fe78   R8: 0000000000000002   R9: ffffffe649b40387
    R10: 0000000000000018  R11: 071c71c71c71c71c  R12: ffffe8ffff9c8000
    R13: ffffffff82540440  R14: 0000000000000002  R15: ffffffff82540518
    ORIG_RAX: ffffffffffffffd0  CS: 0010  SS: 0018
#17 [ffffc90018b4fe80] cpuidle_enter at ffffffff8177ba6e
#18 [ffffc90018b4fea8] call_cpuidle at ffffffff810e3f43
#19 [ffffc90018b4feb8] do_idle at ffffffff810e41fb
#20 [ffffc90018b4ff10] cpu_startup_entry at ffffffff810e442d
#21 [ffffc90018b4ff28] start_secondary at ffffffff81069089
#22 [ffffc90018b4ff50] secondary_startup_64 at ffffffff810000e6

 Exception RIP 가 bnxt_tx_int 함수를 가리키고 있으므로, 문제가 발생한 NIC 는 bnxt_en 모듈(드라이버)을 사용하고 있음을 알 수 있다.

RBX 레지스터 값(ffff8933aa234980)을 struct bnxt 의 주소로 사용하여 bnxt 구조체 내용을 살펴보자 :

crash7latest> struct bnxt ffff8933aa234980 | more
struct bnxt {
  bar0 = 0xffffc9001aab0000,
  bar1 = 0xffffc9001ea00000,
  bar2 = 0xffffc9001a896000,
  reg_base = 0,
  chip_num = 5847,
  chip_rev = 1 '\001',
  dev = 0xffff8933aa234000,
  pdev = 0xffff893e7f93d000,
  intr_sem = {
    counter = 0
  },
  flags = 337768440,
  edev = 0xffff88d01c1b2380,
  ulp_probe = 0xffffffffc01d3660 <bnxt_ulp_probe>,
  bnapi = 0xffff88caedbe8000,
  rx_ring = 0xffff88ceb9464000,
  tx_ring = 0xffff88cc3ffdd000,
  tx_ring_map = 0xffff88dd7f70f9e0,
  gro_func = 0xffffffffc01b40d0 <bnxt_gro_func_5731x>,
  rx_skb_func = 0xffffffffc01ba610 <bnxt_rx_skb>,

pdev 멤버는 해당 PCI device 를 나타내는 struct pci_dev 구조체의 주소를 갖고 있다.

상세한 부분은 생략한다. 해당 구조체에 대해서 찾아보아라.

pci_dev 의 구조체 내용을 확인하자.

crash7latest> struct pci_dev 0xffff893e7f93d000 | more
struct pci_dev {
  bus_list = {
    next = 0xffff88e041b83028,
    prev = 0xffff893e7f93b000
  },
  bus = 0xffff88e041b83000,
  subordinate = 0x0,
  sysdata = 0xffff88e041240ab8,
  procent = 0xffff890bfea54780,
  slot = 0xffff88e041b89500,
..
 
  dev = {
    kobj = {
      name = 0xffff88dd7fddbe10 "0003:41:00.1",
      entry = {
        next = 0xffff88dd7f9a9008,
        prev = 0xffff88dd7f9af808
      },

dev.kobj.name 에서 PCI 버스 위치(domain:bus:device.function 형식, 여기서는 0003:41:00.1)를 확인할 수 있다.

사실 버스 위치를 확인하려고 한 게 아니라 OS 에서 인식된 device name 을 찾고자 한 것이므로, net_device 구조체를 확인해야 한다.

앞서 확인한 bnxt 구조체의 멤버 중 dev 의 주소(0xffff8933aa234000)를 통해 확인할 수 있다.

crash7latest>  struct net_device 0xffff8933aa234000 | more
struct net_device {
  name = "enP3s2095f1\000\000\000\000",
  name_hlist = {
    next = 0x0,
    pprev = 0xffff888100faf320
  },
  ifalias = 0x0,
  mem_end = 0,
  mem_start = 0,
  base_addr = 0,
  irq = 0,
  state = 3,

"enP3s2095f1" 이라는 device name 을 확인할 수 있다. 이름의 P3 와 f1 은 pci_dev 에서 확인한 PCI 위치 0003:41:00.1 의 domain(3) 및 function(1) 과도 일치한다.

2020.04.06 - [Skills/mY Technutz] - eBPF vmcore Analysis