1 ixgbe 网卡注册驱动
和大部分设备驱动一样,网卡驱动是作为一个 module 注册到 kernel 的
通过 module_init() -> ixgbe_init_module() -> pci_register_driver() 注册 ixgbe_driver
通过 module_exit() -> ixgbe_exit_module() -> pci_unregister_driver() 注销 ixgbe_driver
1.1 ixgbe_driver 类
static struct pci_driver ixgbe_driver = {.name = ixgbe_driver_name,.id_table = ixgbe_pci_tbl,.probe = ixgbe_probe,// 系统探测到ixgbe网卡后调用ixgbe_probe().remove = ixgbe_remove,
#ifdef CONFIG_PM.suspend = ixgbe_suspend,.resume = ixgbe_resume,
#endif.shutdown = ixgbe_shutdown,.sriov_configure = ixgbe_pci_sriov_configure,.err_handler = &ixgbe_err_handler
};
1.2 ixgbe_driver 注册/注销
/**
 * ixgbe_init_module - Driver Registration Routine
 *
 * ixgbe_init_module is the first routine called when the driver is
 * loaded. It creates the driver workqueue, initializes the driver debug
 * support (ixgbe_dbg_init) and registers ixgbe_driver with the PCI
 * subsystem.
 *
 * Returns 0 on success, -ENOMEM if the workqueue cannot be created, or the
 * error returned by pci_register_driver().
 **/
static int __init ixgbe_init_module(void)
{
	int ret;

	pr_info("%s - version %s\n", ixgbe_driver_string, ixgbe_driver_version);
	pr_info("%s\n", ixgbe_copyright);

	ixgbe_wq = create_singlethread_workqueue(ixgbe_driver_name);
	if (!ixgbe_wq) {
		pr_err("%s: Failed to create workqueue\n", ixgbe_driver_name);
		return -ENOMEM;
	}

	ixgbe_dbg_init();

	/* register ixgbe_driver with the PCI core */
	ret = pci_register_driver(&ixgbe_driver);
	if (ret) {
		/* undo everything set up above before reporting the failure */
		destroy_workqueue(ixgbe_wq);
		ixgbe_dbg_exit();
		return ret;
	}

#ifdef CONFIG_IXGBE_DCA
	dca_register_notify(&dca_notifier);
#endif

	return 0;
}

module_init(ixgbe_init_module);

/**
 * ixgbe_exit_module - Driver Exit Cleanup Routine
 *
 * ixgbe_exit_module is called just before the driver is removed
 * from memory.
 **/
static void __exit ixgbe_exit_module(void)
{
#ifdef CONFIG_IXGBE_DCA
	dca_unregister_notify(&dca_notifier);
#endif
	/* unregister ixgbe_driver from the PCI core */
	pci_unregister_driver(&ixgbe_driver);

	ixgbe_dbg_exit();

	/* tear down the workqueue created in ixgbe_init_module() */
	if (ixgbe_wq) {
		destroy_workqueue(ixgbe_wq);
		ixgbe_wq = NULL;
	}
}

module_exit(ixgbe_exit_module);
2 ixgbe 的 PCI 注册驱动流程 pci_register_driver()
pci_register_driver() ->
__pci_register_driver() ->
driver_register() ->
bus_add_driver() ->
driver_attach() ->
bus_for_each_dev() ->
__driver_attach() ->
driver_probe_device() ->
really_probe() ->
pci_device_probe() ->
__pci_device_probe() ->
pci_call_probe() ->
local_pci_probe()

static long local_pci_probe(void *_ddi)
{
	...
	rc = pci_drv->probe(pci_dev, ddi->id); // 系统探测到设备后调用设备驱动的probe
	...
}
【文章福利】小编推荐自己的Linux内核技术交流群: 【977878001】整理一些个人觉得比较好得学习书籍、视频资料共享在群文件里面,有需要的可以自行添加哦!!!前100进群领取,额外赠送一份 价值699的内核资料包(含视频教程、电子书、实战项目及代码)
内核资料直通车:Linux内核源码技术学习路线+视频教程代码资料
学习直通车:Linux内核源码/内存调优/文件系统/进程管理/设备驱动/网络协议栈
3 ixgbe 网卡探测 ixgbe_probe()【核心】
/*** ixgbe_probe - Device Initialization Routine* @pdev: PCI device information struct* @ent: entry in ixgbe_pci_tbl** Returns 0 on success, negative on failure** ixgbe_probe initializes an adapter identified by a pci_dev structure.* The OS initialization, configuring of the adapter private structure,* and a hardware reset occur.**/
static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{struct net_device *netdev;struct ixgbe_adapter *adapter = NULL;struct ixgbe_hw *hw;const struct ixgbe_info *ii = ixgbe_info_tbl[ent->driver_data];//根据网卡型号(82598/82599)选择ixgbe_infoint i, err, pci_using_dac, expected_gts;unsigned int indices = MAX_TX_QUEUES;u8 part_str[IXGBE_PBANUM_LENGTH];bool disable_dev = false;
#ifdef IXGBE_FCOEu16 device_caps;
#endifu32 eec;/* Catch broken hardware that put the wrong VF device ID in* the PCIe SR-IOV capability.*/if (pdev->is_virtfn) {WARN(1, KERN_ERR "%s (%hx:%hx) should not be a VF!\n",pci_name(pdev), pdev->vendor, pdev->device);return -EINVAL;}/* pci_enable_device_mem() -> __pci_enable_device_flags() -> do_pci_enable_device()-> pcibios_enable_device() -> pci_enable_resources() -> pci_write_config_word()向配置寄存器Command(0x04)中写入 PCI_COMMAND_MEMORY(0x2),允许网卡驱动访问网卡的Memory空间 */err = pci_enable_device_mem(pdev);if (err)return err;/* pci_set_dma_mask() -> dma_set_mask() -> dma_supported()检查并设置PCI总线地址位数 */if (!dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64))) {pci_using_dac = 1;} else {err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));if (err) {dev_err(&pdev->dev,"No usable DMA configuration, aborting\n");goto err_dma;}pci_using_dac = 0;}/* pci_request_mem_regions() -> pci_request_selected_regions() -> __pci_request_selected_regions()-> __pci_request_region()-> request_region()/__request_mem_region()-> __request_region() -> __request_resource()登记BAR中的总线地址(将resource插入iomem_resource资源树) */err = pci_request_mem_regions(pdev, ixgbe_driver_name);if (err) {dev_err(&pdev->dev,"pci_request_selected_regions failed 0x%x\n", err);goto err_pci_reg;}pci_enable_pcie_error_reporting(pdev);/* pci_set_master() -> __pci_set_master() -> pci_write_config_word()向配置寄存器Command(0x04)中写入PCI_COMMAND_MASTER(0x4),允许网卡申请PCI总线控制权 */pci_set_master(pdev);/* pci_save_state() -> pci_read_config_dword()读取并保存配置空间到dev->saved_config_space */ pci_save_state(pdev);if (ii->mac == ixgbe_mac_82598EB) {
#ifdef CONFIG_IXGBE_DCB/* 8 TC w/ 4 queues per TC */indices = 4 * MAX_TRAFFIC_CLASS;
#elseindices = IXGBE_MAX_RSS_INDICES;
#endif}// 分配net_device和ixgbe_adapter,发送队列数为 indicesnetdev = alloc_etherdev_mq(sizeof(struct ixgbe_adapter), indices);if (!netdev) {err = -ENOMEM;goto err_alloc_etherdev;}SET_NETDEV_DEV(netdev, &pdev->dev);adapter = netdev_priv(netdev); //得到ixgbe_adapter的指针adapter->netdev = netdev;adapter->pdev = pdev;hw = &adapter->hw; //得到ixgbe_hw的指针hw->back = adapter;adapter->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE);// 将BAR0中的总线地址映射成内存地址,赋给hw->hw_addr,允许网卡驱动通过hw->hw_addr访问网卡的BAR0对应的Memory空间hw->hw_addr = ioremap(pci_resource_start(pdev, 0),pci_resource_len(pdev, 0));adapter->io_addr = hw->hw_addr;if (!hw->hw_addr) {err = -EIO;goto err_ioremap;}netdev->netdev_ops = &ixgbe_netdev_ops;// 注册ixgbe_netdev_opsixgbe_set_ethtool_ops(netdev);netdev->watchdog_timeo = 5 * HZ;strlcpy(netdev->name, pci_name(pdev), sizeof(netdev->name));/* Setup hw api */hw->mac.ops = *ii->mac_ops;hw->mac.type = ii->mac;hw->mvals = ii->mvals;if (ii->link_ops)hw->link.ops = *ii->link_ops;/* EEPROM */hw->eeprom.ops = *ii->eeprom_ops;eec = IXGBE_READ_REG(hw, IXGBE_EEC(hw));// 读取BAR0对应的Memory空间的IXGBE_EECif (ixgbe_removed(hw->hw_addr)) {err = -EIO;goto err_ioremap;}/* If EEPROM is valid (bit 8 = 1), use default otherwise use bit bang */if (!(eec & BIT(8)))hw->eeprom.ops.read = &ixgbe_read_eeprom_bit_bang_generic;/* PHY */hw->phy.ops = *ii->phy_ops;hw->phy.sfp_type = ixgbe_sfp_type_unknown;/* ixgbe_identify_phy_generic will set prtad and mmds properly */hw->phy.mdio.prtad = MDIO_PRTAD_NONE;hw->phy.mdio.mmds = 0;hw->phy.mdio.mode_support = MDIO_SUPPORTS_C45 | MDIO_EMULATE_C22;hw->phy.mdio.dev = netdev;hw->phy.mdio.mdio_read = ixgbe_mdio_read;hw->phy.mdio.mdio_write = ixgbe_mdio_write;/* setup the private structure *//* 初始化ixgbe_adapter:设置adapter->tx/rx_ring_count为1024(默认1024,最小64,最大4096)设置adapter->ring_feature[RING_F_RSS].indices为min(CPU数, IXGBE_MAX_RSS_INDICES(16))设置adapter->ring_feature[RING_F_FDIR].indices为IXGBE_MAX_FDIR_INDICES(64)设置adapter->flags的IXGBE_FLAG_RSS_ENABLED和IXGBE_FLAG_FDIR_HASH_CAPABLE 
*/err = ixgbe_sw_init(adapter, ii);if (err)goto err_sw_init;/* Make sure the SWFW semaphore is in a valid state */if (hw->mac.ops.init_swfw_sync)hw->mac.ops.init_swfw_sync(hw);/* Make it possible the adapter to be woken up via WOL */switch (adapter->hw.mac.type) {case ixgbe_mac_82599EB:case ixgbe_mac_X540:case ixgbe_mac_X550:case ixgbe_mac_X550EM_x:case ixgbe_mac_x550em_a:IXGBE_WRITE_REG(&adapter->hw, IXGBE_WUS, ~0);break;default:break;}/** If there is a fan on this device and it has failed log the* failure.*/if (adapter->flags & IXGBE_FLAG_FAN_FAIL_CAPABLE) {u32 esdp = IXGBE_READ_REG(hw, IXGBE_ESDP);if (esdp & IXGBE_ESDP_SDP1)e_crit(probe, "Fan has stopped, replace the adapter\n");}if (allow_unsupported_sfp)hw->allow_unsupported_sfp = allow_unsupported_sfp;/* reset_hw fills in the perm_addr as well */hw->phy.reset_if_overtemp = true;/* ixgbe_reset_hw_82599() -> ixgbe_get_mac_addr_generic()读取eeprom中的mac地址,写入hw->mac.perm_addr */err = hw->mac.ops.reset_hw(hw);hw->phy.reset_if_overtemp = false;ixgbe_set_eee_capable(adapter);if (err == IXGBE_ERR_SFP_NOT_PRESENT) {err = 0;} else if (err == IXGBE_ERR_SFP_NOT_SUPPORTED) {e_dev_err("failed to load because an unsupported SFP+ or QSFP module type was detected.\n");e_dev_err("Reload the driver after installing a supported module.\n");goto err_sw_init;} else if (err) {e_dev_err("HW Init failed: %d\n", err);goto err_sw_init;}#ifdef CONFIG_PCI_IOV/* SR-IOV not supported on the 82598 */if (adapter->hw.mac.type == ixgbe_mac_82598EB)goto skip_sriov;/* Mailbox */ixgbe_init_mbx_params_pf(hw);hw->mbx.ops = ii->mbx_ops;pci_sriov_set_totalvfs(pdev, IXGBE_MAX_VFS_DRV_LIMIT);ixgbe_enable_sriov(adapter, max_vfs);
skip_sriov:#endifnetdev->features = NETIF_F_SG |NETIF_F_TSO |NETIF_F_TSO6 |NETIF_F_RXHASH |NETIF_F_RXCSUM |NETIF_F_HW_CSUM;#define IXGBE_GSO_PARTIAL_FEATURES (NETIF_F_GSO_GRE | \NETIF_F_GSO_GRE_CSUM | \NETIF_F_GSO_IPXIP4 | \NETIF_F_GSO_IPXIP6 | \NETIF_F_GSO_UDP_TUNNEL | \NETIF_F_GSO_UDP_TUNNEL_CSUM)netdev->gso_partial_features = IXGBE_GSO_PARTIAL_FEATURES;netdev->features |= NETIF_F_GSO_PARTIAL |IXGBE_GSO_PARTIAL_FEATURES;if (hw->mac.type >= ixgbe_mac_82599EB)netdev->features |= NETIF_F_SCTP_CRC;/* copy netdev features into list of user selectable features */netdev->hw_features |= netdev->features |NETIF_F_HW_VLAN_CTAG_FILTER |NETIF_F_HW_VLAN_CTAG_RX |NETIF_F_HW_VLAN_CTAG_TX |NETIF_F_RXALL |NETIF_F_HW_L2FW_DOFFLOAD;if (hw->mac.type >= ixgbe_mac_82599EB)netdev->hw_features |= NETIF_F_NTUPLE |NETIF_F_HW_TC;if (pci_using_dac)netdev->features |= NETIF_F_HIGHDMA;netdev->vlan_features |= netdev->features | NETIF_F_TSO_MANGLEID;netdev->hw_enc_features |= netdev->vlan_features;netdev->mpls_features |= NETIF_F_SG |NETIF_F_TSO |NETIF_F_TSO6 |NETIF_F_HW_CSUM;netdev->mpls_features |= IXGBE_GSO_PARTIAL_FEATURES;/* set this bit last since it cannot be part of vlan_features */netdev->features |= NETIF_F_HW_VLAN_CTAG_FILTER |NETIF_F_HW_VLAN_CTAG_RX |NETIF_F_HW_VLAN_CTAG_TX;netdev->priv_flags |= IFF_UNICAST_FLT;netdev->priv_flags |= IFF_SUPP_NOFCS;/* MTU range: 68 - 9710 */netdev->min_mtu = ETH_MIN_MTU;netdev->max_mtu = IXGBE_MAX_JUMBO_FRAME_SIZE - (ETH_HLEN + ETH_FCS_LEN);#ifdef CONFIG_IXGBE_DCBif (adapter->flags & IXGBE_FLAG_DCB_CAPABLE)netdev->dcbnl_ops = &ixgbe_dcbnl_ops;
#endif#ifdef IXGBE_FCOEif (adapter->flags & IXGBE_FLAG_FCOE_CAPABLE) {unsigned int fcoe_l;if (hw->mac.ops.get_device_caps) {hw->mac.ops.get_device_caps(hw, &device_caps);if (device_caps & IXGBE_DEVICE_CAPS_FCOE_OFFLOADS)adapter->flags &= ~IXGBE_FLAG_FCOE_CAPABLE;}fcoe_l = min_t(int, IXGBE_FCRETA_SIZE, num_online_cpus());adapter->ring_feature[RING_F_FCOE].limit = fcoe_l;netdev->features |= NETIF_F_FSO |NETIF_F_FCOE_CRC;netdev->vlan_features |= NETIF_F_FSO |NETIF_F_FCOE_CRC |NETIF_F_FCOE_MTU;}
#endif /* IXGBE_FCOE */if (adapter->flags2 & IXGBE_FLAG2_RSC_CAPABLE)netdev->hw_features |= NETIF_F_LRO;if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED)netdev->features |= NETIF_F_LRO;/* make sure the EEPROM is good */if (hw->eeprom.ops.validate_checksum(hw, NULL) < 0) {e_dev_err("The EEPROM Checksum Is Not Valid\n");err = -EIO;goto err_sw_init;}eth_platform_get_mac_address(&adapter->pdev->dev,adapter->hw.mac.perm_addr);memcpy(netdev->dev_addr, hw->mac.perm_addr, netdev->addr_len);if (!is_valid_ether_addr(netdev->dev_addr)) {e_dev_err("invalid MAC address\n");err = -EIO;goto err_sw_init;}/* Set hw->mac.addr to permanent MAC address */ether_addr_copy(hw->mac.addr, hw->mac.perm_addr);ixgbe_mac_set_default_filter(adapter);setup_timer(&adapter->service_timer, &ixgbe_service_timer,(unsigned long) adapter);if (ixgbe_removed(hw->hw_addr)) {err = -EIO;goto err_sw_init;}INIT_WORK(&adapter->service_task, ixgbe_service_task);set_bit(__IXGBE_SERVICE_INITED, &adapter->state);clear_bit(__IXGBE_SERVICE_SCHED, &adapter->state);/* ixgbe_init_interrupt_scheme() -> ixgbe_set_num_queues() -> ixgbe_set_fdir_queues()/ixgbe_set_rss_queues()ixgbe_set_interrupt_capability() -> ixgbe_acquire_msix_vectors() -> pci_enable_msix()ixgbe_alloc_q_vectors()根据FDIR/RSS设置adapter->num_tx/rx_queues向PCI子系统请求中断设置poll函数,分配ixgbe_q_vector,初始化napi并加入napi_list分配发送/接收ring数组 */err = ixgbe_init_interrupt_scheme(adapter);if (err)goto err_sw_init;for (i = 0; i < adapter->num_rx_queues; i++)u64_stats_init(&adapter->rx_ring[i]->syncp);for (i = 0; i < adapter->num_tx_queues; i++)u64_stats_init(&adapter->tx_ring[i]->syncp);for (i = 0; i < adapter->num_xdp_queues; i++)u64_stats_init(&adapter->xdp_ring[i]->syncp);/* WOL not supported for all devices */adapter->wol = 0;hw->eeprom.ops.read(hw, 0x2c, &adapter->eeprom_cap);hw->wol_enabled = ixgbe_wol_supported(adapter, pdev->device,pdev->subsystem_device);if (hw->wol_enabled)adapter->wol = IXGBE_WUFC_MAG;device_set_wakeup_enable(&adapter->pdev->dev, adapter->wol);/* save off 
EEPROM version number */hw->eeprom.ops.read(hw, 0x2e, &adapter->eeprom_verh);hw->eeprom.ops.read(hw, 0x2d, &adapter->eeprom_verl);/* pick up the PCI bus settings for reporting later */if (ixgbe_pcie_from_parent(hw))ixgbe_get_parent_bus_info(adapter);elsehw->mac.ops.get_bus_info(hw);/* calculate the expected PCIe bandwidth required for optimal* performance. Note that some older parts will never have enough* bandwidth due to being older generation PCIe parts. We clamp these* parts to ensure no warning is displayed if it can't be fixed.*/switch (hw->mac.type) {case ixgbe_mac_82598EB:expected_gts = min(ixgbe_enumerate_functions(adapter) * 10, 16);break;default:expected_gts = ixgbe_enumerate_functions(adapter) * 10;break;}/* don't check link if we failed to enumerate functions */if (expected_gts > 0)ixgbe_check_minimum_link(adapter, expected_gts);err = ixgbe_read_pba_string_generic(hw, part_str, sizeof(part_str));if (err)strlcpy(part_str, "Unknown", sizeof(part_str));if (ixgbe_is_sfp(hw) && hw->phy.sfp_type != ixgbe_sfp_type_not_present)e_dev_info("MAC: %d, PHY: %d, SFP+: %d, PBA No: %s\n",hw->mac.type, hw->phy.type, hw->phy.sfp_type,part_str);elsee_dev_info("MAC: %d, PHY: %d, PBA No: %s\n",hw->mac.type, hw->phy.type, part_str);e_dev_info("%pM\n", netdev->dev_addr);/* reset the hardware with the new settings */err = hw->mac.ops.start_hw(hw);if (err == IXGBE_ERR_EEPROM_VERSION) {/* We are running on a pre-production device, log a warning */e_dev_warn("This device is a pre-production adapter/LOM. ""Please be aware there may be issues associated ""with your hardware. 
If you are experiencing ""problems please contact your Intel or hardware ""representative who provided you with this ""hardware.\n");}strcpy(netdev->name, "eth%d");pci_set_drvdata(pdev, adapter);err = register_netdev(netdev);// 注册netdevif (err)goto err_register;/* power down the optics for 82599 SFP+ fiber */if (hw->mac.ops.disable_tx_laser)hw->mac.ops.disable_tx_laser(hw);/* carrier off reporting is important to ethtool even BEFORE open */netif_carrier_off(netdev);#ifdef CONFIG_IXGBE_DCAif (dca_add_requester(&pdev->dev) == 0) {adapter->flags |= IXGBE_FLAG_DCA_ENABLED;ixgbe_setup_dca(adapter);}
#endifif (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED) {e_info(probe, "IOV is enabled with %d VFs\n", adapter->num_vfs);for (i = 0; i < adapter->num_vfs; i++)ixgbe_vf_configuration(pdev, (i | 0x10000000));}/* firmware requires driver version to be 0xFFFFFFFF* since os does not support feature*/if (hw->mac.ops.set_fw_drv_ver)hw->mac.ops.set_fw_drv_ver(hw, 0xFF, 0xFF, 0xFF, 0xFF,sizeof(ixgbe_driver_version) - 1,ixgbe_driver_version);/* add san mac addr to netdev */ixgbe_add_sanmac_netdev(netdev);e_dev_info("%s\n", ixgbe_default_device_descr);#ifdef CONFIG_IXGBE_HWMONif (ixgbe_sysfs_init(adapter))e_err(probe, "failed to allocate sysfs resources\n");
#endif /* CONFIG_IXGBE_HWMON */ixgbe_dbg_adapter_init(adapter);/* setup link for SFP devices with MNG FW, else wait for IXGBE_UP */if (ixgbe_mng_enabled(hw) && ixgbe_is_sfp(hw) && hw->mac.ops.setup_link)hw->mac.ops.setup_link(hw,IXGBE_LINK_SPEED_10GB_FULL | IXGBE_LINK_SPEED_1GB_FULL,true);return 0;err_register:ixgbe_release_hw_control(adapter);ixgbe_clear_interrupt_scheme(adapter);
err_sw_init:ixgbe_disable_sriov(adapter);adapter->flags2 &= ~IXGBE_FLAG2_SEARCH_FOR_SFP;iounmap(adapter->io_addr);kfree(adapter->jump_tables[0]);kfree(adapter->mac_table);kfree(adapter->rss_key);
err_ioremap:disable_dev = !test_and_set_bit(__IXGBE_DISABLED, &adapter->state);free_netdev(netdev);
err_alloc_etherdev:pci_release_mem_regions(pdev);
err_pci_reg:
err_dma:if (!adapter || disable_dev)pci_disable_device(pdev);return err;
}
3.1 ixgbe_info 选取
根据网卡型号(82598/82599/X540/X550 等)在 ixgbe_info_tbl 列表中选择对应的 ixgbe_info
const struct ixgbe_info *ii = ixgbe_info_tbl[ent->driver_data];static const struct ixgbe_info *ixgbe_info_tbl[] = {[board_82598] = &ixgbe_82598_info,[board_82599] = &ixgbe_82599_info,[board_X540] = &ixgbe_X540_info,[board_X550] = &ixgbe_X550_info,[board_X550EM_x] = &ixgbe_X550EM_x_info,[board_x550em_x_fw] = &ixgbe_x550em_x_fw_info,[board_x550em_a] = &ixgbe_x550em_a_info,[board_x550em_a_fw] = &ixgbe_x550em_a_fw_info,
};enum ixgbe_boards {board_82598,board_82599,board_X540,board_X550,board_X550EM_x,board_x550em_x_fw,board_x550em_a,board_x550em_a_fw,
};const struct ixgbe_info ixgbe_82599_info = {.mac = ixgbe_mac_82599EB,.get_invariants = &ixgbe_get_invariants_82599,.mac_ops = &mac_ops_82599,.eeprom_ops = &eeprom_ops_82599,.phy_ops = &phy_ops_82599,.mbx_ops = &mbx_ops_generic,.mvals = ixgbe_mvals_8259X,
};
3.2 net_device/ixgbe_adapter 分配
netdev = alloc_etherdev_mq(sizeof(struct ixgbe_adapter), MAX_TX_QUEUES);struct net_device *alloc_etherdev_mq(int sizeof_priv, unsigned int queue_count)
{return alloc_netdev_mq(sizeof_priv, "eth%d", ether_setup, queue_count);
}struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,void (*setup)(struct net_device *), unsigned int queue_count)
{struct netdev_queue *tx;struct net_device *dev;size_t alloc_size;struct net_device *p;BUG_ON(strlen(name) >= sizeof(dev->name));alloc_size = sizeof(struct net_device); // net_device的大小if (sizeof_priv) {/* ensure 32-byte alignment of private area */alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);alloc_size += sizeof_priv; // 加上private data的大小}/* ensure 32-byte alignment of whole construct */alloc_size += NETDEV_ALIGN - 1;p = kzalloc(alloc_size, GFP_KERNEL); // 分配net_device和private dataif (!p) {printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");return NULL;}// 分配queue_count个netdev_queue(发送队列数组),一个发送队列对应一个netdev_queuetx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);if (!tx) {printk(KERN_ERR "alloc_netdev: Unable to allocate ""tx qdiscs.\n");goto free_p;}dev = PTR_ALIGN(p, NETDEV_ALIGN);dev->padded = (char *)dev - (char *)p;if (dev_addr_init(dev))goto free_tx;dev_unicast_init(dev);dev_net_set(dev, &init_net);dev->_tx = tx; // 保存发送队列数组dev->num_tx_queues = queue_count; // 设置发送队列数dev->real_num_tx_queues = queue_count; // 设置实际发送队列数dev->gso_max_size = GSO_MAX_SIZE;netdev_init_queues(dev); // 设置dev->_tx[i]->dev和dev->rx_queue->dev为devINIT_LIST_HEAD(&dev->napi_list);dev->priv_flags = IFF_XMIT_DST_RELEASE;setup(dev); // 以太网为ether_setup()strcpy(dev->name, name);return dev;free_tx:kfree(tx);free_p:kfree(p);return NULL;
}static void netdev_init_queues(struct net_device *dev)
{netdev_init_one_queue(dev, &dev->rx_queue, NULL);netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);spin_lock_init(&dev->tx_global_lock);
}static void netdev_init_one_queue(struct net_device *dev,struct netdev_queue *queue,void *_unused)
{queue->dev = dev;
}static inline void netdev_for_each_tx_queue(struct net_device *dev,void (*f)(struct net_device *,struct netdev_queue *,void *),void *arg)
{unsigned int i;for (i = 0; i < dev->num_tx_queues; i++)f(dev, &dev->_tx[i], arg);
}void ether_setup(struct net_device *dev)
{dev->header_ops = ð_header_ops;dev->type = ARPHRD_ETHER; // 以太网格式dev->hard_header_len = ETH_HLEN; // 14dev->mtu = ETH_DATA_LEN; // 1500dev->addr_len = ETH_ALEN; // 6dev->tx_queue_len = 1000; /* Ethernet wants good queues */dev->flags = IFF_BROADCAST|IFF_MULTICAST;memset(dev->broadcast, 0xFF, ETH_ALEN);}
3.3 读取网卡的永久 mac 地址(驱动从 RAR0 寄存器读出),写入 hw->mac.perm_addr
struct ixgbe_info ixgbe_82599_info = {.mac = ixgbe_mac_82599EB,.get_invariants = &ixgbe_get_invariants_82599,.mac_ops = &mac_ops_82599,.eeprom_ops = &eeprom_ops_82599,.phy_ops = &phy_ops_82599,
};static struct ixgbe_mac_operations mac_ops_82599 = {.init_hw = &ixgbe_init_hw_generic,.reset_hw = &ixgbe_reset_hw_82599,.start_hw = &ixgbe_start_hw_82599,.clear_hw_cntrs = &ixgbe_clear_hw_cntrs_generic,.get_media_type = &ixgbe_get_media_type_82599,.get_supported_physical_layer = &ixgbe_get_supported_physical_layer_82599,.enable_rx_dma = &ixgbe_enable_rx_dma_82599,.get_mac_addr = &ixgbe_get_mac_addr_generic,.get_san_mac_addr = &ixgbe_get_san_mac_addr_82599,.get_device_caps = &ixgbe_get_device_caps_82599,.stop_adapter = &ixgbe_stop_adapter_generic,.get_bus_info = &ixgbe_get_bus_info_generic,.set_lan_id = &ixgbe_set_lan_id_multi_port_pcie,.read_analog_reg8 = &ixgbe_read_analog_reg8_82599,.write_analog_reg8 = &ixgbe_write_analog_reg8_82599,.setup_link = &ixgbe_setup_mac_link_82599,.check_link = &ixgbe_check_mac_link_82599,.get_link_capabilities = &ixgbe_get_link_capabilities_82599,.led_on = &ixgbe_led_on_generic,.led_off = &ixgbe_led_off_generic,.blink_led_start = &ixgbe_blink_led_start_generic,.blink_led_stop = &ixgbe_blink_led_stop_generic,.set_rar = &ixgbe_set_rar_generic,.clear_rar = &ixgbe_clear_rar_generic,.set_vmdq = &ixgbe_set_vmdq_82599,.clear_vmdq = &ixgbe_clear_vmdq_82599,.init_rx_addrs = &ixgbe_init_rx_addrs_generic,.update_uc_addr_list = &ixgbe_update_uc_addr_list_generic,.update_mc_addr_list = &ixgbe_update_mc_addr_list_generic,.enable_mc = &ixgbe_enable_mc_generic,.disable_mc = &ixgbe_disable_mc_generic,.clear_vfta = &ixgbe_clear_vfta_82599,.set_vfta = &ixgbe_set_vfta_82599,.fc_enable = &ixgbe_fc_enable_generic,.init_uta_tables = &ixgbe_init_uta_tables_82599,.setup_sfp = &ixgbe_setup_sfp_modules_82599,
};static s32 ixgbe_reset_hw_82599(struct ixgbe_hw *hw)
{s32 status = 0;u32 ctrl, ctrl_ext;u32 i;u32 autoc;u32 autoc2;/* Call adapter stop to disable tx/rx and clear interrupts */hw->mac.ops.stop_adapter(hw);/* PHY ops must be identified and initialized prior to reset *//* Init PHY and function pointers, perform SFP setup */status = hw->phy.ops.init(hw);if (status == IXGBE_ERR_SFP_NOT_SUPPORTED)goto reset_hw_out;/* Setup SFP module if there is one present. */if (hw->phy.sfp_setup_needed) {status = hw->mac.ops.setup_sfp(hw);hw->phy.sfp_setup_needed = false;}/* Reset PHY */if (hw->phy.reset_disable == false && hw->phy.ops.reset != NULL)hw->phy.ops.reset(hw);/** Prevent the PCI-E bus from from hanging by disabling PCI-E master* access and verify no pending requests before reset*/status = ixgbe_disable_pcie_master(hw);if (status != 0) {status = IXGBE_ERR_MASTER_REQUESTS_PENDING;hw_dbg(hw, "PCI-E Master disable polling has failed.\n");}/** Issue global reset to the MAC. This needs to be a SW reset.* If link reset is used, it might reset the MAC when mng is using it*/ctrl = IXGBE_READ_REG(hw, IXGBE_CTRL);IXGBE_WRITE_REG(hw, IXGBE_CTRL, (ctrl | IXGBE_CTRL_RST));IXGBE_WRITE_FLUSH(hw);/* Poll for reset bit to self-clear indicating reset is complete */for (i = 0; i < 10; i++) {udelay(1);ctrl = IXGBE_READ_REG(hw, IXGBE_CTRL);if (!(ctrl & IXGBE_CTRL_RST))break;}if (ctrl & IXGBE_CTRL_RST) {status = IXGBE_ERR_RESET_FAILED;hw_dbg(hw, "Reset polling failed to complete.\n");}/* Clear PF Reset Done bit so PF/VF Mail Ops can work */ctrl_ext = IXGBE_READ_REG(hw, IXGBE_CTRL_EXT);ctrl_ext |= IXGBE_CTRL_EXT_PFRSTD;IXGBE_WRITE_REG(hw, IXGBE_CTRL_EXT, ctrl_ext);msleep(50);/** Store the original AUTOC/AUTOC2 values if they have not been* stored off yet. 
Otherwise restore the stored original* values since the reset operation sets back to defaults.*/autoc = IXGBE_READ_REG(hw, IXGBE_AUTOC);autoc2 = IXGBE_READ_REG(hw, IXGBE_AUTOC2);if (hw->mac.orig_link_settings_stored == false) {hw->mac.orig_autoc = autoc;hw->mac.orig_autoc2 = autoc2;hw->mac.orig_link_settings_stored = true;} else {if (autoc != hw->mac.orig_autoc)IXGBE_WRITE_REG(hw, IXGBE_AUTOC, (hw->mac.orig_autoc |IXGBE_AUTOC_AN_RESTART));if ((autoc2 & IXGBE_AUTOC2_UPPER_MASK) !=(hw->mac.orig_autoc2 & IXGBE_AUTOC2_UPPER_MASK)) {autoc2 &= ~IXGBE_AUTOC2_UPPER_MASK;autoc2 |= (hw->mac.orig_autoc2 &IXGBE_AUTOC2_UPPER_MASK);IXGBE_WRITE_REG(hw, IXGBE_AUTOC2, autoc2);}}/** Store MAC address from RAR0, clear receive address registers, and* clear the multicast table. Also reset num_rar_entries to 128,* since we modify this value when programming the SAN MAC address.*/hw->mac.num_rar_entries = 128;hw->mac.ops.init_rx_addrs(hw);/* Store the permanent mac address */hw->mac.ops.get_mac_addr(hw, hw->mac.perm_addr); // 读取eeprom中的mac地址,写入hw->mac.perm_addr/* Store the permanent SAN mac address */hw->mac.ops.get_san_mac_addr(hw, hw->mac.san_addr);/* Add the SAN MAC address to the RAR only if it's a valid address */if (ixgbe_validate_mac_addr(hw->mac.san_addr) == 0) {hw->mac.ops.set_rar(hw, hw->mac.num_rar_entries - 1,hw->mac.san_addr, 0, IXGBE_RAH_AV);/* Reserve the last RAR for the SAN MAC address */hw->mac.num_rar_entries--;}reset_hw_out:return status;
}s32 ixgbe_get_mac_addr_generic(struct ixgbe_hw *hw, u8 *mac_addr)
{u32 rar_high;u32 rar_low;u16 i;rar_high = IXGBE_READ_REG(hw, IXGBE_RAH(0));rar_low = IXGBE_READ_REG(hw, IXGBE_RAL(0));for (i = 0; i < 4; i++)mac_addr[i] = (u8)(rar_low >> (i*8));for (i = 0; i < 2; i++)mac_addr[i+4] = (u8)(rar_high >> (i*8));return 0;
}#define IXGBE_RAL(_i) (((_i) <= 15) ? (0x05400 + ((_i) * 8)) : \(0x0A200 + ((_i) * 8)))
/* Register offset of Receive Address High register _i: entries 0-15 are
 * addressed from base 0x05404, higher entries from base 0x0A204, both with
 * a stride of 8 bytes.
 */
#define IXGBE_RAH(_i) (((_i) <= 15) ? (0x05404 + ((_i) * 8)) : \
				(0x0A204 + ((_i) * 8)))
3.4 ixgbe_init_interrupt_scheme()
/*** ixgbe_init_interrupt_scheme - Determine proper interrupt scheme* @adapter: board private structure to initialize** We determine which interrupt scheme to use based on...* - Kernel support (MSI, MSI-X)* - which can be user-defined (via MODULE_PARAM)* - Hardware queue count (num_*_queues)* - defined by miscellaneous hardware support/features (RSS, etc.)**/
int ixgbe_init_interrupt_scheme(struct ixgbe_adapter *adapter)
{int err;/* Number of supported queues */ixgbe_set_num_queues(adapter); //根据FDIR/RSS设置adapter->num_tx/rx_queues/* Set interrupt mode */ixgbe_set_interrupt_capability(adapter); //向PCI子系统请求中断err = ixgbe_alloc_q_vectors(adapter); //设置poll函数,分配ixgbe_q_vector,初始化napi并加入napi_listif (err) {e_dev_err("Unable to allocate memory for queue vectors\n");goto err_alloc_q_vectors;}ixgbe_cache_ring_register(adapter);// 分配发送/接收ring数组e_dev_info("Multiqueue %s: Rx Queue count = %u, Tx Queue count = %u XDP Queue count = %u\n",(adapter->num_rx_queues > 1) ? "Enabled" : "Disabled",adapter->num_rx_queues, adapter->num_tx_queues,adapter->num_xdp_queues);set_bit(__IXGBE_DOWN, &adapter->state);return 0;err_alloc_q_vectors:ixgbe_reset_interrupt_capability(adapter);return err;
}
3.4.1 设置收发队列 ixgbe_set_num_queues()
/**
 * ixgbe_set_num_queues - Allocate queues for device, feature dependent
 * @adapter: board private structure to initialize
 *
 * This is the top level queue allocation routine.  The order here is very
 * important, starting with the "most" number of features turned on at once,
 * and ending with the smallest set of features.  This way large combinations
 * can be allocated if they're turned on, and smaller combinations are the
 * fallthrough conditions.
 *
 **/
static void ixgbe_set_num_queues(struct ixgbe_adapter *adapter)
{
	/* Start with base case */
	adapter->num_rx_queues = 1;
	adapter->num_tx_queues = 1;
	adapter->num_xdp_queues = 0;
	adapter->num_rx_pools = adapter->num_rx_queues;
	adapter->num_rx_queues_per_pool = 1;

	/* Each helper below returns non-zero once it has configured the
	 * queues for its feature set; the first one that does so wins.
	 */
#ifdef CONFIG_IXGBE_DCB
	if (ixgbe_set_dcb_sriov_queues(adapter))
		return;

	if (ixgbe_set_dcb_queues(adapter))
		return;

#endif
	if (ixgbe_set_sriov_queues(adapter))
		return;

	/* fallthrough case: plain RSS */
	ixgbe_set_rss_queues(adapter);
}
3.4.2 向PCI子系统请求中断 ixgbe_set_interrupt_capability()
/*** ixgbe_set_interrupt_capability - set MSI-X or MSI if supported* @adapter: board private structure to initialize** Attempt to configure the interrupts using the best available* capabilities of the hardware and the kernel.**/
static void ixgbe_set_interrupt_capability(struct ixgbe_adapter *adapter)
{int err;/* We will try to get MSI-X interrupts first */if (!ixgbe_acquire_msix_vectors(adapter))return;/* At this point, we do not have MSI-X capabilities. We need to* reconfigure or disable various features which require MSI-X* capability.*//* Disable DCB unless we only have a single traffic class */if (netdev_get_num_tc(adapter->netdev) > 1) {e_dev_warn("Number of DCB TCs exceeds number of available queues. Disabling DCB support.\n");netdev_reset_tc(adapter->netdev);if (adapter->hw.mac.type == ixgbe_mac_82598EB)adapter->hw.fc.requested_mode = adapter->last_lfc_mode;adapter->flags &= ~IXGBE_FLAG_DCB_ENABLED;adapter->temp_dcb_cfg.pfc_mode_enable = false;adapter->dcb_cfg.pfc_mode_enable = false;}adapter->dcb_cfg.num_tcs.pg_tcs = 1;adapter->dcb_cfg.num_tcs.pfc_tcs = 1;/* Disable SR-IOV support */e_dev_warn("Disabling SR-IOV support\n");ixgbe_disable_sriov(adapter);/* Disable RSS */e_dev_warn("Disabling RSS support\n");adapter->ring_feature[RING_F_RSS].limit = 1;/* recalculate number of queues now that many features have been* changed or disabled.*/ixgbe_set_num_queues(adapter);adapter->num_q_vectors = 1;err = pci_enable_msi(adapter->pdev); //向PCI子系统请求1个msi中断if (err)e_dev_warn("Failed to allocate MSI interrupt, falling back to legacy. Error: %d\n",err);elseadapter->flags |= IXGBE_FLAG_MSI_ENABLED;
}
3.4.3 申请中断向量表 ixgbe_alloc_q_vectors
/*** ixgbe_alloc_q_vectors - Allocate memory for interrupt vectors* @adapter: board private structure to initialize** We allocate one q_vector per queue interrupt. If allocation fails we* return -ENOMEM.**/
static int ixgbe_alloc_q_vectors(struct ixgbe_adapter *adapter)
{int q_vectors = adapter->num_q_vectors;int rxr_remaining = adapter->num_rx_queues;int txr_remaining = adapter->num_tx_queues;int xdp_remaining = adapter->num_xdp_queues;int rxr_idx = 0, txr_idx = 0, xdp_idx = 0, v_idx = 0;int err;/* only one q_vector if MSI-X is disabled. */// 使用MSIX(Message Signaled Interrupt-X)// 去掉绑定ixgbe0所在NUMA的所有CPU的msix中断(LSC等)if (!(adapter->flags & IXGBE_FLAG_MSIX_ENABLED))q_vectors = 1;if (q_vectors >= (rxr_remaining + txr_remaining + xdp_remaining)) {for (; rxr_remaining; v_idx++) {err = ixgbe_alloc_q_vector(adapter, q_vectors, v_idx,0, 0, 0, 0, 1, rxr_idx); //分配ixgbe_q_vectorif (err)goto err_out;/* update counts and index */rxr_remaining--;rxr_idx++;}}for (; v_idx < q_vectors; v_idx++) {int rqpv = DIV_ROUND_UP(rxr_remaining, q_vectors - v_idx);int tqpv = DIV_ROUND_UP(txr_remaining, q_vectors - v_idx);int xqpv = DIV_ROUND_UP(xdp_remaining, q_vectors - v_idx);err = ixgbe_alloc_q_vector(adapter, q_vectors, v_idx,tqpv, txr_idx,xqpv, xdp_idx,rqpv, rxr_idx);if (err)goto err_out;/* update counts and index */rxr_remaining -= rqpv;txr_remaining -= tqpv;xdp_remaining -= xqpv;rxr_idx++;txr_idx++;xdp_idx += xqpv;}return 0;err_out:adapter->num_tx_queues = 0;adapter->num_xdp_queues = 0;adapter->num_rx_queues = 0;adapter->num_q_vectors = 0;while (v_idx--)ixgbe_free_q_vector(adapter, v_idx);return -ENOMEM;
}
单个中断向量的分配由 ixgbe_alloc_q_vector() 完成,同时注册 NAPI 模式的 poll 函数(ixgbe_poll,每次 poll 最多处理 64 个数据包)
/*** ixgbe_alloc_q_vector - Allocate memory for a single interrupt vector* @adapter: board private structure to initialize* @v_count: q_vectors allocated on adapter, used for ring interleaving* @v_idx: index of vector in adapter struct* @txr_count: total number of Tx rings to allocate* @txr_idx: index of first Tx ring to allocate* @xdp_count: total number of XDP rings to allocate* @xdp_idx: index of first XDP ring to allocate* @rxr_count: total number of Rx rings to allocate* @rxr_idx: index of first Rx ring to allocate** We allocate one q_vector. If allocation fails we return -ENOMEM.**/
static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter,int v_count, int v_idx,int txr_count, int txr_idx,int xdp_count, int xdp_idx,int rxr_count, int rxr_idx)
{struct ixgbe_q_vector *q_vector;struct ixgbe_ring *ring;int node = NUMA_NO_NODE;int cpu = -1;int ring_count, size;u8 tcs = netdev_get_num_tc(adapter->netdev);ring_count = txr_count + rxr_count + xdp_count;size = sizeof(struct ixgbe_q_vector) +(sizeof(struct ixgbe_ring) * ring_count);/* customize cpu for Flow Director mapping */if ((tcs <= 1) && !(adapter->flags & IXGBE_FLAG_SRIOV_ENABLED)) {u16 rss_i = adapter->ring_feature[RING_F_RSS].indices;if (rss_i > 1 && adapter->atr_sample_rate) {if (cpu_online(v_idx)) {cpu = v_idx;node = cpu_to_node(cpu);}}}/* allocate q_vector and rings */q_vector = kzalloc_node(size, GFP_KERNEL, node);if (!q_vector)q_vector = kzalloc(size, GFP_KERNEL);if (!q_vector)return -ENOMEM;/* setup affinity mask and node */if (cpu != -1)cpumask_set_cpu(cpu, &q_vector->affinity_mask);q_vector->numa_node = node;#ifdef CONFIG_IXGBE_DCA/* initialize CPU for DCA */q_vector->cpu = -1;#endif/* 初始化q_vector->napi并加入adapter->netdev的napi_list,其中poll函数为ixgbe_clean_rxtx_many()/ixgbe_poll(),一次poll的最大报文数为64 *//* initialize NAPI */netif_napi_add(adapter->netdev, &q_vector->napi,ixgbe_poll, 64);/* tie q_vector and adapter together */adapter->q_vector[v_idx] = q_vector;// 地址赋给adapter->q_vector[q_idx]q_vector->adapter = adapter;q_vector->v_idx = v_idx;/* initialize work limits */q_vector->tx.work_limit = adapter->tx_work_limit;/* initialize pointer to rings */ring = q_vector->ring;/* intialize ITR */if (txr_count && !rxr_count) {/* tx only vector */if (adapter->tx_itr_setting == 1)q_vector->itr = IXGBE_12K_ITR;elseq_vector->itr = adapter->tx_itr_setting;} else {/* rx or rx/tx vector */if (adapter->rx_itr_setting == 1)q_vector->itr = IXGBE_20K_ITR;elseq_vector->itr = adapter->rx_itr_setting;}while (txr_count) {/* assign generic ring traits */ring->dev = &adapter->pdev->dev;ring->netdev = adapter->netdev;/* configure backlink on ring */ring->q_vector = q_vector;/* update q_vector Tx values */ixgbe_add_ring(ring, &q_vector->tx);/* apply Tx specific ring traits 
*/ring->count = adapter->tx_ring_count;if (adapter->num_rx_pools > 1)ring->queue_index =txr_idx % adapter->num_rx_queues_per_pool;elsering->queue_index = txr_idx;/* assign ring to adapter */WRITE_ONCE(adapter->tx_ring[txr_idx], ring);/* update count and index */txr_count--;txr_idx += v_count;/* push pointer to next ring */ring++;}while (xdp_count) {/* assign generic ring traits */ring->dev = &adapter->pdev->dev;ring->netdev = adapter->netdev;/* configure backlink on ring */ring->q_vector = q_vector;/* update q_vector Tx values */ixgbe_add_ring(ring, &q_vector->tx);/* apply Tx specific ring traits */ring->count = adapter->tx_ring_count;ring->queue_index = xdp_idx;set_ring_xdp(ring);/* assign ring to adapter */WRITE_ONCE(adapter->xdp_ring[xdp_idx], ring);/* update count and index */xdp_count--;xdp_idx++;/* push pointer to next ring */ring++;}while (rxr_count) {/* assign generic ring traits */ring->dev = &adapter->pdev->dev;ring->netdev = adapter->netdev;/* configure backlink on ring */ring->q_vector = q_vector;/* update q_vector Rx values */ixgbe_add_ring(ring, &q_vector->rx);/** 82599 errata, UDP frames with a 0 checksum* can be marked as checksum errors.*/if (adapter->hw.mac.type == ixgbe_mac_82599EB)set_bit(__IXGBE_RX_CSUM_UDP_ZERO_ERR, &ring->state);#ifdef IXGBE_FCOEif (adapter->netdev->features & NETIF_F_FCOE_MTU) {struct ixgbe_ring_feature *f;f = &adapter->ring_feature[RING_F_FCOE];if ((rxr_idx >= f->offset) &&(rxr_idx < f->offset + f->indices))set_bit(__IXGBE_RX_FCOE, &ring->state);}#endif /* IXGBE_FCOE *//* apply Rx specific ring traits */ring->count = adapter->rx_ring_count;if (adapter->num_rx_pools > 1)ring->queue_index =rxr_idx % adapter->num_rx_queues_per_pool;elsering->queue_index = rxr_idx;/* assign ring to adapter */WRITE_ONCE(adapter->rx_ring[rxr_idx], ring);/* update count and index */rxr_count--;rxr_idx += v_count;/* push pointer to next ring */ring++;}return 0;
}
3.4.4 缓存 rx/tx 描述符环(descriptor ring)到寄存器索引的映射 ixgbe_cache_ring_register
/**
 * ixgbe_cache_ring_register - Descriptor ring to register mapping
 * @adapter: board private structure to initialize
 *
 * Once the feature set enabled for the device is known, cache the register
 * offset each descriptor ring is assigned to.
 *
 * The order of the feature helpers matters: they are tried starting with
 * the combination that has the most features enabled at once and ending
 * with the least.  The first helper that reports success ends the search;
 * the RSS helper runs only when none of the others did.
 **/
static void ixgbe_cache_ring_register(struct ixgbe_adapter *adapter)
{
	/* default case: the first Rx and Tx ring use register index 0 */
	adapter->rx_ring[0]->reg_idx = 0;
	adapter->tx_ring[0]->reg_idx = 0;

#ifdef CONFIG_IXGBE_DCB
	/* DCB with SR-IOV is tried before plain DCB */
	if (ixgbe_cache_ring_dcb_sriov(adapter) ||
	    ixgbe_cache_ring_dcb(adapter))
		return;

#endif
	/* SR-IOV next; fall back to RSS when it does not apply */
	if (!ixgbe_cache_ring_sriov(adapter))
		ixgbe_cache_ring_rss(adapter);
}