一、应用
其实不管怎么设计,如何开发,结果都是要展现一个结果,能够为人所用。虽然说virtio的应用场景有不少,但是在DPDK中主要就是网卡。所以,在此处主要是对网卡的抽象的实现,即对上层的应用实现底层的virtio_net驱动和相关设备的定义。这样的话就可以在上层软件中将其看做普通的网卡接口来使用。也就达到了抽象的目的,隔离了虚拟设备的种类和具体实现,对外暴露统一的网络接口,上层应用不需要区分到底是真实的网卡还是虚拟的网卡。
而DPDK中使用这种机制,能更好地发挥其自身的网络通信功能,并获得更好的适应性。
二、基本数据结构
在DPDK中,对抽象实现的数据结构代码:
//librte_ethdev/rte_ethdev_core.h
/*** @internal* The generic data structure associated with each ethernet device.** Pointers to burst-oriented packet receive and transmit functions are* located at the beginning of the structure, along with the pointer to* where all the data elements for the particular device are stored in shared* memory. This split allows the function pointer and driver data to be per-* process, while the actual configuration data for the device is shared.*/
struct rte_eth_dev {eth_rx_burst_t rx_pkt_burst; /**< Pointer to PMD receive function. */eth_tx_burst_t tx_pkt_burst; /**< Pointer to PMD transmit function. */eth_tx_prep_t tx_pkt_prepare; /**< Pointer to PMD transmit prepare function. *//*** Next two fields are per-device data but *data is shared between* primary and secondary processes and *process_private is per-process* private. The second one is managed by PMDs if necessary.*/struct rte_eth_dev_data *data; /**< Pointer to device data. */void *process_private; /**< Pointer to per-process device data. */const struct eth_dev_ops *dev_ops; /**< Functions exported by PMD */struct rte_device *device; /**< Backing device */struct rte_intr_handle *intr_handle; /**< Device interrupt handle *//** User application callbacks for NIC interrupts */struct rte_eth_dev_cb_list link_intr_cbs;/*** User-supplied functions called from rx_burst to post-process* received packets before passing them to the user*/struct rte_eth_rxtx_callback *post_rx_burst_cbs[RTE_MAX_QUEUES_PER_PORT];/*** User-supplied functions called from tx_burst to pre-process* received packets before passing them to the driver for transmission.*/struct rte_eth_rxtx_callback *pre_tx_burst_cbs[RTE_MAX_QUEUES_PER_PORT];enum rte_eth_dev_state state; /**< Flag indicating the port state */void *security_ctx; /**< Context for security ops */uint64_t reserved_64s[4]; /**< Reserved for future fields */void *reserved_ptrs[4]; /**< Reserved for future fields * /
}
/**
 * @internal A structure containing the functions exported by an Ethernet driver.
 *
 * Each member is a function pointer supplied by the driver (PMD); struct
 * rte_eth_dev refers to one instance of this table through its dev_ops field.
 */
struct eth_dev_ops {
	eth_dev_configure_t dev_configure; /**< Configure device. */
	eth_dev_start_t dev_start; /**< Start device. */
	eth_dev_stop_t dev_stop; /**< Stop device. */
	eth_dev_set_link_up_t dev_set_link_up; /**< Device link up. */
	eth_dev_set_link_down_t dev_set_link_down; /**< Device link down. */
	eth_dev_close_t dev_close; /**< Close device. */
	eth_dev_reset_t dev_reset; /**< Reset device. */
	eth_link_update_t link_update; /**< Get device link state. */
	eth_is_removed_t is_removed;
	/**< Check if the device was physically removed. */

	eth_promiscuous_enable_t promiscuous_enable; /**< Promiscuous ON. */
	eth_promiscuous_disable_t promiscuous_disable;/**< Promiscuous OFF. */
	eth_allmulticast_enable_t allmulticast_enable;/**< RX multicast ON. */
	eth_allmulticast_disable_t allmulticast_disable;/**< RX multicast OFF. */
	eth_mac_addr_remove_t mac_addr_remove; /**< Remove MAC address. */
	eth_mac_addr_add_t mac_addr_add; /**< Add a MAC address. */
	eth_mac_addr_set_t mac_addr_set; /**< Set a MAC address. */
	eth_set_mc_addr_list_t set_mc_addr_list; /**< Set list of mcast addrs. */
	mtu_set_t mtu_set; /**< Set MTU. */

	eth_stats_get_t stats_get; /**< Get generic device statistics. */
	eth_stats_reset_t stats_reset; /**< Reset generic device statistics. */
	eth_xstats_get_t xstats_get; /**< Get extended device statistics. */
	eth_xstats_reset_t xstats_reset; /**< Reset extended device statistics. */
	eth_xstats_get_names_t xstats_get_names;
	/**< Get names of extended statistics. */
	eth_queue_stats_mapping_set_t queue_stats_mapping_set;
	/**< Configure per queue stat counter mapping. */

	eth_dev_infos_get_t dev_infos_get; /**< Get device info. */
	eth_rxq_info_get_t rxq_info_get; /**< Retrieve RX queue information. */
	eth_txq_info_get_t txq_info_get; /**< Retrieve TX queue information. */
	eth_burst_mode_get_t rx_burst_mode_get; /**< Get RX burst mode */
	eth_burst_mode_get_t tx_burst_mode_get; /**< Get TX burst mode */
	eth_fw_version_get_t fw_version_get; /**< Get firmware version. */
	eth_dev_supported_ptypes_get_t dev_supported_ptypes_get;
	/**< Get packet types supported and identified by device. */
	eth_dev_ptypes_set_t dev_ptypes_set;
	/**< Inform Ethernet device about reduced range of packet types to handle. */

	vlan_filter_set_t vlan_filter_set; /**< Filter VLAN Setup. */
	vlan_tpid_set_t vlan_tpid_set; /**< Outer/Inner VLAN TPID Setup. */
	vlan_strip_queue_set_t vlan_strip_queue_set; /**< VLAN Stripping on queue. */
	vlan_offload_set_t vlan_offload_set; /**< Set VLAN Offload. */
	vlan_pvid_set_t vlan_pvid_set; /**< Set port based TX VLAN insertion. */

	eth_queue_start_t rx_queue_start;/**< Start RX for a queue. */
	eth_queue_stop_t rx_queue_stop; /**< Stop RX for a queue. */
	eth_queue_start_t tx_queue_start;/**< Start TX for a queue. */
	eth_queue_stop_t tx_queue_stop; /**< Stop TX for a queue. */
	eth_rx_queue_setup_t rx_queue_setup;/**< Set up device RX queue. */
	eth_queue_release_t rx_queue_release; /**< Release RX queue. */
	eth_rx_queue_count_t rx_queue_count;
	/**< Get the number of used RX descriptors. */
	eth_rx_descriptor_done_t rx_descriptor_done; /**< Check rxd DD bit. */
	eth_rx_descriptor_status_t rx_descriptor_status;
	/**< Check the status of a Rx descriptor. */
	eth_tx_descriptor_status_t tx_descriptor_status;
	/**< Check the status of a Tx descriptor. */
	eth_rx_enable_intr_t rx_queue_intr_enable; /**< Enable Rx queue interrupt. */
	eth_rx_disable_intr_t rx_queue_intr_disable; /**< Disable Rx queue interrupt. */
	eth_tx_queue_setup_t tx_queue_setup;/**< Set up device TX queue. */
	eth_queue_release_t tx_queue_release; /**< Release TX queue. */
	eth_tx_done_cleanup_t tx_done_cleanup;/**< Free tx ring mbufs */

	eth_dev_led_on_t dev_led_on; /**< Turn on LED. */
	eth_dev_led_off_t dev_led_off; /**< Turn off LED. */

	flow_ctrl_get_t flow_ctrl_get; /**< Get flow control. */
	flow_ctrl_set_t flow_ctrl_set; /**< Setup flow control. */
	priority_flow_ctrl_set_t priority_flow_ctrl_set; /**< Setup priority flow control. */

	eth_uc_hash_table_set_t uc_hash_table_set; /**< Set Unicast Table Array. */
	eth_uc_all_hash_table_set_t uc_all_hash_table_set; /**< Set Unicast hash bitmap. */

	eth_mirror_rule_set_t mirror_rule_set; /**< Add a traffic mirror rule. */
	eth_mirror_rule_reset_t mirror_rule_reset; /**< Reset a traffic mirror rule. */

	eth_udp_tunnel_port_add_t udp_tunnel_port_add; /**< Add UDP tunnel port. */
	eth_udp_tunnel_port_del_t udp_tunnel_port_del; /**< Del UDP tunnel port. */
	eth_l2_tunnel_eth_type_conf_t l2_tunnel_eth_type_conf;
	/**< Config ether type of l2 tunnel. */
	eth_l2_tunnel_offload_set_t l2_tunnel_offload_set;
	/**< Enable/disable l2 tunnel offload functions. */

	eth_set_queue_rate_limit_t set_queue_rate_limit; /**< Set queue rate limit. */

	rss_hash_update_t rss_hash_update; /**< Configure RSS hash protocols. */
	rss_hash_conf_get_t rss_hash_conf_get; /**< Get current RSS hash configuration. */
	reta_update_t reta_update; /**< Update redirection table. */
	reta_query_t reta_query; /**< Query redirection table. */

	eth_get_reg_t get_reg; /**< Get registers. */
	eth_get_eeprom_length_t get_eeprom_length; /**< Get eeprom length. */
	eth_get_eeprom_t get_eeprom; /**< Get eeprom data. */
	eth_set_eeprom_t set_eeprom; /**< Set eeprom. */

	eth_get_module_info_t get_module_info;
	/**< Get plugin module eeprom attribute. */
	eth_get_module_eeprom_t get_module_eeprom;
	/**< Get plugin module eeprom data. */

	eth_filter_ctrl_t filter_ctrl; /**< Common filter control. */

	eth_get_dcb_info get_dcb_info; /**< Get DCB information. */

	eth_timesync_enable_t timesync_enable;
	/**< Turn IEEE1588/802.1AS timestamping on. */
	eth_timesync_disable_t timesync_disable;
	/**< Turn IEEE1588/802.1AS timestamping off. */
	eth_timesync_read_rx_timestamp_t timesync_read_rx_timestamp;
	/**< Read the IEEE1588/802.1AS RX timestamp. */
	eth_timesync_read_tx_timestamp_t timesync_read_tx_timestamp;
	/**< Read the IEEE1588/802.1AS TX timestamp. */
	eth_timesync_adjust_time timesync_adjust_time; /**< Adjust the device clock. */
	eth_timesync_read_time timesync_read_time; /**< Get the device clock time. */
	eth_timesync_write_time timesync_write_time; /**< Set the device clock time. */

	/* NOTE(review): undocumented in the original; presumably reads a device clock - confirm. */
	eth_read_clock read_clock;

	eth_xstats_get_by_id_t xstats_get_by_id;
	/**< Get extended device statistic values by ID. */
	eth_xstats_get_names_by_id_t xstats_get_names_by_id;
	/**< Get name of extended device statistics by ID. */

	eth_tm_ops_get_t tm_ops_get;
	/**< Get Traffic Management (TM) operations. */

	eth_mtr_ops_get_t mtr_ops_get;
	/**< Get Traffic Metering and Policing (MTR) operations. */

	eth_pool_ops_supported_t pool_ops_supported;
	/**< Test if a port supports specific mempool ops */

	eth_hairpin_cap_get_t hairpin_cap_get;
	/**< Returns the hairpin capabilities. */
	eth_rx_hairpin_queue_setup_t rx_hairpin_queue_setup;
	/**< Set up device RX hairpin queue. */
	eth_tx_hairpin_queue_setup_t tx_hairpin_queue_setup;
	/**< Set up device TX hairpin queue. */
};
/*** @internal* The data part, with no function pointers, associated with each ethernet device.** This structure is safe to place in shared memory to be common among different* processes in a multi-process configuration.*/
struct rte_eth_dev_data {char name[RTE_ETH_NAME_MAX_LEN]; /**< Unique identifier name */void **rx_queues; /**< Array of pointers to RX queues. */void **tx_queues; /**< Array of pointers to TX queues. */uint16_t nb_rx_queues; /**< Number of RX queues. */uint16_t nb_tx_queues; /**< Number of TX queues. */struct rte_eth_dev_sriov sriov; /**< SRIOV data */void *dev_private;/**< PMD-specific private data.* @see rte_eth_dev_release_port()*/struct rte_eth_link dev_link; /**< Link-level information & status. */struct rte_eth_conf dev_conf; /**< Configuration applied to device. */uint16_t mtu; /**< Maximum Transmission Unit. */uint32_t min_rx_buf_size;/**< Common RX buffer size handled by all queues. */uint64_t rx_mbuf_alloc_failed; /**< RX ring mbuf allocation failures. */struct rte_ether_addr *mac_addrs;/**< Device Ethernet link address.* @see rte_eth_dev_release_port()*/uint64_t mac_pool_sel[ETH_NUM_RECEIVE_MAC_ADDR];/**< Bitmap associating MAC addresses to pools. */struct rte_ether_addr *hash_mac_addrs;/**< Device Ethernet MAC addresses of hash filtering.* @see rte_eth_dev_release_port()*/uint16_t port_id; /**< Device [external] port identifier. */__extension__uint8_t promiscuous : 1, /**< RX promiscuous mode ON(1) / OFF(0). */scattered_rx : 1, /**< RX of scattered packets is ON(1) / OFF(0) */all_multicast : 1, /**< RX all multicast mode ON(1) / OFF(0). */dev_started : 1, /**< Device state: STARTED(1) / STOPPED(0). */lro : 1; /**< RX LRO is ON(1) / OFF(0) */uint8_t rx_queue_state[RTE_MAX_QUEUES_PER_PORT];/**< Queues state: HAIRPIN(2) / STARTED(1) / STOPPED(0). */uint8_t tx_queue_state[RTE_MAX_QUEUES_PER_PORT];/**< Queues state: HAIRPIN(2) / STARTED(1) / STOPPED(0). */uint32_t dev_flags; /**< Capabilities. */enum rte_kernel_driver kdrv; /**< Kernel driver passthrough. */int numa_node; /**< NUMA node connection. */struct rte_vlan_filter_conf vlan_filter_conf;/**< VLAN filter configuration. */struct rte_eth_dev_owner owner; /**< The port owner. 
*/uint16_t representor_id;/**< Switch-specific identifier.* Valid if RTE_ETH_DEV_REPRESENTOR in dev_flags.*/uint64_t reserved_64s[4]; /**< Reserved for future fields */void *reserved_ptrs[4]; /**< Reserved for future fields * /
}
上面的三个数据结构中,rte_eth_dev_data保存网卡设备的数据部分(不含函数指针,可以放在共享内存中供多个进程共用);rte_eth_dev用来对网卡进行整体描述,包括收发函数的指针以及指向设备数据的指针;eth_dev_ops则定义了网卡驱动导出的各种操作函数。这三者组合在一起,就可以把一个网卡整体框架的数据和函数功能基本表现出来。
三、基本流程
在前面分析底层实现时,为了说明对底层函数的调用,已经简单分析过此处的一些函数,所以其中有些函数应该并不陌生。下面看一下主要的流程代码:
/** This function is based on probe() function in virtio_pci.c* It returns 0 on success.*/
int
eth_virtio_dev_init(struct rte_eth_dev *eth_dev)
{struct virtio_hw *hw = eth_dev->data->dev_private;int ret;if (sizeof(struct virtio_net_hdr_mrg_rxbuf) > RTE_PKTMBUF_HEADROOM) {PMD_INIT_LOG(ERR,"Not sufficient headroom required = %d, avail = %d",(int)sizeof(struct virtio_net_hdr_mrg_rxbuf),RTE_PKTMBUF_HEADROOM);return -1;}eth_dev->dev_ops = &virtio_eth_dev_ops;if (rte_eal_process_type() == RTE_PROC_SECONDARY) {if (!hw->virtio_user_dev) {ret = virtio_remap_pci(RTE_ETH_DEV_TO_PCI(eth_dev), hw);if (ret)return ret;}virtio_set_vtpci_ops(hw);set_rxtx_funcs(eth_dev);return 0;}/** Pass the information to the rte_eth_dev_close() that it should also* release the private port resources.*/eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE;/* Allocate memory for storing MAC addresses */eth_dev->data->mac_addrs = rte_zmalloc("virtio",VIRTIO_MAX_MAC_ADDRS * RTE_ETHER_ADDR_LEN, 0);if (eth_dev->data->mac_addrs == NULL) {PMD_INIT_LOG(ERR,"Failed to allocate %d bytes needed to store MAC addresses",VIRTIO_MAX_MAC_ADDRS * RTE_ETHER_ADDR_LEN);return -ENOMEM;}hw->port_id = eth_dev->data->port_id;/* For virtio_user case the hw->virtio_user_dev is populated by* virtio_user_eth_dev_alloc() before eth_virtio_dev_init() is called.*/if (!hw->virtio_user_dev) {ret = vtpci_init(RTE_ETH_DEV_TO_PCI(eth_dev), hw);if (ret)goto err_vtpci_init;}rte_spinlock_init(&hw->state_lock);/* reset device and negotiate default features * /ret = virtio_init_device(eth_dev, VIRTIO_PMD_DEFAULT_GUEST_FEATURES);if (ret < 0)goto err_virtio_init;hw->opened = true;return 0;err_virtio_init:if (!hw->virtio_user_dev) {rte_pci_unmap_device(RTE_ETH_DEV_TO_PCI(eth_dev));if (!hw->modern)rte_pci_ioport_unmap(VTPCI_IO(hw));}
err_vtpci_init:rte_free(eth_dev->data->mac_addrs);eth_dev->data->mac_addrs = NULL;return ret;
}
这个函数的注释也说明了它是基于virtio_pci.c中的probe()函数实现的,这与前面提到的底层抽象的具体实现相互呼应。在这个函数里,主要是对前面描述的数据结构进行初始化,对相关的virtio设备进行初始化,同时对一些交互的特征值(feature)进行预定义和协商,包括对数据队列的初始化。
收发函数的设置在:
/* set rx and tx handlers according to what is supported */
static void
set_rxtx_funcs(struct rte_eth_dev * eth_dev)
{struct virtio_hw * hw = eth_dev->data->dev_private;eth_dev->tx_pkt_prepare = virtio_xmit_pkts_prepare;if (vtpci_packed_queue(hw)) {PMD_INIT_LOG(INFO,"virtio: using packed ring %s Tx path on port %u",hw->use_inorder_tx ? "inorder" : "standard",eth_dev->data->port_id);eth_dev->tx_pkt_burst = virtio_xmit_pkts_packed;} else {if (hw->use_inorder_tx) {PMD_INIT_LOG(INFO, "virtio: using inorder Tx path on port %u",eth_dev->data->port_id);eth_dev->tx_pkt_burst = virtio_xmit_pkts_inorder;} else {PMD_INIT_LOG(INFO, "virtio: using standard Tx path on port %u",eth_dev->data->port_id);eth_dev->tx_pkt_burst = virtio_xmit_pkts;}}if (vtpci_packed_queue(hw)) {if (vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {PMD_INIT_LOG(INFO,"virtio: using packed ring mergeable buffer Rx path on port %u",eth_dev->data->port_id);eth_dev->rx_pkt_burst =&virtio_recv_mergeable_pkts_packed;} else {PMD_INIT_LOG(INFO,"virtio: using packed ring standard Rx path on port %u",eth_dev->data->port_id);eth_dev->rx_pkt_burst = &virtio_recv_pkts_packed;}} else {if (hw->use_simple_rx) {PMD_INIT_LOG(INFO, "virtio: using simple Rx path on port %u",eth_dev->data->port_id);eth_dev->rx_pkt_burst = virtio_recv_pkts_vec;} else if (hw->use_inorder_rx) {PMD_INIT_LOG(INFO,"virtio: using inorder Rx path on port %u",eth_dev->data->port_id);eth_dev->rx_pkt_burst = &virtio_recv_pkts_inorder;} else if (vtpci_with_feature(hw, VIRTIO_NET_F_MRG_RXBUF)) {PMD_INIT_LOG(INFO,"virtio: using mergeable buffer Rx path on port %u",eth_dev->data->port_id);eth_dev->rx_pkt_burst = &virtio_recv_mergeable_pkts;} else {PMD_INIT_LOG(INFO, "virtio: using standard Rx path on port %u",eth_dev->data->port_id);eth_dev->rx_pkt_burst = &virtio_recv_pkts;}}}
其中virtio_recv_pkts和virtio_xmit_pkts这两个系列的函数,分别负责接收和发送这两大类具体的工作。
下面再看一下virtio_recv_pkts函数的定义(均定义在drivers/net/virtio/virtio_rxtx.c中):
#define VIRTIO_MBUF_BURST_SZ 64
#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc))
uint16_t
virtio_recv_pkts(void * rx_queue, struct rte_mbuf ** rx_pkts, uint16_t nb_pkts)
{struct virtnet_rx * rxvq = rx_queue;struct virtqueue * vq = rxvq->vq;struct virtio_hw * hw = vq->hw;struct rte_mbuf * rxm;uint16_t nb_used, num, nb_rx;uint32_t len[VIRTIO_MBUF_BURST_SZ];struct rte_mbuf * rcv_pkts[VIRTIO_MBUF_BURST_SZ];int error;uint32_t i, nb_enqueued;uint32_t hdr_size;struct virtio_net_hdr * hdr;nb_rx = 0;if (unlikely(hw->started == 0))return nb_rx;nb_used = VIRTQUEUE_NUSED(vq);virtio_rmb(hw->weak_barriers);num = likely(nb_used <= nb_pkts) ? nb_used : nb_pkts;if (unlikely(num > VIRTIO_MBUF_BURST_SZ))num = VIRTIO_MBUF_BURST_SZ;if (likely(num > DESC_PER_CACHELINE))num = num - ((vq->vq_used_cons_idx + num) % DESC_PER_CACHELINE);num = virtqueue_dequeue_burst_rx(vq, rcv_pkts, len, num);PMD_RX_LOG(DEBUG, "used:%d dequeue:%d", nb_used, num);nb_enqueued = 0;hdr_size = hw->vtnet_hdr_size;for (i = 0; i < num ; i++) {rxm = rcv_pkts[i];PMD_RX_LOG(DEBUG, "packet len:%d", len[i]);if (unlikely(len[i] < hdr_size + RTE_ETHER_HDR_LEN)) {PMD_RX_LOG(ERR, "Packet drop");nb_enqueued++;virtio_discard_rxbuf(vq, rxm);rxvq->stats.errors++;continue;}rxm->port = rxvq->port_id;rxm->data_off = RTE_PKTMBUF_HEADROOM;rxm->ol_flags = 0;rxm->vlan_tci = 0;rxm->pkt_len = (uint32_t)(len[i] - hdr_size);rxm->data_len = (uint16_t)(len[i] - hdr_size);hdr = (struct virtio_net_hdr * )((char * )rxm->buf_addr +RTE_PKTMBUF_HEADROOM - hdr_size);if (hw->vlan_strip)rte_vlan_strip(rxm);if (hw->has_rx_offload && virtio_rx_offload(rxm, hdr) < 0) {virtio_discard_rxbuf(vq, rxm);rxvq->stats.errors++;continue;}virtio_rx_stats_updated(rxvq, rxm);rx_pkts[nb_rx++] = rxm;}rxvq->stats.packets += nb_rx;/* Allocate new mbuf for the used descriptor * /if (likely(!virtqueue_full(vq))) {uint16_t free_cnt = vq->vq_free_cnt;struct rte_mbuf * new_pkts[free_cnt];if (likely(rte_pktmbuf_alloc_bulk(rxvq->mpool, new_pkts,free_cnt) == 0)) {error = virtqueue_enqueue_recv_refill(vq, new_pkts,free_cnt);if (unlikely(error)) {for (i = 0; i < free_cnt; i++)rte_pktmbuf_free(new_pkts[i]);}nb_enqueued += free_cnt;} else {struct 
rte_eth_dev * dev =&rte_eth_devices[rxvq->port_id];dev->data->rx_mbuf_alloc_failed += free_cnt;}}if (likely(nb_enqueued)) {vq_update_avail_idx(vq);if (unlikely(virtqueue_kick_prepare(vq))) {virtqueue_notify(vq);PMD_RX_LOG(DEBUG, "Notified");}}return nb_rx;
}
先是基础数据结构的变量定义,然后利用likely/unlikely进行初步的判断(设备是否已启动、本次最多处理多少个报文),再通过virtqueue_dequeue_burst_rx得到报文的数量,包括对应的描述符,读取其内部数据,完毕后将desc回收。然后通过循环逐个处理具体的报文数据。最后分配新的mbuf并插入到可用队列,更新可用desc,同时在需要时通知vhost。
再看一下virtio_xmit_pkts函数定义:
uint16_t
virtio_xmit_pkts(void * tx_queue, struct rte_mbuf ** tx_pkts, uint16_t nb_pkts)
{struct virtnet_tx * txvq = tx_queue;struct virtqueue * vq = txvq->vq;struct virtio_hw * hw = vq->hw;uint16_t hdr_size = hw->vtnet_hdr_size;uint16_t nb_used, nb_tx = 0;if (unlikely(hw->started == 0 && tx_pkts != hw->inject_pkts))return nb_tx;if (unlikely(nb_pkts < 1))return nb_pkts;PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);nb_used = VIRTQUEUE_NUSED(vq);virtio_rmb(hw->weak_barriers);if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))virtio_xmit_cleanup(vq, nb_used);for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {struct rte_mbuf * txm = tx_pkts[nb_tx];int can_push = 0, use_indirect = 0, slots, need;/* optimize ring usage */if ((vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) ||vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) &&rte_mbuf_refcnt_read(txm) == 1 &&RTE_MBUF_DIRECT(txm) &&txm->nb_segs == 1 &&rte_pktmbuf_headroom(txm) >= hdr_size &&rte_is_aligned(rte_pktmbuf_mtod(txm, char *),__alignof__(struct virtio_net_hdr_mrg_rxbuf)))can_push = 1;else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)use_indirect = 1;/* How many main ring entries are needed to this Tx?* any_layout => number of segments* indirect => 1* default => number of segments + 1*/slots = use_indirect ? 1 : (txm->nb_segs + !can_push);need = slots - vq->vq_free_cnt;/* Positive value indicates it need free vring descriptors */if (unlikely(need > 0)) {nb_used = VIRTQUEUE_NUSED(vq);virtio_rmb(hw->weak_barriers);need = RTE_MIN(need, (int)nb_used);virtio_xmit_cleanup(vq, need);need = slots - vq->vq_free_cnt;if (unlikely(need > 0)) {PMD_TX_LOG(ERR,"No free tx descriptors to transmit");break;}}/* Enqueue Packet buffers * /virtqueue_enqueue_xmit(txvq, txm, slots, use_indirect,can_push, 0);virtio_update_packet_stats(&txvq->stats, txm);}txvq->stats.packets += nb_tx;if (likely(nb_tx)) {vq_update_avail_idx(vq);if (unlikely(virtqueue_kick_prepare(vq))) {virtqueue_notify(vq);PMD_TX_LOG(DEBUG, "Notified backend after xmit");}}return nb_tx;
}
真正的数据发送发生在循环中调用virtqueue_enqueue_xmit这个函数上,其它的都是一些相关状态及数量等的准备。发送完成后更新状态参数。再看一下调用的实际发送函数:
static inline void
virtqueue_enqueue_xmit(struct virtnet_tx *txvq, struct rte_mbuf *cookie,uint16_t needed, int use_indirect, int can_push,int in_order)
{struct virtio_tx_region * txr = txvq->virtio_net_hdr_mz->addr;struct vq_desc_extra * dxp;struct virtqueue * vq = txvq->vq;struct vring_desc * start_dp;uint16_t seg_num = cookie->nb_segs;uint16_t head_idx, idx;int16_t head_size = vq->hw->vtnet_hdr_size;bool prepend_header = false;struct virtio_net_hdr * hdr;head_idx = vq->vq_desc_head_idx;idx = head_idx;if (in_order)dxp = &vq->vq_descx[vq->vq_avail_idx & (vq->vq_nentries - 1)];elsedxp = &vq->vq_descx[idx];dxp->cookie = (void * )cookie;dxp->ndescs = needed;start_dp = vq->vq_split.ring.desc;if (can_push) {/* prepend cannot fail, checked by caller */hdr = rte_pktmbuf_mtod_offset(cookie, struct virtio_net_hdr *,-head_size);prepend_header = true;/* if offload disabled, it is not zeroed below, do it now */if (!vq->hw->has_tx_offload)virtqueue_clear_net_hdr(hdr);} else if (use_indirect) {/* setup tx ring slot to point to indirect* descriptor list stored in reserved region.** the first slot in indirect ring is already preset* to point to the header in reserved region*/start_dp[idx].addr = txvq->virtio_net_hdr_mem +RTE_PTR_DIFF(&txr[idx].tx_indir, txr);start_dp[idx].len = (seg_num + 1) * sizeof(struct vring_desc);start_dp[idx].flags = VRING_DESC_F_INDIRECT;hdr = (struct virtio_net_hdr *)&txr[idx].tx_hdr;/* loop below will fill in rest of the indirect elements */start_dp = txr[idx].tx_indir;idx = 1;} else {/* setup first tx ring slot to point to header* stored in reserved region.* /start_dp[idx].addr = txvq->virtio_net_hdr_mem +RTE_PTR_DIFF(&txr[idx].tx_hdr, txr);start_dp[idx].len = vq->hw->vtnet_hdr_size;start_dp[idx].flags = VRING_DESC_F_NEXT;hdr = (struct virtio_net_hdr * )&txr[idx].tx_hdr;idx = start_dp[idx].next;}virtqueue_xmit_offload(hdr, cookie, vq->hw->has_tx_offload);do {start_dp[idx].addr = VIRTIO_MBUF_DATA_DMA_ADDR(cookie, vq);start_dp[idx].len = cookie->data_len;if (prepend_header) {start_dp[idx].addr -= head_size;start_dp[idx].len += head_size;prepend_header = false;}start_dp[idx].flags = cookie->next ? 
VRING_DESC_F_NEXT : 0;idx = start_dp[idx].next;} while ((cookie = cookie->next) != NULL);if (use_indirect)idx = vq->vq_split.ring.desc[head_idx].next;vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - needed);vq->vq_desc_head_idx = idx;vq_update_avail_ring(vq, head_idx);if (!in_order) {if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END)vq->vq_desc_tail_idx = idx;}
}
在这个函数中其实就可以看到前面提到的队列和描述符的操作动作了,特别是一些数据结构来回填充、回收使用的过程。开头的if/else if/else三个分支分别处理can_push(把virtio头放进mbuf的headroom)、indirect(使用间接描述符表)以及默认(头部单独占用一个描述符)这三种情况;重点看一下分支之后的do循环,它把报文的各个数据段逐一填入描述符。
四、总结
其实通过这几篇初步的分析,就可以明白virtio在DPDK中的应用方式,其实和Virtio的设计并没有原则的不同。只是在支持上进行了优化和简化,毕竟DPDK中只是对网卡一种设备的应用。学习别人源码的目的是什么?就是为了学以致用,更是为了在此基础上发展自己的框架和基础开发库。不应该为了应用而应用,这也是网上讽刺的“外国开源,国内自研;外国闭源,国内卡脖子”。
ChatGPT越火,越是要沉住气,任重而道远啊!