Linux内核DCB子系统(2)


Dcbtool
Dcbtool可以用来查询和设置DCB以太网接口的DCB特性。通用的命令有gc、sc等(对应DCB模块的get和set函数),主要可以参考https://github.com/jrfastab/lldpad 。比如:
<gc|go> dcbx: gets the configured or operational version of the DCB capabilities exchange protocol.
sc可以设置本地interface的配置特性:
sc <ifname> <feature> <args>: sets the configuration of feature on interface ifname.
这些特性feature包括:
        dcb    DCB state of the port
        pg     priority groups
               pgid:xxxxxxxx  Priority group ID for the 8 priorities. From left to right (priorities 0-7), x is the corresponding priority group ID value, which can be 0-7 for priority groups with bandwidth allocations or f (priority group ID 15) for the unrestricted priority group.
        pfc    priority flow control(特殊的参数有pfcup:xxxxxxxx)。x是0或1,1指的是相应的优先级(总共有8个优先级0-7,从左到右排列)使用pause帧机制,0就表示不使用。
        app:<subtype>  application specific data。特殊的参数是appcfg:xx,xx是一个16进制的值,代表一个8bit的bitmap,某一位为1代表这个subtype使用对应的优先级。
               subtype can be:
               ---------------
               0|fcoe  Fibre Channel over Ethernet (FCoE)
下面是dcbtool的使用例子。达到的目的:使PFC pause只对优先级3生效,并且将FCoE流量分配到优先级3上(pfcup从左到右第4位对应优先级3;appcfg:08即bitmap的bit 3置1)。
      dcbtool sc eth2 pfc pfcup:00010000
      dcbtool sc eth2 app:0 appcfg:08   (app:0表示FCoE流量)
另外带宽分配的部分就是pg,比如:
      dcbtool sc eth2 pg pgid:0000111f pgpct:25,75,0,0,0,0,0,0
使用Netlink和内核DCB子系统交互

在net/netlink/af_netlink.c中:

/*
 * Abridged excerpt of the netlink init routine: only the step that matters
 * for this walk-through is shown, namely registering netlink_family_ops
 * with the socket layer. The remaining statements of the function
 * (including its return) are elided in the article.
 */
static int __init netlink_proto_init(void)
{
	/* ... */
	sock_register(&netlink_family_ops);
	/* ... */
}

static const struct net_proto_familynetlink_family_ops = {

.family = PF_NETLINK,

.create = netlink_create,

.owner    = THIS_MODULE, /* for consistency 8) */

};


 

在netlink_create中调用

/*
 * Abridged excerpt: shows only how a new netlink socket gets its operations
 * table and its struct sock. The rest of the function (including the return
 * statement and any use of nlk) is not shown here.
 */
static int __netlink_create(struct net *net, struct socket *sock,

struct mutex *cb_mutex, int protocol)

{

struct sock *sk;

/* declared but not used within this excerpt */
struct netlink_sock *nlk;

/* socket-level calls on this socket are dispatched through netlink_ops */
sock->ops = &netlink_ops;

/* allocate the underlying sock for the PF_NETLINK family */
sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto);

}

static const struct proto_opsnetlink_ops= {

.family = PF_NETLINK,

.owner = THIS_MODULE,

.release =      netlink_release,

.bind =          netlink_bind,

.connect =    netlink_connect,

.socketpair = sock_no_socketpair,

.accept =      sock_no_accept,

.getname =    netlink_getname,

.poll =          datagram_poll,

.ioctl =  sock_no_ioctl,

.listen =  sock_no_listen,

.shutdown =  sock_no_shutdown,

.setsockopt =      netlink_setsockopt,

.getsockopt =      netlink_getsockopt,

.sendmsg =  netlink_sendmsg,//这可能就是和那些系统调用对应的函数

.recvmsg =    netlink_recvmsg,

.mmap =              sock_no_mmap,

.sendpage =  sock_no_sendpage,

};


 

数据结构dcbnl_ops
这里的dcbnl_ops应该算是routing子系统中需要用到的一个结构。为了进一步理解,还是把这个过程弄清楚吧。到现在,lldpad怎么操作已经基本上梳理清楚;至于lldpad怎么和内核程序进行交互,应该要看一看struct dcbnl_rtnl_ops出现在内核的哪些部分。include/net/dcbnl.h定义了这个结构。
定义:include/net/dcbnl.h, line 46
  42 /*
  43  * Ops struct for the netlink callbacks.  Used by DCB-enabled drivers through
  44  * the netdevice struct.
  45  */
  46 struct dcbnl_rtnl_ops {
  47        /* IEEE 802.1Qaz std */
     ...
  56        int (*ieee_delapp) (struct net_device *, struct dcb_app *);
     ...
  59
  60        /* CEE std */
  61        u8  (*getstate)(struct net_device *);
     ...
                            u16 *);
  96        int (*peer_getapptable)(struct net_device *, struct dcb_app *);
  97
  98        /* CEE peer */
  99        int (*cee_peer_getpg) (struct net_device *, struct cee_pg *);
 100        int (*cee_peer_getpfc) (struct net_device *, struct cee_pfc *);
 101 };
 102
 103 #endif /* __NET_DCBNL_H__ */
 104
被引用(referenced in)的位置:驱动(DCB enabled)和DCB模块
然后referenced in很多的地方:被很多的驱动引用,比如broadcom的bnx2x(这个编译内核的时候没有必要选上吧)、mellanox/mlx4、qlogic/qlcnic、intel/ixgbe等。这些驱动基本上都定义并且实现了这个结构,并且把这个结构作为netdev的子结构传递给netdev结构体。
然后在net/dcb/dcbnl.c(DCB模块)中有这些函数:
static int dcbnl_build_peer_app(struct net_device *netdev, struct sk_buff* skb, int app_nested_type, int app_info_type, int app_entry_type)
这个函数里面使用了一个指针变量ops,将这个netdev的dcbnl_ops传递给它:
const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
1035 {
1036        struct nlattr *ieee, *app;
1037        struct dcb_app_type *itr;
1038        const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
1039        int dcbx;
static int dcbnl_cee_pg_fill(struct sk_buff *skb, struct net_device *dev,
1142                              int dir)
1143 {
1144        u8 pgid, up_map, prio, tc_pct;
1145        const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops;
还有dcbnl_notify、dcbnl_cee_fill、dcbnl_cee_get(/* Handle CEE DCBX GET commands. */)。
DCB模块
net/dcb目录

主要是和应用程序交互,解析应用程序的包,执行相关的功能,然后去调用变量的callback函数进行get或者set操作,再将结果反馈给应用程序lldpad。

DCB子系统注册rtnetlink
感觉看一下这个3.2的代码完全没有什么问题的呀。差不多就是那个论文中提到的那样子

Note:翻译自代码,可是翻译起来真的好带感呢。

__rtnl_register函数:注册一个rtnetlink消息类型(是提供给模块自己注册的哦)

参数

@protocol:协议家族或者PF_UNSPEC

@msgtype:rtnetlink的消息类型

@doit:每次请求消息调用的函数指针

@dumpit:每次dump请求(NLM_F_DUMP)调用的函数指针

@calcit:计算dump消息大小的函数指针

/*
 * Per-protocol handler tables, indexed by protocol family. Each slot is
 * NULL until __rtnl_register() allocates an array of RTM_NR_MSGTYPES
 * entries for that protocol.
 */
static struct rtnl_link*rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];

int __rtnl_register(int protocol, intmsgtype,

rtnl_doit_func doit, rtnl_dumpit_funcdumpit,

rtnl_calcit_func calcit){

struct rtnl_link *tab;

intmsgindex;

BUG_ON(protocol< 0 || protocol > RTNL_FAMILY_MAX);

msgindex= rtm_msgindex(msgtype);

tab= rtnl_msg_handlers[protocol];

if(tab == NULL) {

tab= kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL);

if(tab == NULL)

return-ENOBUFS;

rtnl_msg_handlers[protocol]= tab;

}

if(doit)

tab[msgindex].doit= doit;

if(dumpit)

tab[msgindex].dumpit= dumpit;

if(calcit)

tab[msgindex].calcit= calcit;

return0;

}

EXPORT_SYMBOL_GPL(__rtnl_register);

void rtnl_register(int protocol, intmsgtype,

rtnl_doit_func doit, rtnl_dumpit_funcdumpit,

rtnl_calcit_func calcit)

{

if(__rtnl_register(protocol, msgtype, doit, dumpit, calcit) < 0)

panic("Unableto register rtnetlink message handler, "

"protocol = %d, message type =%d\n",

protocol, msgtype);

}

EXPORT_SYMBOL_GPL(rtnl_register);

当消息到达doit之后:

net/dcb/dcbnl.c中的

/*
 * DCB netlink module init: initialises the dcb_app_list list head and
 * registers dcb_doit as the request handler for both RTM_GETDCB and
 * RTM_SETDCB (no dump or calcit callbacks). Always returns 0.
 */
static int __init dcbnl_init(void)
{
	INIT_LIST_HEAD(&dcb_app_list);

	rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL, NULL);

	return 0;
}

然后这个dcb_doit做了很多的事情,比如说解析skb和其他头部,然后进行相关的操作。

static int dcb_doit(struct sk_buff *skb,structnlmsghdr *nlh, void *arg)

{

structnet *net = sock_net(skb->sk);

structnet_device *netdev;

structdcbmsg  *dcb = (struct dcbmsg *)NLMSG_DATA(nlh);

structnlattr *tb[DCB_ATTR_MAX + 1];

u32pid = skb ? NETLINK_CB(skb).pid : 0;

intret = -EINVAL;

if(!net_eq(net, &init_net))

return-EINVAL;

ret= nlmsg_parse(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX,

dcbnl_rtnl_policy);

if(ret < 0)

returnret;

if(!tb[DCB_ATTR_IFNAME])

return-EINVAL;

netdev= dev_get_by_name(&init_net, nla_data(tb[DCB_ATTR_IFNAME]));

if(!netdev)

return-EINVAL;

if(!netdev->dcbnl_ops)

gotoerrout;

switch(dcb->cmd) {

caseDCB_CMD_GSTATE:

ret= dcbnl_getstate(netdev, tb, pid, nlh->nlmsg_seq,

nlh->nlmsg_flags);

/**

* enum dcbnl_attrs - DCB top-level netlink attributes

*

* @DCB_ATTR_UNDEFINED: unspecified attribute to catch errors

* @DCB_ATTR_IFNAME: interface name of the underlying device (NLA_STRING)

* @DCB_ATTR_STATE: enable state of DCB in the device (NLA_U8)

* @DCB_ATTR_PFC_STATE: enable state of PFC in the device (NLA_U8)

* @DCB_ATTR_PFC_CFG: priority flow control configuration (NLA_NESTED)

* @DCB_ATTR_NUM_TC: number of traffic classes supported in the device (NLA_U8)

* @DCB_ATTR_PG_CFG: priority group configuration (NLA_NESTED)

* @DCB_ATTR_SET_ALL: bool to commit changes to hardware or not (NLA_U8)

* @DCB_ATTR_PERM_HWADDR: MAC address of the physical device (NLA_NESTED)

* @DCB_ATTR_CAP: DCB capabilities of the device (NLA_NESTED)

* @DCB_ATTR_NUMTCS: number of traffic classes supported (NLA_NESTED)

* @DCB_ATTR_BCN: backward congestion notification configuration (NLA_NESTED)

* @DCB_ATTR_IEEE: IEEE 802.1Qaz supported attributes (NLA_NESTED)

* @DCB_ATTR_DCBX: DCBX engine configuration in the device (NLA_U8)

* @DCB_ATTR_FEATCFG: DCBX features flags (NLA_NESTED)

* @DCB_ATTR_CEE: CEE std supported attributes (NLA_NESTED)

*/
 

/*
 * Netlink attribute header: two 16-bit fields.
 * NOTE(review): presumably nla_len is the attribute length and nla_type the
 * attribute type used as the index into the tb[] array filled by
 * nla_parse() — confirm against the netlink headers.
 */
struct nlattr {

__u16          nla_len;

__u16          nla_type;

};
 

/*
 * Netlink message header. dcb_doit() passes nlmsg_seq and nlmsg_flags on to
 * the command handler, and nlmsg_parse() checks nlmsg_len against the
 * minimum size before parsing attributes.
 */
struct nlmsghdr {

__u32                nlmsg_len;      /* Length of message including header */

__u16                nlmsg_type;    /* Message content */

__u16                nlmsg_flags;    /* Additional flags */

__u32                nlmsg_seq;      /* Sequence number */

__u32                nlmsg_pid;      /* Sending process port ID */

};
 

自定义的一些DCB属性,比如:

/*
 * Validation policy for the top-level DCB netlink attributes.
 * Passed to nlmsg_parse() by dcb_doit(); entries are grouped by the
 * attribute type they enforce.
 */
static const struct nla_policy dcbnl_rtnl_policy[DCB_ATTR_MAX + 1] = {
	/* interface name: NUL-terminated string of at most IFNAMSIZ - 1 chars */
	[DCB_ATTR_IFNAME]	= { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 },

	/* single-byte values */
	[DCB_ATTR_STATE]	= { .type = NLA_U8 },
	[DCB_ATTR_SET_ALL]	= { .type = NLA_U8 },
	[DCB_ATTR_PFC_STATE]	= { .type = NLA_U8 },
	[DCB_ATTR_DCBX]		= { .type = NLA_U8 },

	/* presence-only flag */
	[DCB_ATTR_PERM_HWADDR]	= { .type = NLA_FLAG },

	/* nested attribute containers */
	[DCB_ATTR_PFC_CFG]	= { .type = NLA_NESTED },
	[DCB_ATTR_PG_CFG]	= { .type = NLA_NESTED },
	[DCB_ATTR_CAP]		= { .type = NLA_NESTED },
	[DCB_ATTR_BCN]		= { .type = NLA_NESTED },
	[DCB_ATTR_APP]		= { .type = NLA_NESTED },
	[DCB_ATTR_IEEE]		= { .type = NLA_NESTED },
	[DCB_ATTR_FEATCFG]	= { .type = NLA_NESTED },
};
 

关于rtmsg的解析,是这样实现的

/**
 * nlmsg_parse - parse the attributes carried by a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of the family specific header that precedes the attributes
 * @tb: destination array with maxtype+1 elements
 * @maxtype: highest attribute type to be expected
 * @policy: validation policy
 *
 * Rejects messages too short to hold the family header with -EINVAL;
 * otherwise hands the attribute area to nla_parse() and returns its result.
 */
static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
			      struct nlattr *tb[], int maxtype,
			      const struct nla_policy *policy)
{
	if (!(nlh->nlmsg_len < nlmsg_msg_size(hdrlen)))
		return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen),
				 nlmsg_attrlen(nlh, hdrlen), policy);

	return -EINVAL;
}

Parses a stream of attributes and stores a pointer to each attribute in

* the tb array accessible via the attribute type. Attributes with a type

* exceeding maxtype will be silently ignored for backwards compatibility

* reasons. policy may be set to NULL if no validation is required.

*

* Returns 0 on success or a negative error code.

*/

int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,

int len, const struct nla_policy *policy)

{

const struct nlattr *nla;

int rem, err;

memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));

nla_for_each_attr(nla, head, len, rem) {

u16 type = nla_type(nla);

if (type > 0 && type <= maxtype) {

if (policy) {

err = validate_nla(nla, maxtype, policy);

if (err < 0)

goto errout;

}

tb[type] = (struct nlattr *)nla;

}

}


 

关于dcbmsg结构是在dcbnl.h中定义的

/*
 * Family-specific header of a DCB netlink message; the attributes follow it.
 * dcb_doit() dispatches on cmd, and dcbnl_reply() fills a reply header with
 * dcb_family = AF_UNSPEC, the command being answered, and dcb_pad = 0.
 */
struct dcbmsg {

__u8              dcb_family;

__u8              cmd;

__u16              dcb_pad;

};

命令包括:

/**

* enum dcbnl_commands - supported DCB commands

*

* @DCB_CMD_UNDEFINED: unspecified command to catch errors

* @DCB_CMD_GSTATE: request the state of DCB in the device

* @DCB_CMD_SSTATE: set the state of DCB in the device

* @DCB_CMD_PGTX_GCFG: request the priority group configuration for Tx

* @DCB_CMD_PGTX_SCFG: set the priority group configuration for Tx

* @DCB_CMD_PGRX_GCFG: request the priority group configuration for Rx

* @DCB_CMD_PGRX_SCFG: set the priority group configuration for Rx

* @DCB_CMD_PFC_GCFG: request the priority flow control configuration

* @DCB_CMD_PFC_SCFG: set the priority flow control configuration

* @DCB_CMD_SET_ALL: apply all changes to the underlying device

* @DCB_CMD_GPERM_HWADDR: get the permanent MAC address of the underlying

*                        device.  Only useful when using bonding.

* @DCB_CMD_GCAP: request the DCB capabilities of the device

* @DCB_CMD_GNUMTCS: get the number of traffic classes currently supported

* @DCB_CMD_SNUMTCS: set the number of traffic classes

* @DCB_CMD_GBCN: get backward congestion notification configuration

* @DCB_CMD_SBCN: set backward congestion notification configuration.

* @DCB_CMD_GAPP: get application protocol configuration

* @DCB_CMD_SAPP: set application protocol configuration

* @DCB_CMD_IEEE_SET: set IEEE 802.1Qaz configuration

* @DCB_CMD_IEEE_GET: get IEEE 802.1Qaz configuration

* @DCB_CMD_GDCBX: get DCBX engine configuration

* @DCB_CMD_SDCBX: set DCBX engine configuration

* @DCB_CMD_GFEATCFG: get DCBX features flags

* @DCB_CMD_SFEATCFG: set DCBX features negotiation flags

* @DCB_CMD_CEE_GET: get CEE aggregated configuration

* @DCB_CMD_IEEE_DEL: delete IEEE 802.1Qaz configuration

*/
 

一个标准的netlink reply调用的例子如下,比如get的时候基本上就是调用了它。

/*
 * Build and unicast a minimal DCB netlink reply that carries one u8
 * attribute.
 *
 * @value: attribute payload
 * @event: netlink message type of the reply
 * @cmd:   DCB command being answered (stored in dcbmsg.cmd)
 * @attr:  attribute type under which @value is stored
 * @pid:   destination netlink port id
 * @seq:   sequence number copied into the reply header
 * @flags: netlink message flags for the reply
 *
 * Returns -EINVAL if the reply skb cannot be allocated.
 *
 * NOTE: the article quotes the function only up to the rtnl_unicast() call;
 * the error labels and the final return are not shown.
 */
static int dcbnl_reply(u8 value, u8 event, u8 cmd, u8 attr, u32 pid,
		       u32 seq, u16 flags)
{
	struct sk_buff *dcbnl_skb;
	struct dcbmsg *dcb;
	struct nlmsghdr *nlh;
	int ret = -EINVAL;

	dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!dcbnl_skb)
		return ret;

	nlh = NLMSG_NEW(dcbnl_skb, pid, seq, event, sizeof(*dcb), flags);

	dcb = NLMSG_DATA(nlh);
	dcb->dcb_family = AF_UNSPEC;
	dcb->cmd = cmd;
	dcb->dcb_pad = 0;

	ret = nla_put_u8(dcbnl_skb, attr, value);
	if (ret)
		goto err;

	/* end the message, assign the nlmsg_len. */
	nlmsg_end(dcbnl_skb, nlh);
	ret = rtnl_unicast(dcbnl_skb, &init_net, pid);
	/* excerpt ends here */
 

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:https://www.heiqu.com/16649.html