Dcbtool
Dcbtool可以用来查询和设置DCB以太网接口的DCB特性。通用的命令有gc、sc等(对应DCB模块的get和set函数),主要可以参考 https://github.com/jrfastab/lldpad 。比如:
<gc|go> dcbx: gets the configured or operational version of the DCB capabilities exchange protocol.
可以设置本地interface的配置特性:sc <ifname> <feature> <args> sets the configuration of feature on interface ifname.
这些特性feature包括:
dcb: DCB state of the port
pg: priority groups。参数 pgid:xxxxxxxx — Priority group ID for the 8 priorities. From left to right (priorities 0-7), x is the corresponding priority group ID value, which can be 0-7 for priority groups with bandwidth allocations or f (priority group ID 15) for the unrestricted priority group.
pfc: priority flow control(特殊的参数有pfcup:xxxxxxxx)。x是0或1,1指的是这个相应的优先级(总共有8个优先级0-7嘛)使用传输的pause帧机制,0就表示不使用。
app:<subtype>: application specific data(特殊的参数是appcfg:xx)。xx是一个16进制的值,代表一个8bit的bitmap,某一位为1代表着这个subtype使用这个优先级。subtype can be: 0|fcoe — Fibre Channel over Ethernet (FCoE)
下面是dcbtool的使用例子。达到的目的:使得PFC pause发生作用的传输优先级是3,并且将FCoE流量分配到优先级3上。
dcbtool sc eth2 pfc pfcup:00010000
dcbtool sc eth2 app:0 appcfg:08 (app:0表示FCoE流量)
另外带宽分配的部分就是pg,比如:
dcbtool sc eth2 pg pgid:0000111f pgpct:25,75,0,0,0,0,0,0
使用Netlink和内核DCB子系统交互
在net\netlink\af_netlink.c中:
/*
 * Init routine for the netlink protocol family (excerpt).
 *
 * Only the call relevant to these notes is shown: it hands
 * netlink_family_ops to sock_register(). The rest of the body is
 * omitted from this excerpt.
 */
static int __init netlink_proto_init(void)
{
	/* ... earlier initialisation omitted in this excerpt ... */
	sock_register(&netlink_family_ops);
	/* ... remainder (including the return) omitted in this excerpt ... */
}
static const struct net_proto_familynetlink_family_ops = {
.family = PF_NETLINK,
.create = netlink_create,
.owner = THIS_MODULE, /* for consistency 8) */
};
在netlink_create中调用
/*
 * Excerpt of the netlink socket constructor helper.
 *
 * The visible part installs netlink_ops as the socket's operation table
 * and allocates the underlying struct sock with sk_alloc(); the rest of
 * the body is elided ("…") in these notes.
 */
static int __netlink_create(struct net *net, struct socket *sock,
struct mutex *cb_mutex, int protocol)
{
struct sock *sk;
struct netlink_sock *nlk;
/* every netlink socket gets the netlink_ops table for its socket operations */
sock->ops = &netlink_ops;
/* allocate the sock for family PF_NETLINK using the netlink_proto descriptor */
sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto);
…
}
static const struct proto_opsnetlink_ops= {
.family = PF_NETLINK,
.owner = THIS_MODULE,
.release = netlink_release,
.bind = netlink_bind,
.connect = netlink_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = netlink_getname,
.poll = datagram_poll,
.ioctl = sock_no_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = netlink_setsockopt,
.getsockopt = netlink_getsockopt,
.sendmsg = netlink_sendmsg,//这可能就是和那些系统调用对应的函数
.recvmsg = netlink_recvmsg,
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
};
数据结构dcbnl_ops
这里的dcbnl_ops应该算是routing子系统中的一个需要用到的结构。为了进一步理解,还是把这个过程弄清楚吧。 到现在lldpad怎么操作,基本上梳理清楚,至于lldpad怎么和内核程序进行交互。应该要看一看struct dcbnl_rtnl_ops出现在内核的哪些部分。include/net/dcbnl.h定义这个结构
定义:include/net/dcbnl.h, line 46
· *· 43 * Ops struct for the netlink callbacks. Used by DCB-enabled drivers through* the netdevice struct.· 45 */· 46 struct dcbnl_rtnl_ops {· 47 /* IEEE 802.1Qaz std */· 56 int (*ieee_delapp) (struct net_device *, struct dcb_app *);· 59· 60 /* CEE std */· 61 u8 (*getstate)(struct net_device *);· u16 *);· 96 int (*peer_getapptable)(struct net_device *, struct dcb_app *);· 97 · 98 /* CEE peer */· 99 int (*cee_peer_getpg) (struct net_device *, struct cee_pg *);· 100 int (*cee_peer_getpfc) (struct net_device *, struct cee_pfc *);· 101 };· 102· 103 #endif /* __NET_DCBNL_H__ */· 104
被驱动(DCB enabled)和DCB模块引用(referenced in)
然后referenced in很多的地方:被很多的驱动引用,比如broadcom的bnx2x(这个编译内核的时候没有必要选上吧)、mellanox/mlx4、qlogic/qlcnic、intel/ixgbe等。这些基本上都是定义并且实现了这个结构,并且把这个结构作为netdev的子结构传递给netdev结构体。
然后在net/dcb/dcbnl.c(DCB模块)中有这些函数:
static int dcbnl_build_peer_app(struct net_device *netdev, struct sk_buff *skb, int app_nested_type, int app_info_type, int app_entry_type)
然后这个里面使用了一个变量指针ops,将这个netdev的dcbnl_ops传递给它:
const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
{
struct nlattr *ieee, *app;
struct dcb_app_type *itr;
const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
int dcbx;
static int dcbnl_cee_pg_fill(struct sk_buff *skb, struct net_device *dev, int dir)
{
u8 pgid, up_map, prio, tc_pct;
const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops;
还有dcbnl_notify、dcbnl_cee_fill(dcbnl_cee_get,/* Handle CEE DCBX GET commands. */)
DCB模块
net/dcb目录
主要是和应用程序交互,解析应用程序的包,执行相关的功能,然后去调用变量的callback函数进行get或者set操作,再将结果反馈给应用程序lldpad。
DCB子系统注册rtnetlink
感觉看一下这个3.2的代码完全没有什么问题的呀。差不多就是那个论文中提到的那样子
Note:翻译自代码,可是翻译起来真的好带感呢。
__rtnl_register函数:注册一个rtnetlink消息类型(是提供给模块自己注册的哦)
参数
@protocol:协议家族或者PF_UNSPEC
@msgtype:rtnetlink的消息类型
@doit:每次请求消息调用的函数指针
@dumpit:每次dump请求(NLM_F_DUMP)调用的函数指针
@calcit:计算dump消息大小的函数指针
/*
 * One handler table per protocol family, indexed by protocol number.
 * Slots start out NULL and are filled in by __rtnl_register() the first
 * time a handler is registered for that family.
 */
static struct rtnl_link *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
int __rtnl_register(int protocol, intmsgtype,
rtnl_doit_func doit, rtnl_dumpit_funcdumpit,
rtnl_calcit_func calcit){
struct rtnl_link *tab;
intmsgindex;
BUG_ON(protocol< 0 || protocol > RTNL_FAMILY_MAX);
msgindex= rtm_msgindex(msgtype);
tab= rtnl_msg_handlers[protocol];
if(tab == NULL) {
tab= kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL);
if(tab == NULL)
return-ENOBUFS;
rtnl_msg_handlers[protocol]= tab;
}
if(doit)
tab[msgindex].doit= doit;
if(dumpit)
tab[msgindex].dumpit= dumpit;
if(calcit)
tab[msgindex].calcit= calcit;
return0;
}
EXPORT_SYMBOL_GPL(__rtnl_register);
void rtnl_register(int protocol, intmsgtype,
rtnl_doit_func doit, rtnl_dumpit_funcdumpit,
rtnl_calcit_func calcit)
{
if(__rtnl_register(protocol, msgtype, doit, dumpit, calcit) < 0)
panic("Unableto register rtnetlink message handler, "
"protocol = %d, message type =%d\n",
protocol, msgtype);
}
EXPORT_SYMBOL_GPL(rtnl_register);
当消息到达doit之后:
net/dcb/dcbnl.c中的
/*
 * DCB netlink init: initialises the dcb_app_list list head and registers
 * dcb_doit() as the doit handler for both RTM_GETDCB and RTM_SETDCB
 * under PF_UNSPEC. No dumpit/calcit handlers are registered.
 */
static int __init dcbnl_init(void)
{
	INIT_LIST_HEAD(&dcb_app_list);

	rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL, NULL);

	return 0;
}
然后这个dcb_doit做了很多的事情,比如说解析skb和其他头部,然后进行相关的操作。
static int dcb_doit(struct sk_buff *skb,structnlmsghdr *nlh, void *arg)
{
structnet *net = sock_net(skb->sk);
structnet_device *netdev;
structdcbmsg *dcb = (struct dcbmsg *)NLMSG_DATA(nlh);
structnlattr *tb[DCB_ATTR_MAX + 1];
u32pid = skb ? NETLINK_CB(skb).pid : 0;
intret = -EINVAL;
if(!net_eq(net, &init_net))
return-EINVAL;
ret= nlmsg_parse(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX,
dcbnl_rtnl_policy);
if(ret < 0)
returnret;
if(!tb[DCB_ATTR_IFNAME])
return-EINVAL;
netdev= dev_get_by_name(&init_net, nla_data(tb[DCB_ATTR_IFNAME]));
if(!netdev)
return-EINVAL;
if(!netdev->dcbnl_ops)
gotoerrout;
switch(dcb->cmd) {
caseDCB_CMD_GSTATE:
ret= dcbnl_getstate(netdev, tb, pid, nlh->nlmsg_seq,
nlh->nlmsg_flags);
/**
* enum dcbnl_attrs - DCB top-level netlink attributes
*
* @DCB_ATTR_UNDEFINED: unspecified attribute to catch errors
* @DCB_ATTR_IFNAME: interface name of the underlying device (NLA_STRING)
* @DCB_ATTR_STATE: enable state of DCB in the device (NLA_U8)
* @DCB_ATTR_PFC_STATE: enable state of PFC in the device (NLA_U8)
* @DCB_ATTR_PFC_CFG: priority flow control configuration (NLA_NESTED)
* @DCB_ATTR_NUM_TC: number of traffic classes supported in the device (NLA_U8)
* @DCB_ATTR_PG_CFG: priority group configuration (NLA_NESTED)
* @DCB_ATTR_SET_ALL: bool to commit changes to hardware or not (NLA_U8)
* @DCB_ATTR_PERM_HWADDR: MAC address of the physical device (NLA_NESTED)
* @DCB_ATTR_CAP: DCB capabilities of the device (NLA_NESTED)
* @DCB_ATTR_NUMTCS: number of traffic classes supported (NLA_NESTED)
* @DCB_ATTR_BCN: backward congestion notification configuration (NLA_NESTED)
* @DCB_ATTR_IEEE: IEEE 802.1Qaz supported attributes (NLA_NESTED)
* @DCB_ATTR_DCBX: DCBX engine configuration in the device (NLA_U8)
* @DCB_ATTR_FEATCFG: DCBX features flags (NLA_NESTED)
* @DCB_ATTR_CEE: CEE std supported attributes (NLA_NESTED)
*/
/*
 * Netlink attribute header: two 16-bit fields, length followed by type.
 * NOTE(review): presumably the attribute payload follows this header and
 * nla_len covers header plus payload - confirm against the netlink docs.
 */
struct nlattr {
__u16 nla_len;	/* attribute length */
__u16 nla_type;	/* attribute type */
};
/*
 * Netlink message header. In these notes, nlmsg_parse() checks nlmsg_len
 * against the family header size, and dcb_doit() forwards nlmsg_seq and
 * nlmsg_flags to dcbnl_getstate().
 */
struct nlmsghdr {
__u32 nlmsg_len; /* Length of message including header */
__u16 nlmsg_type; /* Message content */
__u16 nlmsg_flags; /* Additional flags */
__u32 nlmsg_seq; /* Sequence number */
__u32 nlmsg_pid; /* Sending process port ID */
};
自定义的一些DCB属性,比如:
/*
 * DCB netlink attributes policy
 *
 * Table indexed by DCB_ATTR_* giving the expected type of each top-level
 * attribute. dcb_doit() passes it to nlmsg_parse() when parsing a request.
 *
 * NOTE(review): DCB_ATTR_PERM_HWADDR is NLA_FLAG here, while the
 * enum dcbnl_attrs documentation describes it as NLA_NESTED - confirm
 * which one is intended.
 */
static const struct nla_policy dcbnl_rtnl_policy[DCB_ATTR_MAX + 1] = {
/* interface name: NUL-terminated string, at most IFNAMSIZ - 1 characters */
[DCB_ATTR_IFNAME] = {.type = NLA_NUL_STRING, .len = IFNAMSIZ - 1},
[DCB_ATTR_STATE] = {.type = NLA_U8},
[DCB_ATTR_PFC_CFG] = {.type = NLA_NESTED},
[DCB_ATTR_PG_CFG] = {.type = NLA_NESTED},
[DCB_ATTR_SET_ALL] = {.type = NLA_U8},
[DCB_ATTR_PERM_HWADDR] = {.type = NLA_FLAG},
[DCB_ATTR_CAP] = {.type = NLA_NESTED},
[DCB_ATTR_PFC_STATE] = {.type = NLA_U8},
[DCB_ATTR_BCN] = {.type = NLA_NESTED},
[DCB_ATTR_APP] = {.type = NLA_NESTED},
[DCB_ATTR_IEEE] = {.type = NLA_NESTED},
[DCB_ATTR_DCBX] = {.type = NLA_U8},
[DCB_ATTR_FEATCFG] = {.type = NLA_NESTED},
};
关于rtmsg的解析,是这样实现的
/**
 * nlmsg_parse - parse the attributes carried by a netlink message
 * @nlh: header of the netlink message
 * @hdrlen: size of the family specific header that precedes the attributes
 * @tb: output array, must hold maxtype+1 entries
 * @maxtype: highest attribute type the caller expects
 * @policy: policy used to validate the attributes
 *
 * Fails with -EINVAL when the message is too short to contain the family
 * header; otherwise the attribute area is handed to nla_parse() and its
 * result is returned. See nla_parse().
 */
static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
			      struct nlattr *tb[], int maxtype,
			      const struct nla_policy *policy)
{
	const struct nlattr *attr_head;
	int attr_len;

	/* message cannot even hold the family specific header */
	if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
		return -EINVAL;

	attr_head = nlmsg_attrdata(nlh, hdrlen);
	attr_len = nlmsg_attrlen(nlh, hdrlen);

	return nla_parse(tb, maxtype, attr_head, attr_len, policy);
}
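/**
* nla_parse - Parse a stream of attributes into a tb buffer
* @tb: destination array with maxtype+1 elements
* @maxtype: maximum attribute type to be expected
* @head: head of attribute stream
* @len: length of attribute stream
* @policy: validation policy
*
* Parses a stream of attributes and stores a pointer to each attribute in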
* the tb array accessible via the attribute type. Attributes with a type
* exceeding maxtype will be silently ignored for backwards compatibility
* reasons. policy may be set to NULL if no validation is required.
*
* Returns 0 on success or a negative error code.
*/
int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
int len, const struct nla_policy *policy)
{
const struct nlattr *nla;
int rem, err;
memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
nla_for_each_attr(nla, head, len, rem) {
u16 type = nla_type(nla);
if (type > 0 && type <= maxtype) {
if (policy) {
err = validate_nla(nla, maxtype, policy);
if (err < 0)
goto errout;
}
tb[type] = (struct nlattr *)nla;
}
}
关于dcbmsg结构是在dcbnl.h中定义的
/*
 * Family-specific header of a DCB netlink message. dcb_doit() obtains it
 * with NLMSG_DATA(nlh) and dispatches on cmd; dcbnl_reply() fills it in
 * when building a reply.
 */
struct dcbmsg {
__u8 dcb_family;	/* dcbnl_reply() sets this to AF_UNSPEC */
__u8 cmd;		/* DCB command; compared against DCB_CMD_* in dcb_doit() */
__u16 dcb_pad;		/* dcbnl_reply() sets this to 0 */
};
命令包括:
/**
* enum dcbnl_commands - supported DCB commands
*
* @DCB_CMD_UNDEFINED: unspecified command to catch errors
* @DCB_CMD_GSTATE: request the state of DCB in the device
* @DCB_CMD_SSTATE: set the state of DCB in the device
* @DCB_CMD_PGTX_GCFG: request the priority group configuration for Tx
* @DCB_CMD_PGTX_SCFG: set the priority group configuration for Tx
* @DCB_CMD_PGRX_GCFG: request the priority group configuration for Rx
* @DCB_CMD_PGRX_SCFG: set the priority group configuration for Rx
* @DCB_CMD_PFC_GCFG: request the priority flow control configuration
* @DCB_CMD_PFC_SCFG: set the priority flow control configuration
* @DCB_CMD_SET_ALL: apply all changes to the underlying device
* @DCB_CMD_GPERM_HWADDR: get the permanent MAC address of the underlying
* device. Only useful when using bonding.
* @DCB_CMD_GCAP: request the DCB capabilities of the device
* @DCB_CMD_GNUMTCS: get the number of traffic classes currently supported
* @DCB_CMD_SNUMTCS: set the number of traffic classes
* @DCB_CMD_GBCN: get backward congestion notification configuration
* @DCB_CMD_SBCN: set backward congestion notification configuration
* @DCB_CMD_GAPP: get application protocol configuration
* @DCB_CMD_SAPP: set application protocol configuration
* @DCB_CMD_IEEE_SET: set IEEE 802.1Qaz configuration
* @DCB_CMD_IEEE_GET: get IEEE 802.1Qaz configuration
* @DCB_CMD_GDCBX: get DCBX engine configuration
* @DCB_CMD_SDCBX: set DCBX engine configuration
* @DCB_CMD_GFEATCFG: get DCBX features flags
* @DCB_CMD_SFEATCFG: set DCBX features negotiation flags
* @DCB_CMD_CEE_GET: get CEE aggregated configuration
* @DCB_CMD_IEEE_DEL: delete IEEE 802.1Qaz configuration
*/
一个标准的netlink reply call的例子如下,比如get的时候基本上就是调用了它。
static int dcbnl_reply(u8 value, u8 event, u8 cmd, u8 attr, u32 pid,
u32 seq, u16 flags)
{
struct sk_buff *dcbnl_skb;
struct dcbmsg *dcb;
struct nlmsghdr *nlh;
int ret = -EINVAL
dcbnl_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!dcbnl_skb)
return ret;
nlh = NLMSG_NEW(dcbnl_skb, pid, seq, event, sizeof(*dcb), flags);
dcb = NLMSG_DATA(nlh);
dcb->dcb_family = AF_UNSPEC;
dcb->cmd = cmd;
dcb->dcb_pad = 0;
ret = nla_put_u8(dcbnl_skb, attr, value);
if (ret)
goto err;
/* end the message, assign the nlmsg_len. */
nlmsg_end(dcbnl_skb, nlh);
ret = rtnl_unicast(dcbnl_skb, &init_net, pid);