Skip to main content

docker与iptable和网桥

· 8 min read

如何创建网桥

创建网桥,可以通过bridge-utils包的brctl来创建一个网桥

$sudo brctl addbr br0

然后通过brctl show可以看到列出的网桥

$brctl  show
bridge name bridge id STP enabled interfaces
br0 8000.000000000000 no

通过strace查看系统调用

$sudo strace  brctl addbr br1

输出

ubuntu@VM-0-3-ubuntu:~/libnlbuild/bin$ sudo strace  brctl addbr br1
...
socket(AF_UNIX, SOCK_STREAM, 0) = 3
ioctl(3, SIOCBRADDBR, "br1") = 0
+++ exited with 0 +++

看到调用

ioctl(3, SIOCBRADDBR, "br1") 

3 指的是 socket() 返回的文件描述符。0、1、2 分别是标准输入、标准输出和标准错误的文件描述符,所以下一个打开的文件描述符就是 3

我写的一个创建网桥的小例子

//  bradd.c
#include <linux/sockios.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <stdio.h>
#include <unistd.h>
/*
 * Create a bridge named "hello" by issuing the SIOCBRADDBR ioctl on a
 * local stream socket, the same request seen in the strace of
 * `brctl addbr`.
 *
 * Returns 0 on success and 1 if either the socket() or the ioctl() call
 * fails (the failure reason is printed with perror()).
 */
int main(void){
    int br_socket_fd;

    /*
     * The assignment must be parenthesized: `<` binds tighter than `=`, so
     * without the parentheses br_socket_fd would receive the result of the
     * comparison (0 or 1) instead of the descriptor returned by socket().
     */
    if((br_socket_fd = socket(AF_LOCAL, SOCK_STREAM, 0)) < 0){
        perror("Error: ");
        return 1; /* no usable descriptor, so the ioctl cannot be attempted */
    }

    if(ioctl(br_socket_fd, SIOCBRADDBR, "hello") < 0) // SIOCBRADDBR is provided by <linux/sockios.h>
    {
        perror("ioctl error");
        close(br_socket_fd);
        return 1;
    }

    close(br_socket_fd);
    return 0;
}
$gcc bradd.c -o bradd
## 需要使用sudo添加网桥
$sudo ./bradd

然后用brctl show 输出,创建了一个叫hello的网桥:

$ brctl show
bridge name bridge id STP enabled interfaces
docker0 8000.024273119fd1 no vethe6cf6a0
hello 8000.000000000000 no

然后我们发现 docker0 和 hello 两个网桥的区别在于 interfaces 一列:docker0 上挂了一个 veth 设备,而 hello 没有。那么我们如何给网桥添加 veth 呢?

  • 在brctl 中可以使用brctl addif
/*
 * Excerpt ("..." marks elided code) of the routine shown for `brctl addif`:
 * attach the network device named `dev` to the bridge named `bridge`.
 */
int br_add_interface(const char *bridge, const char *dev)
{
struct ifreq ifr;
...
/* Translate the device name into its interface index. */
int ifindex = if_nametoindex(dev);
...
/* The request names the bridge and carries the index of the device to add. */
strncpy(ifr.ifr_name, bridge, IFNAMSIZ);
ifr.ifr_ifindex = ifindex;
/* Issue the add-interface request on the bridge control socket. */
err = ioctl(br_socket_fd, SIOCBRADDIF, &ifr);
...
}

最后调用linux 的net/bridge/br_if.c:

// dev 是我们要添加的设备 // br 是我们的网桥

/*
 * Attach net device @dev to bridge @br as a new port.
 *
 * Excerpt: "..." marks elided code, which includes the err1..err7 unwind
 * labels targeted by the gotos below.
 *
 * Called with RTNL held.
 *
 * Returns 0 on success. On the visible failure paths it either returns
 * PTR_ERR(p) directly or jumps to an unwind label with err set.
 */
int br_add_if(struct net_bridge *br, struct net_device *dev,
struct netlink_ext_ack *extack)
{
struct net_bridge_port *p;
int err = 0;
unsigned br_hr, dev_hr;
bool changed_addr;

...
/* Create the bridge-port object that ties dev to br. */
p = new_nbp(br, dev);
if (IS_ERR(p))
return PTR_ERR(p);

call_netdevice_notifiers(NETDEV_JOIN, dev);

err = dev_set_allmulti(dev, 1);
if (err) {
kfree(p); /* kobject not yet init'd, manually free */
goto err1;
}

err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj),
SYSFS_BRIDGE_PORT_ATTR);
if (err)
goto err2;

err = br_sysfs_addif(p);
if (err)
goto err2;

err = br_netpoll_enable(p);
if (err)
goto err3;

/* Register br_handle_frame as dev's rx handler, with the port as its data. */
err = netdev_rx_handler_register(dev, br_handle_frame, p);
if (err)
goto err4;

/* Flag dev as a bridge port. */
dev->priv_flags |= IFF_BRIDGE_PORT;

/* Link the bridge device as dev's master upper device. */
err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, extack);
if (err)
goto err5;

err = nbp_switchdev_mark_set(p);
if (err)
goto err6;

dev_disable_lro(dev);

/* Add the port to the bridge's port list. */
list_add_rcu(&p->list, &br->port_list);

nbp_update_port_count(br);

netdev_update_features(br->dev);

/*
 * Reconcile headroom: if dev needs more than the bridge currently has,
 * update the bridge; otherwise give dev the bridge's rx headroom.
 */
br_hr = br->dev->needed_headroom;
dev_hr = netdev_get_fwd_headroom(dev);
if (br_hr < dev_hr)
update_headroom(br, dev_hr);
else
netdev_set_rx_headroom(dev, br_hr);

/* Insert dev's own address into the forwarding table; failure is only logged. */
if (br_fdb_insert(br, p, dev->dev_addr, 0))
netdev_err(dev, "failed insert local address bridge forwarding table\n");

if (br->dev->addr_assign_type != NET_ADDR_SET) {
/* Ask for permission to use this MAC address now, even if we
* don't end up choosing it below.
*/
err = dev_pre_changeaddr_notify(br->dev, dev->dev_addr, extack);
if (err)
goto err7;
}

err = nbp_vlan_init(p, extack);
if (err) {
netdev_err(dev, "failed to initialize vlan filtering on this port\n");
goto err7;
}

/*
 * Under br->lock: recalculate the bridge id and, if dev is running and
 * operationally up and the bridge device is up, enable the port in STP.
 */
spin_lock_bh(&br->lock);
changed_addr = br_stp_recalculate_bridge_id(br);

if (netif_running(dev) && netif_oper_up(dev) &&
(br->dev->flags & IFF_UP))
br_stp_enable_port(p);
spin_unlock_bh(&br->lock);

br_ifinfo_notify(RTM_NEWLINK, NULL, p);

/* Notify listeners only when the recalculation reported an address change. */
if (changed_addr)
call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);

br_mtu_auto_adjust(br);
br_set_gso_limits(br);

/* Emit the KOBJ_ADD uevent for the port's kobject. */
kobject_uevent(&p->kobj, KOBJ_ADD);

return 0;
...
}

添加虚拟设备:

# strace  ip link add vethaaa type veth peer name vethbbb
execve("/sbin/ip", ["ip", "link", "add", "vethaaa", "type", "veth", "peer", "name", "vethbbb"], 0x7ffed8af30f0 /* 23 vars */)
...
socket(AF_NETLINK, SOCK_RAW|SOCK_CLOEXEC, NETLINK_ROUTE) = 3
setsockopt(3, SOL_SOCKET, SO_SNDBUF, [32768], 4) = 0
setsockopt(3, SOL_SOCKET, SO_RCVBUF, [1048576], 4) = 0
setsockopt(3, SOL_NETLINK, NETLINK_EXT_ACK, [1], 4) = 0
bind(3, {sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, 12) = 0
getsockname(3, {sa_family=AF_NETLINK, nl_pid=26226, nl_groups=00000000}, [12]) = 0
sendto(3, {{len=32, type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=0, pid=0}, {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}, 32, 0, NULL, 0) = 32
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=52, type=NLMSG_ERROR, flags=0, seq=0, pid=26226}, {error=-ENODEV, msg={{len=32, type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=0, pid=0}, {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}}}, iov_len=16384}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 52
access("/proc/net", R_OK) = 0
access("/proc/net/unix", R_OK) = 0
socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0) = 4
ioctl(4, SIOCGIFINDEX, {ifr_name="vethaaa"}) = -1 ENODEV (No such device)
close(4) = 0
brk(NULL) = 0x560e12455000
brk(0x560e12476000) = 0x560e12476000
openat(AT_FDCWD, "/usr/lib/ip/link_veth.so", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=92, type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1576836139, pid=0}, {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}, [{{nla_len=12, nla_type=IFLA_IFNAME}, "vethaaa"}, {{nla_len=48, nla_type=IFLA_LINKINFO}, [{{nla_len=8, nla_type=IFLA_INFO_KIND}, "veth"...}, {{nla_len=36, nla_type=IFLA_INFO_DATA}, "\x20\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0c\x00\x03\x00\x76\x65\x74\x68\x62\x62\x62\x00"}]}]}, iov_len=92}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 92
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 36
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=36, type=NLMSG_ERROR, flags=NLM_F_CAPPED, seq=1576836139, pid=26226}, {error=0, msg={len=92, type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1576836139, pid=0}}}, iov_len=36}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 36

socket(AF_NETLINK, SOCK_RAW|SOCK_CLOEXEC, NETLINK_ROUTE) = 3
setsockopt(3, SOL_SOCKET, SO_SNDBUF, [32768], 4) = 0
setsockopt(3, SOL_SOCKET, SO_RCVBUF, [1048576], 4) = 0
setsockopt(3, SOL_NETLINK, NETLINK_EXT_ACK, [1], 4) = 0
bind(3, {sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, 12) = 0
getsockname(3, {sa_family=AF_NETLINK, nl_pid=18263, nl_groups=00000000}, [12]) = 0
sendto(3, {{len=32, type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=0, pid=0}, {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}, 32, 0, NULL, 0) = 32
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=52, type=NLMSG_ERROR, flags=0, seq=0, pid=18263}, {error=-EPERM, msg={{len=32, type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK, seq=0, pid=0}, {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}}}}, iov_len=16384}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 52
access("/proc/net", R_OK) = 0
access("/proc/net/unix", R_OK) = 0
socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0) = 4
ioctl(4, SIOCGIFINDEX, {ifr_name="p1"}) = -1 ENODEV (No such device)
close(4) = 0
brk(NULL) = 0x5595d01bb000
brk(0x5595d01dc000) = 0x5595d01dc000
openat(AT_FDCWD, "/usr/lib/ip/link_veth.so", O_RDONLY|O_CLOEXEC) = -1 ENOENT (No such file or directory)
sendmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=84, type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1576748752, pid=0}, {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}, [{{nla_len=7, nla_type=IFLA_IFNAME}, "p1"}, {{nla_len=44, nla_type=IFLA_LINKINFO}, [{{nla_len=8, nla_type=IFLA_INFO_KIND}, "veth"...}, {{nla_len=32, nla_type=IFLA_INFO_DATA}, "\x1c\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07\x00\x03\x00\x70\x32\x00\x00"}]}]}, iov_len=84}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 84
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base=NULL, iov_len=0}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_TRUNC}, MSG_PEEK|MSG_TRUNC) = 104
recvmsg(3, {msg_name={sa_family=AF_NETLINK, nl_pid=0, nl_groups=00000000}, msg_namelen=12, msg_iov=[{iov_base={{len=104, type=NLMSG_ERROR, flags=0, seq=1576748752, pid=18263}, {error=-EPERM, msg={{len=84, type=RTM_NEWLINK, flags=NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE, seq=1576748752, pid=0}, {ifi_family=AF_UNSPEC, ifi_type=ARPHRD_NETROM, ifi_index=0, ifi_flags=0, ifi_change=0}, [{{nla_len=7, nla_type=IFLA_IFNAME}, "p1"}, {{nla_len=44, nla_type=IFLA_LINKINFO}, [{{nla_len=8, nla_type=IFLA_INFO_KIND}, "veth"...}, {{nla_len=32, nla_type=IFLA_INFO_DATA}, "\x1c\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x07\x00\x03\x00\x70\x32\x00\x00"}]}]}}}, iov_len=104}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, 0) = 104
write(2, "RTNETLINK answers: Operation not"..., 43RTNETLINK answers: Operation not permitted
) = 43
exit_group(2) = ?
+++ exited with 2 +++

linux 相关的netlink veth内容:

// drivers\net\veth.c
/*
 * rtnl_link_ops table for the veth driver. DRV_NAME is the link kind, and
 * the veth_* callbacks cover setup, validation, creation (newlink) and
 * deletion (dellink) of links of that kind, plus get_link_net.
 * priv_size, policy and maxtype describe the per-device private data and
 * the accepted netlink attributes.
 */
static struct rtnl_link_ops veth_link_ops = {
.kind = DRV_NAME,
.priv_size = sizeof(struct veth_priv),
.setup = veth_setup,
.validate = veth_validate,
.newlink = veth_newlink,
.dellink = veth_dellink,
.policy = veth_policy,
.maxtype = VETH_INFO_MAX,
.get_link_net = veth_get_link_net,
};
// net/netlink/af_netlink.c
/*
 * Socket operations table (proto_ops) for the PF_NETLINK family.
 * Netlink-specific handlers are supplied for release, bind, connect,
 * getname, ioctl, setsockopt/getsockopt, sendmsg and recvmsg; poll uses
 * datagram_poll. socketpair, accept, listen, shutdown, mmap and sendpage
 * point at the generic sock_no_* handlers (presumably "not supported"
 * stubs; confirm in net/core/sock.c).
 */
static const struct proto_ops netlink_ops = {
.family = PF_NETLINK,
.owner = THIS_MODULE,
.release = netlink_release,
.bind = netlink_bind,
.connect = netlink_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = netlink_getname,
.poll = datagram_poll,
.ioctl = netlink_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = netlink_setsockopt,
.getsockopt = netlink_getsockopt,
.sendmsg = netlink_sendmsg,
.recvmsg = netlink_recvmsg,
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
};

添加veth 设备

首先是添加socket

(gdb) bt
#0 socket () at ../sysdeps/unix/syscall-template.S:78
#1 0x00005555555b60c7 in rtnl_open_byproto (rth=0x5555557d8020 <rth>, subscriptions=0, protocol=<optimized out>) at libnetlink.c:194
#2 0x000055555555f956 in main (argc=9, argv=0x7fffffffe548) at ip.c:308
Breakpoint 6, __libc_sendmsg (fd=3, msg=msg@entry=0x7fffffffdd70, flags=flags@entry=0) at ../sysdeps/unix/sysv/linux/sendmsg.c:28
28 ../sysdeps/unix/sysv/linux/sendmsg.c: No such file or directory.
(gdb) bt
#0 __libc_sendmsg (fd=3, msg=msg@entry=0x7fffffffdd70, flags=flags@entry=0) at ../sysdeps/unix/sysv/linux/sendmsg.c:28
#1 0x00005555555b5c8f in __rtnl_talk_iov (rtnl=0x5555557d8020 <rth>, iov=iov@entry=0x7fffffffddf0, iovlen=iovlen@entry=1, answer=answer@entry=0x0, show_rtnl_err=show_rtnl_err@entry=true,
errfn=0x0) at libnetlink.c:887
#2 0x00005555555b7225 in __rtnl_talk (errfn=0x0, show_rtnl_err=true, answer=<optimized out>, n=0x7fffffffde40, rtnl=<optimized out>) at libnetlink.c:1000
#3 rtnl_talk (rtnl=<optimized out>, n=n@entry=0x7fffffffde40, answer=answer@entry=0x0) at libnetlink.c:1006
#4 0x000055555557bc6e in iplink_modify (cmd=cmd@entry=16, flags=flags@entry=1536, argc=3, argc@entry=6, argv=<optimized out>, argv@entry=0x7fffffffe560) at iplink.c:1084
#5 0x000055555557c0c6 in do_iplink (argc=7, argv=0x7fffffffe558) at iplink.c:1641
#6 0x000055555555ff0c in do_cmd (argv0=0x7fffffffe7d8 "link", argc=8, argv=0x7fffffffe550) at ip.c:113
#7 0x000055555555f9a0 in main (argc=9, argv=0x7fffffffe548) at ip.c:317

比如命令ip link add veth_0 type veth peer name veth_0_peer 初始化的时候req.n 的长度是32

 p req.n.nlmsg_len 
$1 = 32

经过ret = iplink_parse(argc, argv, &req, &type); 后变成44,

(gdb) p ((char *)n)[32]@64
$50 = "\v\000\003\000veth_0\000\000\064\000\022\000\b\000\001\000veth(\000\002\000$\000\001", '\000' <repeats 17 times>, "\020\000\003\000veth_0_peer"

iptables是什么?

# type iptables
iptables is hashed (/sbin/iptables)

iptables命令为什么可以处理那些问题呢?

iptables原理

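iptables 就是通过 socket 与内核通信,改变 netfilter 子系统相关 hook 上的规则:传统的 iptables(iptables-legacy)在 raw socket 上使用 getsockopt/setsockopt 读写规则表,而基于 nftables 的 iptables-nft 则通过 netlink 通信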

源码 相关阅读

相关阅读