I think you can have a look at "Software Defined Networking using VXLAN" by Thomas Richter (presented at LinuxCon 2013).
You can turn on `l2miss` and `l3miss` on a vxlan device that is not in the global netns, and set the ARP and FDB entries manually.
The following example shows how to achieve this.
# Build this host's half of a two-host VXLAN overlay for one Docker container.
#
# Starts the container "test-overlay" without networking, creates an "overlay"
# network namespace holding a bridge (br0) and a VXLAN device (vxlan1, VNI 42,
# UDP port 4789), and links the container to the bridge with a veth pair.
# The VXLAN device is created with l2miss/l3miss, so the ARP and FDB entries
# for the container on the other host are installed statically at the end.
#
# Globals read:
#   bridge_gatway_cidr   address/prefix assigned to br0
#   container1_ip_cidr   address/prefix for the local container's eth1
#   container1_mac_addr  MAC address for the local container's eth1
#   container2_ip        IP address of the container on the other host
#   container2_mac_addr  MAC address of the container on the other host
#   container2_host_ip   underlay IP address of the other host
# Globals written:
#   pid  PID of the container process; also used as the name of the
#        container's netns entry under /var/run/netns. Deliberately left
#        global because follow-up commands refer to $pid.
# Returns:
#   non-zero if a required global is empty, the container cannot be started,
#   or no usable PID is reported for it.
function setup_overlay() {
  # Refuse to run with incomplete settings: an empty value would silently
  # produce malformed ip/bridge commands further down.
  local required
  for required in bridge_gatway_cidr container1_ip_cidr container1_mac_addr \
    container2_ip container2_mac_addr container2_host_ip; do
    if [[ -z "${!required:-}" ]]; then
      printf 'setup_overlay: %s must be set\n' "$required" >&2
      return 1
    fi
  done

  docker run -d --net=none --name=test-overlay ubuntu sleep 321339 || return
  # Give the container a moment to start before asking for its PID.
  sleep 3
  pid=$(docker inspect -f '{{.State.Pid}}' test-overlay) || return
  # Without a positive PID there is no /proc/<pid>/ns/net to link to below.
  if [[ ! "$pid" =~ ^[1-9][0-9]*$ ]]; then
    printf 'setup_overlay: no usable PID for test-overlay (got "%s")\n' "$pid" >&2
    return 1
  fi

  ip netns add overlay
  ip netns exec overlay ip link add dev br0 type bridge

  # Create the VXLAN device in the current netns and only then move it into
  # "overlay": the driver keeps a reference to the netns the device was
  # created in and opens its UDP socket there.
  ip link add dev vxlan212 type vxlan id 42 l2miss l3miss proxy learning dstport 4789
  ip link set vxlan212 netns overlay
  ip netns exec overlay ip link set dev vxlan212 name vxlan1
  ip netns exec overlay ip link set dev vxlan1 master br0

  # veth pair: one end becomes a bridge port, the other the container's eth1.
  # NOTE(review): mtu 1450 presumably leaves room for the VXLAN encapsulation
  # overhead on a 1500-byte underlay -- confirm for your network.
  ip link add dev vetha1 mtu 1450 type veth peer name vetha2 mtu 1450
  ip link set dev vetha1 netns overlay
  ip netns exec overlay ip link set dev vetha1 name veth2
  ip netns exec overlay ip link set dev veth2 master br0

  ip netns exec overlay ip addr add dev br0 "$bridge_gatway_cidr"
  ip netns exec overlay ip link set vxlan1 up
  ip netns exec overlay ip link set veth2 up
  ip netns exec overlay ip link set br0 up

  # Expose the container's netns under /var/run/netns so that "ip netns"
  # can address it by name (the PID).
  ln -sfn "/proc/$pid/ns/net" "/var/run/netns/$pid"
  ip link set dev vetha2 netns "$pid"
  ip netns exec "$pid" ip link set dev vetha2 name eth1 address "$container1_mac_addr"
  ip netns exec "$pid" ip addr add dev eth1 "$container1_ip_cidr"
  ip netns exec "$pid" ip link set dev eth1 up

  # Static ARP and FDB entries for the container on the other host.
  ip netns exec overlay ip neighbor add "$container2_ip" lladdr "$container2_mac_addr" dev vxlan1 nud permanent
  ip netns exec overlay bridge fdb add "$container2_mac_addr" dev vxlan1 self dst "$container2_host_ip" vni 42 port 4789
}
# Host1: the local container is 10.0.0.2; the peer container (10.0.0.3)
# is reached through the other host at 192.168.10.22.
container2_host_ip="192.168.10.22"
container2_mac_addr="02:42:0a:00:00:03"
container2_ip="10.0.0.3"
container1_mac_addr="02:42:0a:00:00:02"
container1_ip_cidr="10.0.0.2/24"
bridge_gatway_cidr="10.0.0.1/24"
setup_overlay
# Host2: mirror image of host1 -- the local container is 10.0.0.3; the peer
# container (10.0.0.2) is reached through the other host at 192.168.10.21.
container2_host_ip="192.168.10.21"
container2_mac_addr="02:42:0a:00:00:02"
container2_ip="10.0.0.2"
container1_mac_addr="02:42:0a:00:00:03"
container1_ip_cidr="10.0.0.3/24"
bridge_gatway_cidr="10.0.0.1/24"
setup_overlay
The above script sets up an overlay network between two docker containers on two hosts. The vxlan device is attached to the bridge `br0` in the `overlay` netns, and `br0` is connected to the container netns by a pair of veth devices.
Now check your newly setup overlay network.
# ping container2 (10.0.0.3) on host1, from inside the container netns that
# setup_overlay registered under the name $pid
ip netns exec "$pid" ping -c 10 10.0.0.3
## successful output
root@docker-1:/home/vagrant# ip netns exec $pid ping -c 10 10.0.0.3
PING 10.0.0.3 (10.0.0.3) 56(84) bytes of data.
64 bytes from 10.0.0.3: icmp_seq=1 ttl=64 time=0.879 ms
64 bytes from 10.0.0.3: icmp_seq=2 ttl=64 time=0.558 ms
64 bytes from 10.0.0.3: icmp_seq=3 ttl=64 time=0.576 ms
64 bytes from 10.0.0.3: icmp_seq=4 ttl=64 time=0.614 ms
64 bytes from 10.0.0.3: icmp_seq=5 ttl=64 time=0.521 ms
64 bytes from 10.0.0.3: icmp_seq=6 ttl=64 time=0.389 ms
64 bytes from 10.0.0.3: icmp_seq=7 ttl=64 time=0.551 ms
64 bytes from 10.0.0.3: icmp_seq=8 ttl=64 time=0.565 ms
64 bytes from 10.0.0.3: icmp_seq=9 ttl=64 time=0.488 ms
64 bytes from 10.0.0.3: icmp_seq=10 ttl=64 time=0.531 ms
--- 10.0.0.3 ping statistics ---
10 packets transmitted, 10 received, 0% packet loss, time 9008ms
rtt min/avg/max/mdev = 0.389/0.567/0.879/0.119 ms
## tcpdump sample on host1
root@docker-1:/home/vagrant# tcpdump -vv -n -s 0 -e -i eth1
tcpdump: listening on eth1, link-type EN10MB (Ethernet), capture size 262144 bytes
12:09:35.589244 08:00:27:00:4a:3a > 08:00:27:82:e5:ca, ethertype IPv4 (0x0800), length 148: (tos 0x0, ttl 64, id 59751, offset 0, flags [none], proto UDP (17), length 134)
192.168.0.11.42791 > 192.168.0.12.4789: [no cksum] VXLAN, flags [I] (0x08), vni 42
02:42:0a:00:00:02 > 02:42:0a:00:00:03, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 64, id 49924, offset 0, flags [DF], proto ICMP (1), length 84)
10.0.0.2 > 10.0.0.3: ICMP echo request, id 1908, seq 129, length 64
12:09:35.589559 08:00:27:82:e5:ca > 08:00:27:00:4a:3a, ethertype IPv4 (0x0800), length 148: (tos 0x0, ttl 64, id 38389, offset 0, flags [none], proto UDP (17), length 134)
192.168.0.12.56727 > 192.168.0.11.4789: [no cksum] VXLAN, flags [I] (0x08), vni 42
02:42:0a:00:00:03 > 02:42:0a:00:00:02, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 64, id 19444, offset 0, flags [none], proto ICMP (1), length 84)
10.0.0.3 > 10.0.0.2: ICMP echo reply, id 1908, seq 129, length 64
12:09:36.590840 08:00:27:00:4a:3a > 08:00:27:82:e5:ca, ethertype IPv4 (0x0800), length 148: (tos 0x0, ttl 64, id 59879, offset 0, flags [none], proto UDP (17), length 134)
192.168.0.11.42791 > 192.168.0.12.4789: [no cksum] VXLAN, flags [I] (0x08), vni 42
02:42:0a:00:00:02 > 02:42:0a:00:00:03, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 64, id 49951, offset 0, flags [DF], proto ICMP (1), length 84)
10.0.0.2 > 10.0.0.3: ICMP echo request, id 1908, seq 130, length 64
12:09:36.591328 08:00:27:82:e5:ca > 08:00:27:00:4a:3a, ethertype IPv4 (0x0800), length 148: (tos 0x0, ttl 64, id 38437, offset 0, flags [none], proto UDP (17), length 134)
192.168.0.12.56727 > 192.168.0.11.4789: [no cksum] VXLAN, flags [I] (0x08), vni 42
02:42:0a:00:00:03 > 02:42:0a:00:00:02, ethertype IPv4 (0x0800), length 98: (tos 0x0, ttl 64, id 19687, offset 0, flags [none], proto ICMP (1), length 84)
10.0.0.3 > 10.0.0.2: ICMP echo reply, id 1908, seq 130, length 64
Clean up on each host
# Remove the overlay netns created by setup_overlay.
ip netns del overlay
# Remove the /var/run/netns entry that was linked to the container's netns.
ip netns del "$pid"
# Finally remove the test container itself.
docker rm -v -f test-overlay
To explain why the vxlan device works with no receivers in a non-global netns:
Note that we first create the vxlan device in the global netns and then move it into the `overlay` netns. This is necessary because the vxlan driver in the kernel keeps a reference to the source netns in which the vxlan device was created. See the following code in `drivers/net/vxlan.c`:
/* Excerpt (body elided): src_net is the netns the vxlan device is created in. */
static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
struct vxlan_config *conf)
{
//...
/* Keep a reference to the source netns on the vxlan device. */
vxlan->net = src_net;
//...
}
and the vxlan driver creates its UDP socket in that source netns:
/* vxlan->net is the source netns stored by vxlan_dev_configure(). */
vxlan_sock_add(vxlan->net, vxlan->cfg.dst_port, vxlan->cfg.no_share, vxlan->flags);