diff options
Diffstat (limited to 'net/openvswitch')
| -rw-r--r-- | net/openvswitch/Kconfig | 69 | ||||
| -rw-r--r-- | net/openvswitch/Makefile | 24 | ||||
| -rw-r--r-- | net/openvswitch/actions.c | 1195 | ||||
| -rw-r--r-- | net/openvswitch/conntrack.c | 822 | ||||
| -rw-r--r-- | net/openvswitch/conntrack.h | 93 | ||||
| -rw-r--r-- | net/openvswitch/datapath.c | 2388 | ||||
| -rw-r--r-- | net/openvswitch/datapath.h | 216 | ||||
| -rw-r--r-- | net/openvswitch/dp_notify.c | 99 | ||||
| -rw-r--r-- | net/openvswitch/flow.c | 745 | ||||
| -rw-r--r-- | net/openvswitch/flow.h | 229 | ||||
| -rw-r--r-- | net/openvswitch/flow_netlink.c | 2514 | ||||
| -rw-r--r-- | net/openvswitch/flow_netlink.h | 78 | ||||
| -rw-r--r-- | net/openvswitch/flow_table.c | 782 | ||||
| -rw-r--r-- | net/openvswitch/flow_table.h | 90 | ||||
| -rw-r--r-- | net/openvswitch/vport-geneve.c | 137 | ||||
| -rw-r--r-- | net/openvswitch/vport-gre.c | 109 | ||||
| -rw-r--r-- | net/openvswitch/vport-internal_dev.c | 314 | ||||
| -rw-r--r-- | net/openvswitch/vport-internal_dev.h | 30 | ||||
| -rw-r--r-- | net/openvswitch/vport-netdev.c | 222 | ||||
| -rw-r--r-- | net/openvswitch/vport-netdev.h | 36 | ||||
| -rw-r--r-- | net/openvswitch/vport-vxlan.c | 174 | ||||
| -rw-r--r-- | net/openvswitch/vport.c | 511 | ||||
| -rw-r--r-- | net/openvswitch/vport.h | 222 |
23 files changed, 11099 insertions, 0 deletions
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig new file mode 100644 index 000000000000..d143aa9f6654 --- /dev/null +++ b/net/openvswitch/Kconfig @@ -0,0 +1,69 @@ +# +# Open vSwitch +# + +config OPENVSWITCH + tristate "Open vSwitch" + depends on INET + depends on !NF_CONNTRACK || \ + (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6)) + select LIBCRC32C + select MPLS + select NET_MPLS_GSO + ---help--- + Open vSwitch is a multilayer Ethernet switch targeted at virtualized + environments. In addition to supporting a variety of features + expected in a traditional hardware switch, it enables fine-grained + programmatic extension and flow-based control of the network. This + control is useful in a wide variety of applications but is + particularly important in multi-server virtualization deployments, + which are often characterized by highly dynamic endpoints and the + need to maintain logical abstractions for multiple tenants. + + The Open vSwitch datapath provides an in-kernel fast path for packet + forwarding. It is complemented by a userspace daemon, ovs-vswitchd, + which is able to accept configuration from a variety of sources and + translate it into packet processing rules. + + See http://openvswitch.org for more information and userspace + utilities. + + To compile this code as a module, choose M here: the module will be + called openvswitch. + + If unsure, say N. + +config OPENVSWITCH_GRE + tristate "Open vSwitch GRE tunneling support" + depends on OPENVSWITCH + depends on NET_IPGRE + default OPENVSWITCH + ---help--- + If you say Y here, then the Open vSwitch will be able create GRE + vport. + + Say N to exclude this support and reduce the binary size. + + If unsure, say Y. + +config OPENVSWITCH_VXLAN + tristate "Open vSwitch VXLAN tunneling support" + depends on OPENVSWITCH + depends on VXLAN + default OPENVSWITCH + ---help--- + If you say Y here, then the Open vSwitch will be able create vxlan vport. + + Say N to exclude this support and reduce the binary size. + + If unsure, say Y. + +config OPENVSWITCH_GENEVE + tristate "Open vSwitch Geneve tunneling support" + depends on OPENVSWITCH + depends on GENEVE + default OPENVSWITCH + ---help--- + If you say Y here, then the Open vSwitch will be able create geneve vport. + + Say N to exclude this support and reduce the binary size. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile new file mode 100644 index 000000000000..60f809085b92 --- /dev/null +++ b/net/openvswitch/Makefile @@ -0,0 +1,24 @@ +# +# Makefile for Open vSwitch. +# + +obj-$(CONFIG_OPENVSWITCH) += openvswitch.o + +openvswitch-y := \ + actions.o \ + datapath.o \ + dp_notify.o \ + flow.o \ + flow_netlink.o \ + flow_table.o \ + vport.o \ + vport-internal_dev.o \ + vport-netdev.o + +ifneq ($(CONFIG_NF_CONNTRACK),) +openvswitch-y += conntrack.o +endif + +obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o +obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o +obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c new file mode 100644 index 000000000000..828fdced4ecd --- /dev/null +++ b/net/openvswitch/actions.c @@ -0,0 +1,1195 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/openvswitch.h> +#include <linux/netfilter_ipv6.h> +#include <linux/sctp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/in6.h> +#include <linux/if_arp.h> +#include <linux/if_vlan.h> + +#include <net/dst.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/ip6_fib.h> +#include <net/checksum.h> +#include <net/dsfield.h> +#include <net/mpls.h> +#include <net/sctp/checksum.h> + +#include "datapath.h" +#include "flow.h" +#include "conntrack.h" +#include "vport.h" + +static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *attr, int len); + +struct deferred_action { + struct sk_buff *skb; + const struct nlattr *actions; + + /* Store pkt_key clone when creating deferred action. */ + struct sw_flow_key pkt_key; +}; + +#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN) +struct ovs_frag_data { + unsigned long dst; + struct vport *vport; + struct ovs_skb_cb cb; + __be16 inner_protocol; + __u16 vlan_tci; + __be16 vlan_proto; + unsigned int l2_len; + u8 l2_data[MAX_L2_LEN]; +}; + +static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage); + +#define DEFERRED_ACTION_FIFO_SIZE 10 +struct action_fifo { + int head; + int tail; + /* Deferred action fifo queue storage. */ + struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; +}; + +static struct action_fifo __percpu *action_fifos; +static DEFINE_PER_CPU(int, exec_actions_level); + +static void action_fifo_init(struct action_fifo *fifo) +{ + fifo->head = 0; + fifo->tail = 0; +} + +static bool action_fifo_is_empty(const struct action_fifo *fifo) +{ + return (fifo->head == fifo->tail); +} + +static struct deferred_action *action_fifo_get(struct action_fifo *fifo) +{ + if (action_fifo_is_empty(fifo)) + return NULL; + + return &fifo->fifo[fifo->tail++]; +} + +static struct deferred_action *action_fifo_put(struct action_fifo *fifo) +{ + if (fifo->head >= DEFERRED_ACTION_FIFO_SIZE - 1) + return NULL; + + return &fifo->fifo[fifo->head++]; +} + +/* Return true if fifo is not full */ +static struct deferred_action *add_deferred_actions(struct sk_buff *skb, + const struct sw_flow_key *key, + const struct nlattr *attr) +{ + struct action_fifo *fifo; + struct deferred_action *da; + + fifo = this_cpu_ptr(action_fifos); + da = action_fifo_put(fifo); + if (da) { + da->skb = skb; + da->actions = attr; + da->pkt_key = *key; + } + + return da; +} + +static void invalidate_flow_key(struct sw_flow_key *key) +{ + key->eth.type = htons(0); +} + +static bool is_flow_key_valid(const struct sw_flow_key *key) +{ + return !!key->eth.type; +} + +static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr, + __be16 ethertype) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __be16 diff[] = { ~(hdr->h_proto), ethertype }; + + skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); + } + + hdr->h_proto = ethertype; +} + +static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_action_push_mpls *mpls) +{ + __be32 *new_mpls_lse; + + /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */ + if (skb->encapsulation) + return -ENOTSUPP; + + if (skb_cow_head(skb, MPLS_HLEN) < 0) + return -ENOMEM; + + skb_push(skb, MPLS_HLEN); + memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), + skb->mac_len); + skb_reset_mac_header(skb); + + new_mpls_lse = (__be32 *)skb_mpls_header(skb); + *new_mpls_lse = mpls->mpls_lse; + + skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN); + + update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype); + if (!skb->inner_protocol) + skb_set_inner_protocol(skb, skb->protocol); + skb->protocol = mpls->mpls_ethertype; + + invalidate_flow_key(key); + return 0; +} + +static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, + const __be16 ethertype) +{ + struct ethhdr *hdr; + int err; + + err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + skb_postpull_rcsum(skb, skb_mpls_header(skb), MPLS_HLEN); + + memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), + skb->mac_len); + + __skb_pull(skb, MPLS_HLEN); + skb_reset_mac_header(skb); + + /* skb_mpls_header() is used to locate the ethertype + * field correctly in the presence of VLAN tags. + */ + hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN); + update_ethertype(skb, hdr, ethertype); + if (eth_p_mpls(skb->protocol)) + skb->protocol = ethertype; + + invalidate_flow_key(key); + return 0; +} + +static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, + const __be32 *mpls_lse, const __be32 *mask) +{ + __be32 *stack; + __be32 lse; + int err; + + err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + stack = (__be32 *)skb_mpls_header(skb); + lse = OVS_MASKED(*stack, *mpls_lse, *mask); + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __be32 diff[] = { ~(*stack), lse }; + + skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); + } + + *stack = lse; + flow_key->mpls.top_lse = lse; + return 0; +} + +static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key) +{ + int err; + + err = skb_vlan_pop(skb); + if (skb_vlan_tag_present(skb)) + invalidate_flow_key(key); + else + key->eth.tci = 0; + return err; +} + +static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_action_push_vlan *vlan) +{ + if (skb_vlan_tag_present(skb)) + invalidate_flow_key(key); + else + key->eth.tci = vlan->vlan_tci; + return skb_vlan_push(skb, vlan->vlan_tpid, + ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); +} + +/* 'src' is already properly masked. */ +static void ether_addr_copy_masked(u8 *dst_, const u8 *src_, const u8 *mask_) +{ + u16 *dst = (u16 *)dst_; + const u16 *src = (const u16 *)src_; + const u16 *mask = (const u16 *)mask_; + + OVS_SET_MASKED(dst[0], src[0], mask[0]); + OVS_SET_MASKED(dst[1], src[1], mask[1]); + OVS_SET_MASKED(dst[2], src[2], mask[2]); +} + +static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_ethernet *key, + const struct ovs_key_ethernet *mask) +{ + int err; + + err = skb_ensure_writable(skb, ETH_HLEN); + if (unlikely(err)) + return err; + + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); + + ether_addr_copy_masked(eth_hdr(skb)->h_source, key->eth_src, + mask->eth_src); + ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst, + mask->eth_dst); + + skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); + + ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source); + ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest); + return 0; +} + +static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh, + __be32 addr, __be32 new_addr) +{ + int transport_len = skb->len - skb_transport_offset(skb); + + if (nh->frag_off & htons(IP_OFFSET)) + return; + + if (nh->protocol == IPPROTO_TCP) { + if (likely(transport_len >= sizeof(struct tcphdr))) + inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb, + addr, new_addr, true); + } else if (nh->protocol == IPPROTO_UDP) { + if (likely(transport_len >= sizeof(struct udphdr))) { + struct udphdr *uh = udp_hdr(skb); + + if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace4(&uh->check, skb, + addr, new_addr, true); + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } + } + } +} + +static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, + __be32 *addr, __be32 new_addr) +{ + update_ip_l4_checksum(skb, nh, *addr, new_addr); + csum_replace4(&nh->check, *addr, new_addr); + skb_clear_hash(skb); + *addr = new_addr; +} + +static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, + __be32 addr[4], const __be32 new_addr[4]) +{ + int transport_len = skb->len - skb_transport_offset(skb); + + if (l4_proto == NEXTHDR_TCP) { + if (likely(transport_len >= sizeof(struct tcphdr))) + inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb, + addr, new_addr, true); + } else if (l4_proto == NEXTHDR_UDP) { + if (likely(transport_len >= sizeof(struct udphdr))) { + struct udphdr *uh = udp_hdr(skb); + + if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace16(&uh->check, skb, + addr, new_addr, true); + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } + } + } else if (l4_proto == NEXTHDR_ICMP) { + if (likely(transport_len >= sizeof(struct icmp6hdr))) + inet_proto_csum_replace16(&icmp6_hdr(skb)->icmp6_cksum, + skb, addr, new_addr, true); + } +} + +static void mask_ipv6_addr(const __be32 old[4], const __be32 addr[4], + const __be32 mask[4], __be32 masked[4]) +{ + masked[0] = OVS_MASKED(old[0], addr[0], mask[0]); + masked[1] = OVS_MASKED(old[1], addr[1], mask[1]); + masked[2] = OVS_MASKED(old[2], addr[2], mask[2]); + masked[3] = OVS_MASKED(old[3], addr[3], mask[3]); +} + +static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, + __be32 addr[4], const __be32 new_addr[4], + bool recalculate_csum) +{ + if (recalculate_csum) + update_ipv6_checksum(skb, l4_proto, addr, new_addr); + + skb_clear_hash(skb); + memcpy(addr, new_addr, sizeof(__be32[4])); +} + +static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask) +{ + /* Bits 21-24 are always unmasked, so this retains their values. */ + OVS_SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16)); + OVS_SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8)); + OVS_SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask); +} + +static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl, + u8 mask) +{ + new_ttl = OVS_MASKED(nh->ttl, new_ttl, mask); + + csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8)); + nh->ttl = new_ttl; +} + +static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_ipv4 *key, + const struct ovs_key_ipv4 *mask) +{ + struct iphdr *nh; + __be32 new_addr; + int err; + + err = skb_ensure_writable(skb, skb_network_offset(skb) + + sizeof(struct iphdr)); + if (unlikely(err)) + return err; + + nh = ip_hdr(skb); + + /* Setting an IP addresses is typically only a side effect of + * matching on them in the current userspace implementation, so it + * makes sense to check if the value actually changed. + */ + if (mask->ipv4_src) { + new_addr = OVS_MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src); + + if (unlikely(new_addr != nh->saddr)) { + set_ip_addr(skb, nh, &nh->saddr, new_addr); + flow_key->ipv4.addr.src = new_addr; + } + } + if (mask->ipv4_dst) { + new_addr = OVS_MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst); + + if (unlikely(new_addr != nh->daddr)) { + set_ip_addr(skb, nh, &nh->daddr, new_addr); + flow_key->ipv4.addr.dst = new_addr; + } + } + if (mask->ipv4_tos) { + ipv4_change_dsfield(nh, ~mask->ipv4_tos, key->ipv4_tos); + flow_key->ip.tos = nh->tos; + } + if (mask->ipv4_ttl) { + set_ip_ttl(skb, nh, key->ipv4_ttl, mask->ipv4_ttl); + flow_key->ip.ttl = nh->ttl; + } + + return 0; +} + +static bool is_ipv6_mask_nonzero(const __be32 addr[4]) +{ + return !!(addr[0] | addr[1] | addr[2] | addr[3]); +} + +static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_ipv6 *key, + const struct ovs_key_ipv6 *mask) +{ + struct ipv6hdr *nh; + int err; + + err = skb_ensure_writable(skb, skb_network_offset(skb) + + sizeof(struct ipv6hdr)); + if (unlikely(err)) + return err; + + nh = ipv6_hdr(skb); + + /* Setting an IP addresses is typically only a side effect of + * matching on them in the current userspace implementation, so it + * makes sense to check if the value actually changed. + */ + if (is_ipv6_mask_nonzero(mask->ipv6_src)) { + __be32 *saddr = (__be32 *)&nh->saddr; + __be32 masked[4]; + + mask_ipv6_addr(saddr, key->ipv6_src, mask->ipv6_src, masked); + + if (unlikely(memcmp(saddr, masked, sizeof(masked)))) { + set_ipv6_addr(skb, flow_key->ip.proto, saddr, masked, + true); + memcpy(&flow_key->ipv6.addr.src, masked, + sizeof(flow_key->ipv6.addr.src)); + } + } + if (is_ipv6_mask_nonzero(mask->ipv6_dst)) { + unsigned int offset = 0; + int flags = IP6_FH_F_SKIP_RH; + bool recalc_csum = true; + __be32 *daddr = (__be32 *)&nh->daddr; + __be32 masked[4]; + + mask_ipv6_addr(daddr, key->ipv6_dst, mask->ipv6_dst, masked); + + if (unlikely(memcmp(daddr, masked, sizeof(masked)))) { + if (ipv6_ext_hdr(nh->nexthdr)) + recalc_csum = (ipv6_find_hdr(skb, &offset, + NEXTHDR_ROUTING, + NULL, &flags) + != NEXTHDR_ROUTING); + + set_ipv6_addr(skb, flow_key->ip.proto, daddr, masked, + recalc_csum); + memcpy(&flow_key->ipv6.addr.dst, masked, + sizeof(flow_key->ipv6.addr.dst)); + } + } + if (mask->ipv6_tclass) { + ipv6_change_dsfield(nh, ~mask->ipv6_tclass, key->ipv6_tclass); + flow_key->ip.tos = ipv6_get_dsfield(nh); + } + if (mask->ipv6_label) { + set_ipv6_fl(nh, ntohl(key->ipv6_label), + ntohl(mask->ipv6_label)); + flow_key->ipv6.label = + *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); + } + if (mask->ipv6_hlimit) { + OVS_SET_MASKED(nh->hop_limit, key->ipv6_hlimit, + mask->ipv6_hlimit); + flow_key->ip.ttl = nh->hop_limit; + } + return 0; +} + +/* Must follow skb_ensure_writable() since that can move the skb data. */ +static void set_tp_port(struct sk_buff *skb, __be16 *port, + __be16 new_port, __sum16 *check) +{ + inet_proto_csum_replace2(check, skb, *port, new_port, false); + *port = new_port; +} + +static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_udp *key, + const struct ovs_key_udp *mask) +{ + struct udphdr *uh; + __be16 src, dst; + int err; + + err = skb_ensure_writable(skb, skb_transport_offset(skb) + + sizeof(struct udphdr)); + if (unlikely(err)) + return err; + + uh = udp_hdr(skb); + /* Either of the masks is non-zero, so do not bother checking them. */ + src = OVS_MASKED(uh->source, key->udp_src, mask->udp_src); + dst = OVS_MASKED(uh->dest, key->udp_dst, mask->udp_dst); + + if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) { + if (likely(src != uh->source)) { + set_tp_port(skb, &uh->source, src, &uh->check); + flow_key->tp.src = src; + } + if (likely(dst != uh->dest)) { + set_tp_port(skb, &uh->dest, dst, &uh->check); + flow_key->tp.dst = dst; + } + + if (unlikely(!uh->check)) + uh->check = CSUM_MANGLED_0; + } else { + uh->source = src; + uh->dest = dst; + flow_key->tp.src = src; + flow_key->tp.dst = dst; + } + + skb_clear_hash(skb); + + return 0; +} + +static int set_tcp(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_tcp *key, + const struct ovs_key_tcp *mask) +{ + struct tcphdr *th; + __be16 src, dst; + int err; + + err = skb_ensure_writable(skb, skb_transport_offset(skb) + + sizeof(struct tcphdr)); + if (unlikely(err)) + return err; + + th = tcp_hdr(skb); + src = OVS_MASKED(th->source, key->tcp_src, mask->tcp_src); + if (likely(src != th->source)) { + set_tp_port(skb, &th->source, src, &th->check); + flow_key->tp.src = src; + } + dst = OVS_MASKED(th->dest, key->tcp_dst, mask->tcp_dst); + if (likely(dst != th->dest)) { + set_tp_port(skb, &th->dest, dst, &th->check); + flow_key->tp.dst = dst; + } + skb_clear_hash(skb); + + return 0; +} + +static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_sctp *key, + const struct ovs_key_sctp *mask) +{ + unsigned int sctphoff = skb_transport_offset(skb); + struct sctphdr *sh; + __le32 old_correct_csum, new_csum, old_csum; + int err; + + err = skb_ensure_writable(skb, sctphoff + sizeof(struct sctphdr)); + if (unlikely(err)) + return err; + + sh = sctp_hdr(skb); + old_csum = sh->checksum; + old_correct_csum = sctp_compute_cksum(skb, sctphoff); + + sh->source = OVS_MASKED(sh->source, key->sctp_src, mask->sctp_src); + sh->dest = OVS_MASKED(sh->dest, key->sctp_dst, mask->sctp_dst); + + new_csum = sctp_compute_cksum(skb, sctphoff); + + /* Carry any checksum errors through. */ + sh->checksum = old_csum ^ old_correct_csum ^ new_csum; + + skb_clear_hash(skb); + flow_key->tp.src = sh->source; + flow_key->tp.dst = sh->dest; + + return 0; +} + +static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage); + struct vport *vport = data->vport; + + if (skb_cow_head(skb, data->l2_len) < 0) { + kfree_skb(skb); + return -ENOMEM; + } + + __skb_dst_copy(skb, data->dst); + *OVS_CB(skb) = data->cb; + skb->inner_protocol = data->inner_protocol; + skb->vlan_tci = data->vlan_tci; + skb->vlan_proto = data->vlan_proto; + + /* Reconstruct the MAC header. */ + skb_push(skb, data->l2_len); + memcpy(skb->data, &data->l2_data, data->l2_len); + skb_postpush_rcsum(skb, skb->data, data->l2_len); + skb_reset_mac_header(skb); + + ovs_vport_send(vport, skb); + return 0; +} + +static unsigned int +ovs_dst_get_mtu(const struct dst_entry *dst) +{ + return dst->dev->mtu; +} + +static struct dst_ops ovs_dst_ops = { + .family = AF_UNSPEC, + .mtu = ovs_dst_get_mtu, +}; + +/* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is + * ovs_vport_output(), which is called once per fragmented packet. + */ +static void prepare_frag(struct vport *vport, struct sk_buff *skb) +{ + unsigned int hlen = skb_network_offset(skb); + struct ovs_frag_data *data; + + data = this_cpu_ptr(&ovs_frag_data_storage); + data->dst = skb->_skb_refdst; + data->vport = vport; + data->cb = *OVS_CB(skb); + data->inner_protocol = skb->inner_protocol; + data->vlan_tci = skb->vlan_tci; + data->vlan_proto = skb->vlan_proto; + data->l2_len = hlen; + memcpy(&data->l2_data, skb->data, hlen); + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + skb_pull(skb, hlen); +} + +static void ovs_fragment(struct net *net, struct vport *vport, + struct sk_buff *skb, u16 mru, __be16 ethertype) +{ + if (skb_network_offset(skb) > MAX_L2_LEN) { + OVS_NLERR(1, "L2 header too long to fragment"); + goto err; + } + + if (ethertype == htons(ETH_P_IP)) { + struct dst_entry ovs_dst; + unsigned long orig_dst; + + prepare_frag(vport, skb); + dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1, + DST_OBSOLETE_NONE, DST_NOCOUNT); + ovs_dst.dev = vport->dev; + + orig_dst = skb->_skb_refdst; + skb_dst_set_noref(skb, &ovs_dst); + IPCB(skb)->frag_max_size = mru; + + ip_do_fragment(net, skb->sk, skb, ovs_vport_output); + refdst_drop(orig_dst); + } else if (ethertype == htons(ETH_P_IPV6)) { + const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); + unsigned long orig_dst; + struct rt6_info ovs_rt; + + if (!v6ops) { + goto err; + } + + prepare_frag(vport, skb); + memset(&ovs_rt, 0, sizeof(ovs_rt)); + dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1, + DST_OBSOLETE_NONE, DST_NOCOUNT); + ovs_rt.dst.dev = vport->dev; + + orig_dst = skb->_skb_refdst; + skb_dst_set_noref(skb, &ovs_rt.dst); + IP6CB(skb)->frag_max_size = mru; + + v6ops->fragment(net, skb->sk, skb, ovs_vport_output); + refdst_drop(orig_dst); + } else { + WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.", + ovs_vport_name(vport), ntohs(ethertype), mru, + vport->dev->mtu); + goto err; + } + + return; +err: + kfree_skb(skb); +} + +static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, + struct sw_flow_key *key) +{ + struct vport *vport = ovs_vport_rcu(dp, out_port); + + if (likely(vport)) { + u16 mru = OVS_CB(skb)->mru; + + if (likely(!mru || (skb->len <= mru + ETH_HLEN))) { + ovs_vport_send(vport, skb); + } else if (mru <= vport->dev->mtu) { + struct net *net = read_pnet(&dp->net); + __be16 ethertype = key->eth.type; + + if (!is_flow_key_valid(key)) { + if (eth_p_mpls(skb->protocol)) + ethertype = skb->inner_protocol; + else + ethertype = vlan_get_protocol(skb); + } + + ovs_fragment(net, vport, skb, mru, ethertype); + } else { + kfree_skb(skb); + } + } else { + kfree_skb(skb); + } +} + +static int output_userspace(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, const struct nlattr *attr, + const struct nlattr *actions, int actions_len) +{ + struct dp_upcall_info upcall; + const struct nlattr *a; + int rem; + + memset(&upcall, 0, sizeof(upcall)); + upcall.cmd = OVS_PACKET_CMD_ACTION; + upcall.mru = OVS_CB(skb)->mru; + + for (a = nla_data(attr), rem = nla_len(attr); rem > 0; + a = nla_next(a, &rem)) { + switch (nla_type(a)) { + case OVS_USERSPACE_ATTR_USERDATA: + upcall.userdata = a; + break; + + case OVS_USERSPACE_ATTR_PID: + upcall.portid = nla_get_u32(a); + break; + + case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: { + /* Get out tunnel info. */ + struct vport *vport; + + vport = ovs_vport_rcu(dp, nla_get_u32(a)); + if (vport) { + int err; + + err = dev_fill_metadata_dst(vport->dev, skb); + if (!err) + upcall.egress_tun_info = skb_tunnel_info(skb); + } + + break; + } + + case OVS_USERSPACE_ATTR_ACTIONS: { + /* Include actions. */ + upcall.actions = actions; + upcall.actions_len = actions_len; + break; + } + + } /* End of switch. */ + } + + return ovs_dp_upcall(dp, skb, key, &upcall); +} + +static int sample(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, const struct nlattr *attr, + const struct nlattr *actions, int actions_len) +{ + const struct nlattr *acts_list = NULL; + const struct nlattr *a; + int rem; + + for (a = nla_data(attr), rem = nla_len(attr); rem > 0; + a = nla_next(a, &rem)) { + u32 probability; + + switch (nla_type(a)) { + case OVS_SAMPLE_ATTR_PROBABILITY: + probability = nla_get_u32(a); + if (!probability || prandom_u32() > probability) + return 0; + break; + + case OVS_SAMPLE_ATTR_ACTIONS: + acts_list = a; + break; + } + } + + rem = nla_len(acts_list); + a = nla_data(acts_list); + + /* Actions list is empty, do nothing */ + if (unlikely(!rem)) + return 0; + + /* The only known usage of sample action is having a single user-space + * action. Treat this usage as a special case. + * The output_userspace() should clone the skb to be sent to the + * user space. This skb will be consumed by its caller. + */ + if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE && + nla_is_last(a, rem))) + return output_userspace(dp, skb, key, a, actions, actions_len); + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + /* Skip the sample action when out of memory. */ + return 0; + + if (!add_deferred_actions(skb, key, a)) { + if (net_ratelimit()) + pr_warn("%s: deferred actions limit reached, dropping sample action\n", + ovs_dp_name(dp)); + + kfree_skb(skb); + } + return 0; +} + +static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, + const struct nlattr *attr) +{ + struct ovs_action_hash *hash_act = nla_data(attr); + u32 hash = 0; + + /* OVS_HASH_ALG_L4 is the only possible hash algorithm. */ + hash = skb_get_hash(skb); + hash = jhash_1word(hash, hash_act->hash_basis); + if (!hash) + hash = 0x1; + + key->ovs_flow_hash = hash; +} + +static int execute_set_action(struct sk_buff *skb, + struct sw_flow_key *flow_key, + const struct nlattr *a) +{ + /* Only tunnel set execution is supported without a mask. */ + if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) { + struct ovs_tunnel_info *tun = nla_data(a); + + skb_dst_drop(skb); + dst_hold((struct dst_entry *)tun->tun_dst); + skb_dst_set(skb, (struct dst_entry *)tun->tun_dst); + return 0; + } + + return -EINVAL; +} + +/* Mask is at the midpoint of the data. */ +#define get_mask(a, type) ((const type)nla_data(a) + 1) + +static int execute_masked_set_action(struct sk_buff *skb, + struct sw_flow_key *flow_key, + const struct nlattr *a) +{ + int err = 0; + + switch (nla_type(a)) { + case OVS_KEY_ATTR_PRIORITY: + OVS_SET_MASKED(skb->priority, nla_get_u32(a), + *get_mask(a, u32 *)); + flow_key->phy.priority = skb->priority; + break; + + case OVS_KEY_ATTR_SKB_MARK: + OVS_SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *)); + flow_key->phy.skb_mark = skb->mark; + break; + + case OVS_KEY_ATTR_TUNNEL_INFO: + /* Masked data not supported for tunnel. */ + err = -EINVAL; + break; + + case OVS_KEY_ATTR_ETHERNET: + err = set_eth_addr(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_ethernet *)); + break; + + case OVS_KEY_ATTR_IPV4: + err = set_ipv4(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_ipv4 *)); + break; + + case OVS_KEY_ATTR_IPV6: + err = set_ipv6(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_ipv6 *)); + break; + + case OVS_KEY_ATTR_TCP: + err = set_tcp(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_tcp *)); + break; + + case OVS_KEY_ATTR_UDP: + err = set_udp(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_udp *)); + break; + + case OVS_KEY_ATTR_SCTP: + err = set_sctp(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_sctp *)); + break; + + case OVS_KEY_ATTR_MPLS: + err = set_mpls(skb, flow_key, nla_data(a), get_mask(a, + __be32 *)); + break; + + case OVS_KEY_ATTR_CT_STATE: + case OVS_KEY_ATTR_CT_ZONE: + case OVS_KEY_ATTR_CT_MARK: + case OVS_KEY_ATTR_CT_LABELS: + err = -EINVAL; + break; + } + + return err; +} + +static int execute_recirc(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *a, int rem) +{ + struct deferred_action *da; + + if (!is_flow_key_valid(key)) { + int err; + + err = ovs_flow_key_update(skb, key); + if (err) + return err; + } + BUG_ON(!is_flow_key_valid(key)); + + if (!nla_is_last(a, rem)) { + /* Recirc action is the not the last action + * of the action list, need to clone the skb. + */ + skb = skb_clone(skb, GFP_ATOMIC); + + /* Skip the recirc action when out of memory, but + * continue on with the rest of the action list. + */ + if (!skb) + return 0; + } + + da = add_deferred_actions(skb, key, NULL); + if (da) { + da->pkt_key.recirc_id = nla_get_u32(a); + } else { + kfree_skb(skb); + + if (net_ratelimit()) + pr_warn("%s: deferred action limit reached, drop recirc action\n", + ovs_dp_name(dp)); + } + + return 0; +} + +/* Execute a list of actions against 'skb'. */ +static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *attr, int len) +{ + /* Every output action needs a separate clone of 'skb', but the common + * case is just a single output action, so that doing a clone and + * then freeing the original skbuff is wasteful. So the following code + * is slightly obscure just to avoid that. + */ + int prev_port = -1; + const struct nlattr *a; + int rem; + + for (a = attr, rem = len; rem > 0; + a = nla_next(a, &rem)) { + int err = 0; + + if (unlikely(prev_port != -1)) { + struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC); + + if (out_skb) + do_output(dp, out_skb, prev_port, key); + + prev_port = -1; + } + + switch (nla_type(a)) { + case OVS_ACTION_ATTR_OUTPUT: + prev_port = nla_get_u32(a); + break; + + case OVS_ACTION_ATTR_USERSPACE: + output_userspace(dp, skb, key, a, attr, len); + break; + + case OVS_ACTION_ATTR_HASH: + execute_hash(skb, key, a); + break; + + case OVS_ACTION_ATTR_PUSH_MPLS: + err = push_mpls(skb, key, nla_data(a)); + break; + + case OVS_ACTION_ATTR_POP_MPLS: + err = pop_mpls(skb, key, nla_get_be16(a)); + break; + + case OVS_ACTION_ATTR_PUSH_VLAN: + err = push_vlan(skb, key, nla_data(a)); + break; + + case OVS_ACTION_ATTR_POP_VLAN: + err = pop_vlan(skb, key); + break; + + case OVS_ACTION_ATTR_RECIRC: + err = execute_recirc(dp, skb, key, a, rem); + if (nla_is_last(a, rem)) { + /* If this is the last action, the skb has + * been consumed or freed. + * Return immediately. + */ + return err; + } + break; + + case OVS_ACTION_ATTR_SET: + err = execute_set_action(skb, key, nla_data(a)); + break; + + case OVS_ACTION_ATTR_SET_MASKED: + case OVS_ACTION_ATTR_SET_TO_MASKED: + err = execute_masked_set_action(skb, key, nla_data(a)); + break; + + case OVS_ACTION_ATTR_SAMPLE: + err = sample(dp, skb, key, a, attr, len); + break; + + case OVS_ACTION_ATTR_CT: + if (!is_flow_key_valid(key)) { + err = ovs_flow_key_update(skb, key); + if (err) + return err; + } + + err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key, + nla_data(a)); + + /* Hide stolen IP fragments from user space. */ + if (err) + return err == -EINPROGRESS ? 0 : err; + break; + } + + if (unlikely(err)) { + kfree_skb(skb); + return err; + } + } + + if (prev_port != -1) + do_output(dp, skb, prev_port, key); + else + consume_skb(skb); + + return 0; +} + +static void process_deferred_actions(struct datapath *dp) +{ + struct action_fifo *fifo = this_cpu_ptr(action_fifos); + + /* Do not touch the FIFO in case there is no deferred actions. */ + if (action_fifo_is_empty(fifo)) + return; + + /* Finishing executing all deferred actions. */ + do { + struct deferred_action *da = action_fifo_get(fifo); + struct sk_buff *skb = da->skb; + struct sw_flow_key *key = &da->pkt_key; + const struct nlattr *actions = da->actions; + + if (actions) + do_execute_actions(dp, skb, key, actions, + nla_len(actions)); + else + ovs_dp_process_packet(skb, key); + } while (!action_fifo_is_empty(fifo)); + + /* Reset FIFO for the next packet. */ + action_fifo_init(fifo); +} + +/* Execute a list of actions against 'skb'. */ +int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_actions *acts, + struct sw_flow_key *key) +{ + int level = this_cpu_read(exec_actions_level); + int err; + + this_cpu_inc(exec_actions_level); + err = do_execute_actions(dp, skb, key, + acts->actions, acts->actions_len); + + if (!level) + process_deferred_actions(dp); + + this_cpu_dec(exec_actions_level); + return err; +} + +int action_fifos_init(void) +{ + action_fifos = alloc_percpu(struct action_fifo); + if (!action_fifos) + return -ENOMEM; + + return 0; +} + +void action_fifos_exit(void) +{ + free_percpu(action_fifos); +} diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c new file mode 100644 index 000000000000..1829adb23505 --- /dev/null +++ b/net/openvswitch/conntrack.c @@ -0,0 +1,822 @@ +/* + * Copyright (c) 2015 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/module.h> +#include <linux/openvswitch.h> +#include <net/ip.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_labels.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> + +#include "datapath.h" +#include "conntrack.h" +#include "flow.h" +#include "flow_netlink.h" + +struct ovs_ct_len_tbl { + size_t maxlen; + size_t minlen; +}; + +/* Metadata mark for masked write to conntrack mark */ +struct md_mark { + u32 value; + u32 mask; +}; + +/* Metadata label for masked write to conntrack label. */ +struct md_labels { + struct ovs_key_ct_labels value; + struct ovs_key_ct_labels mask; +}; + +/* Conntrack action context for execution. */ +struct ovs_conntrack_info { + struct nf_conntrack_helper *helper; + struct nf_conntrack_zone zone; + struct nf_conn *ct; + u8 commit : 1; + u16 family; + struct md_mark mark; + struct md_labels labels; +}; + +static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); + +static u16 key_to_nfproto(const struct sw_flow_key *key) +{ + switch (ntohs(key->eth.type)) { + case ETH_P_IP: + return NFPROTO_IPV4; + case ETH_P_IPV6: + return NFPROTO_IPV6; + default: + return NFPROTO_UNSPEC; + } +} + +/* Map SKB connection state into the values used by flow definition. */ +static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) +{ + u8 ct_state = OVS_CS_F_TRACKED; + + switch (ctinfo) { + case IP_CT_ESTABLISHED_REPLY: + case IP_CT_RELATED_REPLY: + case IP_CT_NEW_REPLY: + ct_state |= OVS_CS_F_REPLY_DIR; + break; + default: + break; + } + + switch (ctinfo) { + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED_REPLY: + ct_state |= OVS_CS_F_ESTABLISHED; + break; + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + ct_state |= OVS_CS_F_RELATED; + break; + case IP_CT_NEW: + case IP_CT_NEW_REPLY: + ct_state |= OVS_CS_F_NEW; + break; + default: + break; + } + + return ct_state; +} + +static u32 ovs_ct_get_mark(const struct nf_conn *ct) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + return ct ? ct->mark : 0; +#else + return 0; +#endif +} + +static void ovs_ct_get_labels(const struct nf_conn *ct, + struct ovs_key_ct_labels *labels) +{ + struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL; + + if (cl) { + size_t len = cl->words * sizeof(long); + + if (len > OVS_CT_LABELS_LEN) + len = OVS_CT_LABELS_LEN; + else if (len < OVS_CT_LABELS_LEN) + memset(labels, 0, OVS_CT_LABELS_LEN); + memcpy(labels, cl->bits, len); + } else { + memset(labels, 0, OVS_CT_LABELS_LEN); + } +} + +static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, + const struct nf_conntrack_zone *zone, + const struct nf_conn *ct) +{ + key->ct.state = state; + key->ct.zone = zone->id; + key->ct.mark = ovs_ct_get_mark(ct); + ovs_ct_get_labels(ct, &key->ct.labels); +} + +/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has + * previously sent the packet to conntrack via the ct action. + */ +static void ovs_ct_update_key(const struct sk_buff *skb, + const struct ovs_conntrack_info *info, + struct sw_flow_key *key, bool post_ct) +{ + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u8 state = 0; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + state = ovs_ct_get_state(ctinfo); + if (!nf_ct_is_confirmed(ct)) + state |= OVS_CS_F_NEW; + if (ct->master) + state |= OVS_CS_F_RELATED; + zone = nf_ct_zone(ct); + } else if (post_ct) { + state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; + if (info) + zone = &info->zone; + } + __ovs_ct_update_key(key, state, zone, ct); +} + +void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) +{ + ovs_ct_update_key(skb, NULL, key, false); +} + +int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) +{ + if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state)) + return -EMSGSIZE; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone)) + return -EMSGSIZE; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark)) + return -EMSGSIZE; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(key->ct.labels), + &key->ct.labels)) + return -EMSGSIZE; + + return 0; +} + +static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, + u32 ct_mark, u32 mask) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u32 new_mark; + + + /* The connection could be invalid, in which case set_mark is no-op. */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return 0; + + new_mark = ct_mark | (ct->mark & ~(mask)); + if (ct->mark != new_mark) { + ct->mark = new_mark; + nf_conntrack_event_cache(IPCT_MARK, ct); + key->ct.mark = new_mark; + } + + return 0; +#else + return -ENOTSUPP; +#endif +} + +static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_key_ct_labels *labels, + const struct ovs_key_ct_labels *mask) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn_labels *cl; + struct nf_conn *ct; + int err; + + /* The connection could be invalid, in which case set_label is no-op.*/ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return 0; + + cl = nf_ct_labels_find(ct); + if (!cl) { + nf_ct_labels_ext_add(ct); + cl = nf_ct_labels_find(ct); + } + if (!cl || cl->words * sizeof(long) < OVS_CT_LABELS_LEN) + return -ENOSPC; + + err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask, + OVS_CT_LABELS_LEN / sizeof(u32)); + if (err) + return err; + + ovs_ct_get_labels(ct, &key->ct.labels); + return 0; +} + +/* 'skb' should already be pulled to nh_ofs. */ +static int ovs_ct_helper(struct sk_buff *skb, u16 proto) +{ + const struct nf_conntrack_helper *helper; + const struct nf_conn_help *help; + enum ip_conntrack_info ctinfo; + unsigned int protoff; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; + + help = nfct_help(ct); + if (!help) + return NF_ACCEPT; + + helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + + switch (proto) { + case NFPROTO_IPV4: + protoff = ip_hdrlen(skb); + break; + case NFPROTO_IPV6: { + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + __be16 frag_off; + int ofs; + + ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + return NF_ACCEPT; + } + protoff = ofs; + break; + } + default: + WARN_ONCE(1, "helper invoked on non-IP family!"); + return NF_DROP; + } + + return helper->help(skb, protoff, ct, ctinfo); +} + +/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero + * value if 'skb' is freed. + */ +static int handle_fragments(struct net *net, struct sw_flow_key *key, + u16 zone, struct sk_buff *skb) +{ + struct ovs_skb_cb ovs_cb = *OVS_CB(skb); + + if (key->eth.type == htons(ETH_P_IP)) { + enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; + int err; + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + err = ip_defrag(net, skb, user); + if (err) + return err; + + ovs_cb.mru = IPCB(skb)->frag_max_size; +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + } else if (key->eth.type == htons(ETH_P_IPV6)) { + enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; + struct sk_buff *reasm; + + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + reasm = nf_ct_frag6_gather(net, skb, user); + if (!reasm) + return -EINPROGRESS; + + if (skb == reasm) { + kfree_skb(skb); + return -EINVAL; + } + + /* Don't free 'skb' even though it is one of the original + * fragments, as we're going to morph it into the head. + */ + skb_get(skb); + nf_ct_frag6_consume_orig(reasm); + + key->ip.proto = ipv6_hdr(reasm)->nexthdr; + skb_morph(skb, reasm); + skb->next = reasm->next; + consume_skb(reasm); + ovs_cb.mru = IP6CB(skb)->frag_max_size; +#endif + } else { + kfree_skb(skb); + return -EPFNOSUPPORT; + } + + key->ip.frag = OVS_FRAG_TYPE_NONE; + skb_clear_hash(skb); + skb->ignore_df = 1; + *OVS_CB(skb) = ovs_cb; + + return 0; +} + +static struct nf_conntrack_expect * +ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, + u16 proto, const struct sk_buff *skb) +{ + struct nf_conntrack_tuple tuple; + struct nf_conntrack_expect *exp; + + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple)) + return NULL; + + exp = __nf_ct_expect_find(net, zone, &tuple); + if (exp) { + struct nf_conntrack_tuple_hash *h; + + /* Delete existing conntrack entry, if it clashes with the + * expectation. This can happen since conntrack ALGs do not + * check for clashes between (new) expectations and existing + * conntrack entries. nf_conntrack_in() will check the + * expectations only if a conntrack entry can not be found, + * which can lead to OVS finding the expectation (here) in the + * init direction, but which will not be removed by the + * nf_conntrack_in() call, if a matching conntrack entry is + * found instead. In this case all init direction packets + * would be reported as new related packets, while reply + * direction packets would be reported as un-related + * established packets. + */ + h = nf_conntrack_find_get(net, zone, &tuple); + if (h) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + nf_ct_delete(ct, 0, 0); + nf_conntrack_put(&ct->ct_general); + } + } + + return exp; +} + +/* Determine whether skb->nfct is equal to the result of conntrack lookup. */ +static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, + const struct ovs_conntrack_info *info) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return false; + if (!net_eq(net, read_pnet(&ct->ct_net))) + return false; + if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct))) + return false; + if (info->helper) { + struct nf_conn_help *help; + + help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER); + if (help && rcu_access_pointer(help->helper) != info->helper) + return false; + } + + return true; +} + +static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + /* If we are recirculating packets to match on conntrack fields and + * committing with a separate conntrack action, then we don't need to + * actually run the packet through conntrack twice unless it's for a + * different zone. + */ + if (!skb_nfct_cached(net, skb, info)) { + struct nf_conn *tmpl = info->ct; + + /* Associate skb with specified zone. */ + if (tmpl) { + if (skb->nfct) + nf_conntrack_put(skb->nfct); + nf_conntrack_get(&tmpl->ct_general); + skb->nfct = &tmpl->ct_general; + skb->nfctinfo = IP_CT_NEW; + } + + if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING, + skb) != NF_ACCEPT) + return -ENOENT; + + if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) { + WARN_ONCE(1, "helper rejected packet"); + return -EINVAL; + } + } + + ovs_ct_update_key(skb, info, key, true); + + return 0; +} + +/* Lookup connection and read fields into key. */ +static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + struct nf_conntrack_expect *exp; + + exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); + if (exp) { + u8 state; + + state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; + __ovs_ct_update_key(key, state, &info->zone, exp->master); + } else { + int err; + + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; + } + + return 0; +} + +/* Lookup connection and confirm if unconfirmed. */ +static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + u8 state; + int err; + + state = key->ct.state; + if (key->ct.zone == info->zone.id && + ((state & OVS_CS_F_TRACKED) && !(state & OVS_CS_F_NEW))) { + /* Previous lookup has shown that this connection is already + * tracked and committed. Skip committing. + */ + return 0; + } + + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; + if (nf_conntrack_confirm(skb) != NF_ACCEPT) + return -EINVAL; + + return 0; +} + +static bool labels_nonzero(const struct ovs_key_ct_labels *labels) +{ + size_t i; + + for (i = 0; i < sizeof(*labels); i++) + if (labels->ct_labels[i]) + return true; + + return false; +} + +/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero + * value if 'skb' is freed. + */ +int ovs_ct_execute(struct net *net, struct sk_buff *skb, + struct sw_flow_key *key, + const struct ovs_conntrack_info *info) +{ + int nh_ofs; + int err; + + /* The conntrack module expects to be working at L3. */ + nh_ofs = skb_network_offset(skb); + skb_pull_rcsum(skb, nh_ofs); + + if (key->ip.frag != OVS_FRAG_TYPE_NONE) { + err = handle_fragments(net, key, info->zone.id, skb); + if (err) + return err; + } + + if (info->commit) + err = ovs_ct_commit(net, key, info, skb); + else + err = ovs_ct_lookup(net, key, info, skb); + if (err) + goto err; + + if (info->mark.mask) { + err = ovs_ct_set_mark(skb, key, info->mark.value, + info->mark.mask); + if (err) + goto err; + } + if (labels_nonzero(&info->labels.mask)) + err = ovs_ct_set_labels(skb, key, &info->labels.value, + &info->labels.mask); +err: + skb_push(skb, nh_ofs); + skb_postpush_rcsum(skb, skb->data, nh_ofs); + if (err) + kfree_skb(skb); + return err; +} + +static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, + const struct sw_flow_key *key, bool log) +{ + struct nf_conntrack_helper *helper; + struct nf_conn_help *help; + + helper = nf_conntrack_helper_try_module_get(name, info->family, + key->ip.proto); + if (!helper) { + OVS_NLERR(log, "Unknown helper \"%s\"", name); + return -EINVAL; + } + + help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL); + if (!help) { + module_put(helper->me); + return -ENOMEM; + } + + rcu_assign_pointer(help->helper, helper); + info->helper = helper; + return 0; +} + +static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { + [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 }, + [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), + .maxlen = sizeof(u16) }, + [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark), + .maxlen = sizeof(struct md_mark) }, + [OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels), + .maxlen = sizeof(struct md_labels) }, + [OVS_CT_ATTR_HELPER] = { .minlen = 1, + .maxlen = NF_CT_HELPER_NAME_LEN } +}; + +static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, + const char **helper, bool log) +{ + struct nlattr *a; + int rem; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + int maxlen; + int minlen; + + if (type > OVS_CT_ATTR_MAX) { + OVS_NLERR(log, + "Unknown conntrack attr (type=%d, max=%d)", + type, OVS_CT_ATTR_MAX); + return -EINVAL; + } + + maxlen = ovs_ct_attr_lens[type].maxlen; + minlen = ovs_ct_attr_lens[type].minlen; + if (nla_len(a) < minlen || nla_len(a) > maxlen) { + OVS_NLERR(log, + "Conntrack attr type has unexpected length (type=%d, length=%d, expected=%d)", + type, nla_len(a), maxlen); + return -EINVAL; + } + + switch (type) { + case OVS_CT_ATTR_COMMIT: + info->commit = true; + break; +#ifdef CONFIG_NF_CONNTRACK_ZONES + case OVS_CT_ATTR_ZONE: + info->zone.id = nla_get_u16(a); + break; +#endif +#ifdef CONFIG_NF_CONNTRACK_MARK + case OVS_CT_ATTR_MARK: { + struct md_mark *mark = nla_data(a); + + if (!mark->mask) { + OVS_NLERR(log, "ct_mark mask cannot be 0"); + return -EINVAL; + } + info->mark = *mark; + break; + } +#endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case OVS_CT_ATTR_LABELS: { + struct md_labels *labels = nla_data(a); + + if (!labels_nonzero(&labels->mask)) { + OVS_NLERR(log, "ct_labels mask cannot be 0"); + return -EINVAL; + } + info->labels = *labels; + break; + } +#endif + case OVS_CT_ATTR_HELPER: + *helper = nla_data(a); + if (!memchr(*helper, '\0', nla_len(a))) { + OVS_NLERR(log, "Invalid conntrack helper"); + return -EINVAL; + } + break; + default: + OVS_NLERR(log, "Unknown conntrack attr (%d)", + type); + return -EINVAL; + } + } + + if (rem > 0) { + OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem); + return -EINVAL; + } + + return 0; +} + +bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr) +{ + if (attr == OVS_KEY_ATTR_CT_STATE) + return true; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + attr == OVS_KEY_ATTR_CT_ZONE) + return true; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + attr == OVS_KEY_ATTR_CT_MARK) + return true; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + attr == OVS_KEY_ATTR_CT_LABELS) { + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + return ovs_net->xt_label; + } + + return false; +} + +int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, bool log) +{ + struct ovs_conntrack_info ct_info; + const char *helper = NULL; + u16 family; + int err; + + family = key_to_nfproto(key); + if (family == NFPROTO_UNSPEC) { + OVS_NLERR(log, "ct family unspecified"); + return -EINVAL; + } + + memset(&ct_info, 0, sizeof(ct_info)); + ct_info.family = family; + + nf_ct_zone_init(&ct_info.zone, NF_CT_DEFAULT_ZONE_ID, + NF_CT_DEFAULT_ZONE_DIR, 0); + + err = parse_ct(attr, &ct_info, &helper, log); + if (err) + return err; + + /* Set up template for tracking connections in specific zones. */ + ct_info.ct = nf_ct_tmpl_alloc(net, &ct_info.zone, GFP_KERNEL); + if (!ct_info.ct) { + OVS_NLERR(log, "Failed to allocate conntrack template"); + return -ENOMEM; + } + + __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); + nf_conntrack_get(&ct_info.ct->ct_general); + + if (helper) { + err = ovs_ct_add_helper(&ct_info, helper, key, log); + if (err) + goto err_free_ct; + } + + err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info, + sizeof(ct_info), log); + if (err) + goto err_free_ct; + + return 0; +err_free_ct: + __ovs_ct_free_action(&ct_info); + return err; +} + +int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, + struct sk_buff *skb) +{ + struct nlattr *start; + + start = nla_nest_start(skb, OVS_ACTION_ATTR_CT); + if (!start) + return -EMSGSIZE; + + if (ct_info->commit && nla_put_flag(skb, OVS_CT_ATTR_COMMIT)) + return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id)) + return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && ct_info->mark.mask && + nla_put(skb, OVS_CT_ATTR_MARK, sizeof(ct_info->mark), + &ct_info->mark)) + return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + labels_nonzero(&ct_info->labels.mask) && + nla_put(skb, OVS_CT_ATTR_LABELS, sizeof(ct_info->labels), + &ct_info->labels)) + return -EMSGSIZE; + if (ct_info->helper) { + if (nla_put_string(skb, OVS_CT_ATTR_HELPER, + ct_info->helper->name)) + return -EMSGSIZE; + } + + nla_nest_end(skb, start); + + return 0; +} + +void ovs_ct_free_action(const struct nlattr *a) +{ + struct ovs_conntrack_info *ct_info = nla_data(a); + + __ovs_ct_free_action(ct_info); +} + +static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info) +{ + if (ct_info->helper) + module_put(ct_info->helper->me); + if (ct_info->ct) + nf_ct_put(ct_info->ct); +} + +void ovs_ct_init(struct net *net) +{ + unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE; + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + if (nf_connlabels_get(net, n_bits)) { + ovs_net->xt_label = false; + OVS_NLERR(true, "Failed to set connlabel length"); + } else { + ovs_net->xt_label = true; + } +} + +void ovs_ct_exit(struct net *net) +{ + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + if (ovs_net->xt_label) + nf_connlabels_put(net); +} diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h new file mode 100644 index 000000000000..a7544f405c16 --- /dev/null +++ b/net/openvswitch/conntrack.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2015 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OVS_CONNTRACK_H +#define OVS_CONNTRACK_H 1 + +#include "flow.h" + +struct ovs_conntrack_info; +enum ovs_key_attr; + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +void ovs_ct_init(struct net *); +void ovs_ct_exit(struct net *); +bool ovs_ct_verify(struct net *, enum ovs_key_attr attr); +int ovs_ct_copy_action(struct net *, const struct nlattr *, + const struct sw_flow_key *, struct sw_flow_actions **, + bool log); +int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, struct sk_buff *); + +int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *, + const struct ovs_conntrack_info *); + +void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); +int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb); +void ovs_ct_free_action(const struct nlattr *a); + +#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \ + OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \ + OVS_CS_F_INVALID | OVS_CS_F_TRACKED) +#else +#include <linux/errno.h> + +static inline void ovs_ct_init(struct net *net) { } + +static inline void ovs_ct_exit(struct net *net) { } + +static inline bool ovs_ct_verify(struct net *net, int attr) +{ + return false; +} + +static inline int ovs_ct_copy_action(struct net *net, const struct nlattr *nla, + const struct sw_flow_key *key, + struct sw_flow_actions **acts, bool log) +{ + return -ENOTSUPP; +} + +static inline int ovs_ct_action_to_attr(const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + return -ENOTSUPP; +} + +static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb, + struct sw_flow_key *key, + const struct ovs_conntrack_info *info) +{ + kfree_skb(skb); + return -ENOTSUPP; +} + +static inline void ovs_ct_fill_key(const struct sk_buff *skb, + struct sw_flow_key *key) +{ + key->ct.state = 0; + key->ct.zone = 0; + key->ct.mark = 0; + memset(&key->ct.labels, 0, sizeof(key->ct.labels)); +} + +static inline int ovs_ct_put_key(const struct sw_flow_key *key, + struct sk_buff *skb) +{ + return 0; +} + +static inline void ovs_ct_free_action(const struct nlattr *a) { } + +#define CT_SUPPORTED_MASK 0 +#endif /* CONFIG_NF_CONNTRACK */ +#endif /* ovs_conntrack.h */ diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c new file mode 100644 index 000000000000..82ca7fe7a163 --- /dev/null +++ b/net/openvswitch/datapath.c @@ -0,0 +1,2388 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/if_arp.h> +#include <linux/if_vlan.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/jhash.h> +#include <linux/delay.h> +#include <linux/time.h> +#include <linux/etherdevice.h> +#include <linux/genetlink.h> +#include <linux/kernel.h> +#include <linux/kthread.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/rcupdate.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/ethtool.h> +#include <linux/wait.h> +#include <asm/div64.h> +#include <linux/highmem.h> +#include <linux/netfilter_bridge.h> +#include <linux/netfilter_ipv4.h> +#include <linux/inetdevice.h> +#include <linux/list.h> +#include <linux/openvswitch.h> +#include <linux/rculist.h> +#include <linux/dmi.h> +#include <net/genetlink.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> + +#include "datapath.h" +#include "flow.h" +#include "flow_table.h" +#include "flow_netlink.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + +int ovs_net_id __read_mostly; +EXPORT_SYMBOL_GPL(ovs_net_id); + +static struct genl_family dp_packet_genl_family; +static struct genl_family dp_flow_genl_family; +static struct genl_family dp_datapath_genl_family; + +static const struct nla_policy flow_policy[]; + +static const struct genl_multicast_group ovs_dp_flow_multicast_group = { + .name = OVS_FLOW_MCGROUP, +}; + +static const struct genl_multicast_group ovs_dp_datapath_multicast_group = { + .name = OVS_DATAPATH_MCGROUP, +}; + +static const struct genl_multicast_group ovs_dp_vport_multicast_group = { + .name = OVS_VPORT_MCGROUP, +}; + +/* Check if need to build a reply message. + * OVS userspace sets the NLM_F_ECHO flag if it needs the reply. */ +static bool ovs_must_notify(struct genl_family *family, struct genl_info *info, + unsigned int group) +{ + return info->nlhdr->nlmsg_flags & NLM_F_ECHO || + genl_has_listeners(family, genl_info_net(info), group); +} + +static void ovs_notify(struct genl_family *family, + struct sk_buff *skb, struct genl_info *info) +{ + genl_notify(family, skb, info, 0, GFP_KERNEL); +} + +/** + * DOC: Locking: + * + * All writes e.g. Writes to device state (add/remove datapath, port, set + * operations on vports, etc.), Writes to other state (flow table + * modifications, set miscellaneous datapath parameters, etc.) are protected + * by ovs_lock. + * + * Reads are protected by RCU. + * + * There are a few special cases (mostly stats) that have their own + * synchronization but they nest under all of above and don't interact with + * each other. + * + * The RTNL lock nests inside ovs_mutex. + */ + +static DEFINE_MUTEX(ovs_mutex); + +void ovs_lock(void) +{ + mutex_lock(&ovs_mutex); +} + +void ovs_unlock(void) +{ + mutex_unlock(&ovs_mutex); +} + +#ifdef CONFIG_LOCKDEP +int lockdep_ovsl_is_held(void) +{ + if (debug_locks) + return lockdep_is_held(&ovs_mutex); + else + return 1; +} +EXPORT_SYMBOL_GPL(lockdep_ovsl_is_held); +#endif + +static struct vport *new_vport(const struct vport_parms *); +static int queue_gso_packets(struct datapath *dp, struct sk_buff *, + const struct sw_flow_key *, + const struct dp_upcall_info *); +static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, + const struct sw_flow_key *, + const struct dp_upcall_info *); + +/* Must be called with rcu_read_lock. */ +static struct datapath *get_dp_rcu(struct net *net, int dp_ifindex) +{ + struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex); + + if (dev) { + struct vport *vport = ovs_internal_dev_get_vport(dev); + if (vport) + return vport->dp; + } + + return NULL; +} + +/* The caller must hold either ovs_mutex or rcu_read_lock to keep the + * returned dp pointer valid. + */ +static inline struct datapath *get_dp(struct net *net, int dp_ifindex) +{ + struct datapath *dp; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); + rcu_read_lock(); + dp = get_dp_rcu(net, dp_ifindex); + rcu_read_unlock(); + + return dp; +} + +/* Must be called with rcu_read_lock or ovs_mutex. */ +const char *ovs_dp_name(const struct datapath *dp) +{ + struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); + return ovs_vport_name(vport); +} + +static int get_dpifindex(const struct datapath *dp) +{ + struct vport *local; + int ifindex; + + rcu_read_lock(); + + local = ovs_vport_rcu(dp, OVSP_LOCAL); + if (local) + ifindex = local->dev->ifindex; + else + ifindex = 0; + + rcu_read_unlock(); + + return ifindex; +} + +static void destroy_dp_rcu(struct rcu_head *rcu) +{ + struct datapath *dp = container_of(rcu, struct datapath, rcu); + + ovs_flow_tbl_destroy(&dp->table); + free_percpu(dp->stats_percpu); + kfree(dp->ports); + kfree(dp); +} + +static struct hlist_head *vport_hash_bucket(const struct datapath *dp, + u16 port_no) +{ + return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)]; +} + +/* Called with ovs_mutex or RCU read lock. */ +struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no) +{ + struct vport *vport; + struct hlist_head *head; + + head = vport_hash_bucket(dp, port_no); + hlist_for_each_entry_rcu(vport, head, dp_hash_node) { + if (vport->port_no == port_no) + return vport; + } + return NULL; +} + +/* Called with ovs_mutex. */ +static struct vport *new_vport(const struct vport_parms *parms) +{ + struct vport *vport; + + vport = ovs_vport_add(parms); + if (!IS_ERR(vport)) { + struct datapath *dp = parms->dp; + struct hlist_head *head = vport_hash_bucket(dp, vport->port_no); + + hlist_add_head_rcu(&vport->dp_hash_node, head); + } + return vport; +} + +void ovs_dp_detach_port(struct vport *p) +{ + ASSERT_OVSL(); + + /* First drop references to device. */ + hlist_del_rcu(&p->dp_hash_node); + + /* Then destroy it. */ + ovs_vport_del(p); +} + +/* Must be called with rcu_read_lock. */ +void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) +{ + const struct vport *p = OVS_CB(skb)->input_vport; + struct datapath *dp = p->dp; + struct sw_flow *flow; + struct sw_flow_actions *sf_acts; + struct dp_stats_percpu *stats; + u64 *stats_counter; + u32 n_mask_hit; + + stats = this_cpu_ptr(dp->stats_percpu); + + /* Look up flow. */ + flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit); + if (unlikely(!flow)) { + struct dp_upcall_info upcall; + int error; + + memset(&upcall, 0, sizeof(upcall)); + upcall.cmd = OVS_PACKET_CMD_MISS; + upcall.portid = ovs_vport_find_upcall_portid(p, skb); + upcall.mru = OVS_CB(skb)->mru; + error = ovs_dp_upcall(dp, skb, key, &upcall); + if (unlikely(error)) + kfree_skb(skb); + else + consume_skb(skb); + stats_counter = &stats->n_missed; + goto out; + } + + ovs_flow_stats_update(flow, key->tp.flags, skb); + sf_acts = rcu_dereference(flow->sf_acts); + ovs_execute_actions(dp, skb, sf_acts, key); + + stats_counter = &stats->n_hit; + +out: + /* Update datapath statistics. */ + u64_stats_update_begin(&stats->syncp); + (*stats_counter)++; + stats->n_mask_hit += n_mask_hit; + u64_stats_update_end(&stats->syncp); +} + +int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_key *key, + const struct dp_upcall_info *upcall_info) +{ + struct dp_stats_percpu *stats; + int err; + + if (upcall_info->portid == 0) { + err = -ENOTCONN; + goto err; + } + + if (!skb_is_gso(skb)) + err = queue_userspace_packet(dp, skb, key, upcall_info); + else + err = queue_gso_packets(dp, skb, key, upcall_info); + if (err) + goto err; + + return 0; + +err: + stats = this_cpu_ptr(dp->stats_percpu); + + u64_stats_update_begin(&stats->syncp); + stats->n_lost++; + u64_stats_update_end(&stats->syncp); + + return err; +} + +static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_key *key, + const struct dp_upcall_info *upcall_info) +{ + unsigned short gso_type = skb_shinfo(skb)->gso_type; + struct sw_flow_key later_key; + struct sk_buff *segs, *nskb; + int err; + + BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET); + segs = __skb_gso_segment(skb, NETIF_F_SG, false); + if (IS_ERR(segs)) + return PTR_ERR(segs); + if (segs == NULL) + return -EINVAL; + + if (gso_type & SKB_GSO_UDP) { + /* The initial flow key extracted by ovs_flow_key_extract() + * in this case is for a first fragment, so we need to + * properly mark later fragments. + */ + later_key = *key; + later_key.ip.frag = OVS_FRAG_TYPE_LATER; + } + + /* Queue all of the segments. */ + skb = segs; + do { + if (gso_type & SKB_GSO_UDP && skb != segs) + key = &later_key; + + err = queue_userspace_packet(dp, skb, key, upcall_info); + if (err) + break; + + } while ((skb = skb->next)); + + /* Free all of the segments. */ + skb = segs; + do { + nskb = skb->next; + if (err) + kfree_skb(skb); + else + consume_skb(skb); + } while ((skb = nskb)); + return err; +} + +static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, + unsigned int hdrlen) +{ + size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ + + nla_total_size(ovs_key_attr_size()); /* OVS_PACKET_ATTR_KEY */ + + /* OVS_PACKET_ATTR_USERDATA */ + if (upcall_info->userdata) + size += NLA_ALIGN(upcall_info->userdata->nla_len); + + /* OVS_PACKET_ATTR_EGRESS_TUN_KEY */ + if (upcall_info->egress_tun_info) + size += nla_total_size(ovs_tun_key_attr_size()); + + /* OVS_PACKET_ATTR_ACTIONS */ + if (upcall_info->actions_len) + size += nla_total_size(upcall_info->actions_len); + + /* OVS_PACKET_ATTR_MRU */ + if (upcall_info->mru) + size += nla_total_size(sizeof(upcall_info->mru)); + + return size; +} + +static void pad_packet(struct datapath *dp, struct sk_buff *skb) +{ + if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { + size_t plen = NLA_ALIGN(skb->len) - skb->len; + + if (plen > 0) + memset(skb_put(skb, plen), 0, plen); + } +} + +static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_key *key, + const struct dp_upcall_info *upcall_info) +{ + struct ovs_header *upcall; + struct sk_buff *nskb = NULL; + struct sk_buff *user_skb = NULL; /* to be queued to userspace */ + struct nlattr *nla; + struct genl_info info = { + .dst_sk = ovs_dp_get_net(dp)->genl_sock, + .snd_portid = upcall_info->portid, + }; + size_t len; + unsigned int hlen; + int err, dp_ifindex; + + dp_ifindex = get_dpifindex(dp); + if (!dp_ifindex) + return -ENODEV; + + if (skb_vlan_tag_present(skb)) { + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + nskb = __vlan_hwaccel_push_inside(nskb); + if (!nskb) + return -ENOMEM; + + skb = nskb; + } + + if (nla_attr_size(skb->len) > USHRT_MAX) { + err = -EFBIG; + goto out; + } + + /* Complete checksum if needed */ + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_checksum_help(skb))) + goto out; + + /* Older versions of OVS user space enforce alignment of the last + * Netlink attribute to NLA_ALIGNTO which would require extensive + * padding logic. Only perform zerocopy if padding is not required. + */ + if (dp->user_features & OVS_DP_F_UNALIGNED) + hlen = skb_zerocopy_headlen(skb); + else + hlen = skb->len; + + len = upcall_msg_size(upcall_info, hlen); + user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC); + if (!user_skb) { + err = -ENOMEM; + goto out; + } + + upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family, + 0, upcall_info->cmd); + upcall->dp_ifindex = dp_ifindex; + + err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb); + BUG_ON(err); + + if (upcall_info->userdata) + __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA, + nla_len(upcall_info->userdata), + nla_data(upcall_info->userdata)); + + if (upcall_info->egress_tun_info) { + nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY); + err = ovs_nla_put_tunnel_info(user_skb, + upcall_info->egress_tun_info); + BUG_ON(err); + nla_nest_end(user_skb, nla); + } + + if (upcall_info->actions_len) { + nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS); + err = ovs_nla_put_actions(upcall_info->actions, + upcall_info->actions_len, + user_skb); + if (!err) + nla_nest_end(user_skb, nla); + else + nla_nest_cancel(user_skb, nla); + } + + /* Add OVS_PACKET_ATTR_MRU */ + if (upcall_info->mru) { + if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, + upcall_info->mru)) { + err = -ENOBUFS; + goto out; + } + pad_packet(dp, user_skb); + } + + /* Only reserve room for attribute header, packet data is added + * in skb_zerocopy() */ + if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { + err = -ENOBUFS; + goto out; + } + nla->nla_len = nla_attr_size(skb->len); + + err = skb_zerocopy(user_skb, skb, skb->len, hlen); + if (err) + goto out; + + /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */ + pad_packet(dp, user_skb); + + ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; + + err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid); + user_skb = NULL; +out: + if (err) + skb_tx_error(skb); + kfree_skb(user_skb); + kfree_skb(nskb); + return err; +} + +static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) +{ + struct ovs_header *ovs_header = info->userhdr; + struct net *net = sock_net(skb->sk); + struct nlattr **a = info->attrs; + struct sw_flow_actions *acts; + struct sk_buff *packet; + struct sw_flow *flow; + struct sw_flow_actions *sf_acts; + struct datapath *dp; + struct ethhdr *eth; + struct vport *input_vport; + u16 mru = 0; + int len; + int err; + bool log = !a[OVS_PACKET_ATTR_PROBE]; + + err = -EINVAL; + if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] || + !a[OVS_PACKET_ATTR_ACTIONS]) + goto err; + + len = nla_len(a[OVS_PACKET_ATTR_PACKET]); + packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL); + err = -ENOMEM; + if (!packet) + goto err; + skb_reserve(packet, NET_IP_ALIGN); + + nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len); + + skb_reset_mac_header(packet); + eth = eth_hdr(packet); + + /* Normally, setting the skb 'protocol' field would be handled by a + * call to eth_type_trans(), but it assumes there's a sending + * device, which we may not have. */ + if (eth_proto_is_802_3(eth->h_proto)) + packet->protocol = eth->h_proto; + else + packet->protocol = htons(ETH_P_802_2); + + /* Set packet's mru */ + if (a[OVS_PACKET_ATTR_MRU]) { + mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]); + packet->ignore_df = 1; + } + OVS_CB(packet)->mru = mru; + + /* Build an sw_flow for sending this packet. */ + flow = ovs_flow_alloc(); + err = PTR_ERR(flow); + if (IS_ERR(flow)) + goto err_kfree_skb; + + err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY], + packet, &flow->key, log); + if (err) + goto err_flow_free; + + err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS], + &flow->key, &acts, log); + if (err) + goto err_flow_free; + + rcu_assign_pointer(flow->sf_acts, acts); + packet->priority = flow->key.phy.priority; + packet->mark = flow->key.phy.skb_mark; + + rcu_read_lock(); + dp = get_dp_rcu(net, ovs_header->dp_ifindex); + err = -ENODEV; + if (!dp) + goto err_unlock; + + input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port); + if (!input_vport) + input_vport = ovs_vport_rcu(dp, OVSP_LOCAL); + + if (!input_vport) + goto err_unlock; + + packet->dev = input_vport->dev; + OVS_CB(packet)->input_vport = input_vport; + sf_acts = rcu_dereference(flow->sf_acts); + + local_bh_disable(); + err = ovs_execute_actions(dp, packet, sf_acts, &flow->key); + local_bh_enable(); + rcu_read_unlock(); + + ovs_flow_free(flow, false); + return err; + +err_unlock: + rcu_read_unlock(); +err_flow_free: + ovs_flow_free(flow, false); +err_kfree_skb: + kfree_skb(packet); +err: + return err; +} + +static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { + [OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN }, + [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED }, + [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, + [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG }, + [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 }, +}; + +static const struct genl_ops dp_packet_genl_ops[] = { + { .cmd = OVS_PACKET_CMD_EXECUTE, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = packet_policy, + .doit = ovs_packet_cmd_execute + } +}; + +static struct genl_family dp_packet_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_PACKET_FAMILY, + .version = OVS_PACKET_VERSION, + .maxattr = OVS_PACKET_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .ops = dp_packet_genl_ops, + .n_ops = ARRAY_SIZE(dp_packet_genl_ops), +}; + +static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, + struct ovs_dp_megaflow_stats *mega_stats) +{ + int i; + + memset(mega_stats, 0, sizeof(*mega_stats)); + + stats->n_flows = ovs_flow_tbl_count(&dp->table); + mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table); + + stats->n_hit = stats->n_missed = stats->n_lost = 0; + + for_each_possible_cpu(i) { + const struct dp_stats_percpu *percpu_stats; + struct dp_stats_percpu local_stats; + unsigned int start; + + percpu_stats = per_cpu_ptr(dp->stats_percpu, i); + + do { + start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); + local_stats = *percpu_stats; + } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); + + stats->n_hit += local_stats.n_hit; + stats->n_missed += local_stats.n_missed; + stats->n_lost += local_stats.n_lost; + mega_stats->n_mask_hit += local_stats.n_mask_hit; + } +} + +static bool should_fill_key(const struct sw_flow_id *sfid, uint32_t ufid_flags) +{ + return ovs_identifier_is_ufid(sfid) && + !(ufid_flags & OVS_UFID_F_OMIT_KEY); +} + +static bool should_fill_mask(uint32_t ufid_flags) +{ + return !(ufid_flags & OVS_UFID_F_OMIT_MASK); +} + +static bool should_fill_actions(uint32_t ufid_flags) +{ + return !(ufid_flags & OVS_UFID_F_OMIT_ACTIONS); +} + +static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts, + const struct sw_flow_id *sfid, + uint32_t ufid_flags) +{ + size_t len = NLMSG_ALIGN(sizeof(struct ovs_header)); + + /* OVS_FLOW_ATTR_UFID, or unmasked flow key as fallback + * see ovs_nla_put_identifier() + */ + if (sfid && ovs_identifier_is_ufid(sfid)) + len += nla_total_size(sfid->ufid_len); + else + len += nla_total_size(ovs_key_attr_size()); + + /* OVS_FLOW_ATTR_KEY */ + if (!sfid || should_fill_key(sfid, ufid_flags)) + len += nla_total_size(ovs_key_attr_size()); + + /* OVS_FLOW_ATTR_MASK */ + if (should_fill_mask(ufid_flags)) + len += nla_total_size(ovs_key_attr_size()); + + /* OVS_FLOW_ATTR_ACTIONS */ + if (should_fill_actions(ufid_flags)) + len += nla_total_size(acts->orig_len); + + return len + + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ + + nla_total_size(8); /* OVS_FLOW_ATTR_USED */ +} + +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow, + struct sk_buff *skb) +{ + struct ovs_flow_stats stats; + __be16 tcp_flags; + unsigned long used; + + ovs_flow_stats_get(flow, &stats, &used, &tcp_flags); + + if (used && + nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used))) + return -EMSGSIZE; + + if (stats.n_packets && + nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats)) + return -EMSGSIZE; + + if ((u8)ntohs(tcp_flags) && + nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags))) + return -EMSGSIZE; + + return 0; +} + +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow, + struct sk_buff *skb, int skb_orig_len) +{ + struct nlattr *start; + int err; + + /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if + * this is the first flow to be dumped into 'skb'. This is unusual for + * Netlink but individual action lists can be longer than + * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this. + * The userspace caller can always fetch the actions separately if it + * really wants them. (Most userspace callers in fact don't care.) + * + * This can only fail for dump operations because the skb is always + * properly sized for single flows. + */ + start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS); + if (start) { + const struct sw_flow_actions *sf_acts; + + sf_acts = rcu_dereference_ovsl(flow->sf_acts); + err = ovs_nla_put_actions(sf_acts->actions, + sf_acts->actions_len, skb); + + if (!err) + nla_nest_end(skb, start); + else { + if (skb_orig_len) + return err; + + nla_nest_cancel(skb, start); + } + } else if (skb_orig_len) { + return -EMSGSIZE; + } + + return 0; +} + +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, + struct sk_buff *skb, u32 portid, + u32 seq, u32 flags, u8 cmd, u32 ufid_flags) +{ + const int skb_orig_len = skb->len; + struct ovs_header *ovs_header; + int err; + + ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, + flags, cmd); + if (!ovs_header) + return -EMSGSIZE; + + ovs_header->dp_ifindex = dp_ifindex; + + err = ovs_nla_put_identifier(flow, skb); + if (err) + goto error; + + if (should_fill_key(&flow->id, ufid_flags)) { + err = ovs_nla_put_masked_key(flow, skb); + if (err) + goto error; + } + + if (should_fill_mask(ufid_flags)) { + err = ovs_nla_put_mask(flow, skb); + if (err) + goto error; + } + + err = ovs_flow_cmd_fill_stats(flow, skb); + if (err) + goto error; + + if (should_fill_actions(ufid_flags)) { + err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len); + if (err) + goto error; + } + + genlmsg_end(skb, ovs_header); + return 0; + +error: + genlmsg_cancel(skb, ovs_header); + return err; +} + +/* May not be called with RCU read lock. */ +static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts, + const struct sw_flow_id *sfid, + struct genl_info *info, + bool always, + uint32_t ufid_flags) +{ + struct sk_buff *skb; + size_t len; + + if (!always && !ovs_must_notify(&dp_flow_genl_family, info, 0)) + return NULL; + + len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags); + skb = genlmsg_new_unicast(len, info, GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOMEM); + + return skb; +} + +/* Called with ovs_mutex. */ +static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, + int dp_ifindex, + struct genl_info *info, u8 cmd, + bool always, u32 ufid_flags) +{ + struct sk_buff *skb; + int retval; + + skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts), + &flow->id, info, always, ufid_flags); + if (IS_ERR_OR_NULL(skb)) + return skb; + + retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb, + info->snd_portid, info->snd_seq, 0, + cmd, ufid_flags); + if (WARN_ON_ONCE(retval < 0)) { + kfree_skb(skb); + skb = ERR_PTR(retval); + } + return skb; +} + +static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct sw_flow *flow = NULL, *new_flow; + struct sw_flow_mask mask; + struct sk_buff *reply; + struct datapath *dp; + struct sw_flow_key key; + struct sw_flow_actions *acts; + struct sw_flow_match match; + u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); + int error; + bool log = !a[OVS_FLOW_ATTR_PROBE]; + + /* Must have key and actions. */ + error = -EINVAL; + if (!a[OVS_FLOW_ATTR_KEY]) { + OVS_NLERR(log, "Flow key attr not present in new flow."); + goto error; + } + if (!a[OVS_FLOW_ATTR_ACTIONS]) { + OVS_NLERR(log, "Flow actions attr not present in new flow."); + goto error; + } + + /* Most of the time we need to allocate a new flow, do it before + * locking. + */ + new_flow = ovs_flow_alloc(); + if (IS_ERR(new_flow)) { + error = PTR_ERR(new_flow); + goto error; + } + + /* Extract key. */ + ovs_match_init(&match, &key, &mask); + error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], + a[OVS_FLOW_ATTR_MASK], log); + if (error) + goto err_kfree_flow; + + ovs_flow_mask_key(&new_flow->key, &key, true, &mask); + + /* Extract flow identifier. */ + error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], + &key, log); + if (error) + goto err_kfree_flow; + + /* Validate actions. */ + error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], + &new_flow->key, &acts, log); + if (error) { + OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); + goto err_kfree_flow; + } + + reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false, + ufid_flags); + if (IS_ERR(reply)) { + error = PTR_ERR(reply); + goto err_kfree_acts; + } + + ovs_lock(); + dp = get_dp(net, ovs_header->dp_ifindex); + if (unlikely(!dp)) { + error = -ENODEV; + goto err_unlock_ovs; + } + + /* Check if this is a duplicate flow */ + if (ovs_identifier_is_ufid(&new_flow->id)) + flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id); + if (!flow) + flow = ovs_flow_tbl_lookup(&dp->table, &key); + if (likely(!flow)) { + rcu_assign_pointer(new_flow->sf_acts, acts); + + /* Put flow in bucket. */ + error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask); + if (unlikely(error)) { + acts = NULL; + goto err_unlock_ovs; + } + + if (unlikely(reply)) { + error = ovs_flow_cmd_fill_info(new_flow, + ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_NEW, + ufid_flags); + BUG_ON(error < 0); + } + ovs_unlock(); + } else { + struct sw_flow_actions *old_acts; + + /* Bail out if we're not allowed to modify an existing flow. + * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL + * because Generic Netlink treats the latter as a dump + * request. We also accept NLM_F_EXCL in case that bug ever + * gets fixed. + */ + if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE + | NLM_F_EXCL))) { + error = -EEXIST; + goto err_unlock_ovs; + } + /* The flow identifier has to be the same for flow updates. + * Look for any overlapping flow. + */ + if (unlikely(!ovs_flow_cmp(flow, &match))) { + if (ovs_identifier_is_key(&flow->id)) + flow = ovs_flow_tbl_lookup_exact(&dp->table, + &match); + else /* UFID matches but key is different */ + flow = NULL; + if (!flow) { + error = -ENOENT; + goto err_unlock_ovs; + } + } + /* Update actions. */ + old_acts = ovsl_dereference(flow->sf_acts); + rcu_assign_pointer(flow->sf_acts, acts); + + if (unlikely(reply)) { + error = ovs_flow_cmd_fill_info(flow, + ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_NEW, + ufid_flags); + BUG_ON(error < 0); + } + ovs_unlock(); + + ovs_nla_free_flow_actions_rcu(old_acts); + ovs_flow_free(new_flow, false); + } + + if (reply) + ovs_notify(&dp_flow_genl_family, reply, info); + return 0; + +err_unlock_ovs: + ovs_unlock(); + kfree_skb(reply); +err_kfree_acts: + ovs_nla_free_flow_actions(acts); +err_kfree_flow: + ovs_flow_free(new_flow, false); +error: + return error; +} + +/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */ +static struct sw_flow_actions *get_flow_actions(struct net *net, + const struct nlattr *a, + const struct sw_flow_key *key, + const struct sw_flow_mask *mask, + bool log) +{ + struct sw_flow_actions *acts; + struct sw_flow_key masked_key; + int error; + + ovs_flow_mask_key(&masked_key, key, true, mask); + error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log); + if (error) { + OVS_NLERR(log, + "Actions may not be safe on all matching packets"); + return ERR_PTR(error); + } + + return acts; +} + +static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct sw_flow_key key; + struct sw_flow *flow; + struct sw_flow_mask mask; + struct sk_buff *reply = NULL; + struct datapath *dp; + struct sw_flow_actions *old_acts = NULL, *acts = NULL; + struct sw_flow_match match; + struct sw_flow_id sfid; + u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); + int error; + bool log = !a[OVS_FLOW_ATTR_PROBE]; + bool ufid_present; + + /* Extract key. */ + error = -EINVAL; + if (!a[OVS_FLOW_ATTR_KEY]) { + OVS_NLERR(log, "Flow key attribute not present in set flow."); + goto error; + } + + ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); + ovs_match_init(&match, &key, &mask); + error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], + a[OVS_FLOW_ATTR_MASK], log); + if (error) + goto error; + + /* Validate actions. */ + if (a[OVS_FLOW_ATTR_ACTIONS]) { + acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key, + &mask, log); + if (IS_ERR(acts)) { + error = PTR_ERR(acts); + goto error; + } + + /* Can allocate before locking if have acts. */ + reply = ovs_flow_cmd_alloc_info(acts, &sfid, info, false, + ufid_flags); + if (IS_ERR(reply)) { + error = PTR_ERR(reply); + goto err_kfree_acts; + } + } + + ovs_lock(); + dp = get_dp(net, ovs_header->dp_ifindex); + if (unlikely(!dp)) { + error = -ENODEV; + goto err_unlock_ovs; + } + /* Check that the flow exists. */ + if (ufid_present) + flow = ovs_flow_tbl_lookup_ufid(&dp->table, &sfid); + else + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (unlikely(!flow)) { + error = -ENOENT; + goto err_unlock_ovs; + } + + /* Update actions, if present. */ + if (likely(acts)) { + old_acts = ovsl_dereference(flow->sf_acts); + rcu_assign_pointer(flow->sf_acts, acts); + + if (unlikely(reply)) { + error = ovs_flow_cmd_fill_info(flow, + ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_NEW, + ufid_flags); + BUG_ON(error < 0); + } + } else { + /* Could not alloc without acts before locking. */ + reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, + info, OVS_FLOW_CMD_NEW, false, + ufid_flags); + + if (IS_ERR(reply)) { + error = PTR_ERR(reply); + goto err_unlock_ovs; + } + } + + /* Clear stats. */ + if (a[OVS_FLOW_ATTR_CLEAR]) + ovs_flow_stats_clear(flow); + ovs_unlock(); + + if (reply) + ovs_notify(&dp_flow_genl_family, reply, info); + if (old_acts) + ovs_nla_free_flow_actions_rcu(old_acts); + + return 0; + +err_unlock_ovs: + ovs_unlock(); + kfree_skb(reply); +err_kfree_acts: + ovs_nla_free_flow_actions(acts); +error: + return error; +} + +static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct net *net = sock_net(skb->sk); + struct sw_flow_key key; + struct sk_buff *reply; + struct sw_flow *flow; + struct datapath *dp; + struct sw_flow_match match; + struct sw_flow_id ufid; + u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); + int err = 0; + bool log = !a[OVS_FLOW_ATTR_PROBE]; + bool ufid_present; + + ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); + if (a[OVS_FLOW_ATTR_KEY]) { + ovs_match_init(&match, &key, NULL); + err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, + log); + } else if (!ufid_present) { + OVS_NLERR(log, + "Flow get message rejected, Key attribute missing."); + err = -EINVAL; + } + if (err) + return err; + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto unlock; + } + + if (ufid_present) + flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid); + else + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (!flow) { + err = -ENOENT; + goto unlock; + } + + reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info, + OVS_FLOW_CMD_NEW, true, ufid_flags); + if (IS_ERR(reply)) { + err = PTR_ERR(reply); + goto unlock; + } + + ovs_unlock(); + return genlmsg_reply(reply, info); +unlock: + ovs_unlock(); + return err; +} + +static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct net *net = sock_net(skb->sk); + struct sw_flow_key key; + struct sk_buff *reply; + struct sw_flow *flow = NULL; + struct datapath *dp; + struct sw_flow_match match; + struct sw_flow_id ufid; + u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); + int err; + bool log = !a[OVS_FLOW_ATTR_PROBE]; + bool ufid_present; + + ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); + if (a[OVS_FLOW_ATTR_KEY]) { + ovs_match_init(&match, &key, NULL); + err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], + NULL, log); + if (unlikely(err)) + return err; + } + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (unlikely(!dp)) { + err = -ENODEV; + goto unlock; + } + + if (unlikely(!a[OVS_FLOW_ATTR_KEY] && !ufid_present)) { + err = ovs_flow_tbl_flush(&dp->table); + goto unlock; + } + + if (ufid_present) + flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid); + else + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (unlikely(!flow)) { + err = -ENOENT; + goto unlock; + } + + ovs_flow_tbl_remove(&dp->table, flow); + ovs_unlock(); + + reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts, + &flow->id, info, false, ufid_flags); + if (likely(reply)) { + if (likely(!IS_ERR(reply))) { + rcu_read_lock(); /*To keep RCU checker happy. */ + err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_DEL, + ufid_flags); + rcu_read_unlock(); + if (WARN_ON_ONCE(err < 0)) { + kfree_skb(reply); + goto out_free; + } + + ovs_notify(&dp_flow_genl_family, reply, info); + } else { + netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, PTR_ERR(reply)); + } + } + +out_free: + ovs_flow_free(flow, true); + return 0; +unlock: + ovs_unlock(); + return err; +} + +static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *a[__OVS_FLOW_ATTR_MAX]; + struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); + struct table_instance *ti; + struct datapath *dp; + u32 ufid_flags; + int err; + + err = genlmsg_parse(cb->nlh, &dp_flow_genl_family, a, + OVS_FLOW_ATTR_MAX, flow_policy); + if (err) + return err; + ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); + + rcu_read_lock(); + dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + rcu_read_unlock(); + return -ENODEV; + } + + ti = rcu_dereference(dp->table.ti); + for (;;) { + struct sw_flow *flow; + u32 bucket, obj; + + bucket = cb->args[0]; + obj = cb->args[1]; + flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj); + if (!flow) + break; + + if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + OVS_FLOW_CMD_NEW, ufid_flags) < 0) + break; + + cb->args[0] = bucket; + cb->args[1] = obj; + } + rcu_read_unlock(); + return skb->len; +} + +static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { + [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_MASK] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, + [OVS_FLOW_ATTR_PROBE] = { .type = NLA_FLAG }, + [OVS_FLOW_ATTR_UFID] = { .type = NLA_UNSPEC, .len = 1 }, + [OVS_FLOW_ATTR_UFID_FLAGS] = { .type = NLA_U32 }, +}; + +static const struct genl_ops dp_flow_genl_ops[] = { + { .cmd = OVS_FLOW_CMD_NEW, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = flow_policy, + .doit = ovs_flow_cmd_new + }, + { .cmd = OVS_FLOW_CMD_DEL, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = flow_policy, + .doit = ovs_flow_cmd_del + }, + { .cmd = OVS_FLOW_CMD_GET, + .flags = 0, /* OK for unprivileged users. */ + .policy = flow_policy, + .doit = ovs_flow_cmd_get, + .dumpit = ovs_flow_cmd_dump + }, + { .cmd = OVS_FLOW_CMD_SET, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = flow_policy, + .doit = ovs_flow_cmd_set, + }, +}; + +static struct genl_family dp_flow_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_FLOW_FAMILY, + .version = OVS_FLOW_VERSION, + .maxattr = OVS_FLOW_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .ops = dp_flow_genl_ops, + .n_ops = ARRAY_SIZE(dp_flow_genl_ops), + .mcgrps = &ovs_dp_flow_multicast_group, + .n_mcgrps = 1, +}; + +static size_t ovs_dp_cmd_msg_size(void) +{ + size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header)); + + msgsize += nla_total_size(IFNAMSIZ); + msgsize += nla_total_size(sizeof(struct ovs_dp_stats)); + msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats)); + msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ + + return msgsize; +} + +/* Called with ovs_mutex. */ +static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, + u32 portid, u32 seq, u32 flags, u8 cmd) +{ + struct ovs_header *ovs_header; + struct ovs_dp_stats dp_stats; + struct ovs_dp_megaflow_stats dp_megaflow_stats; + int err; + + ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family, + flags, cmd); + if (!ovs_header) + goto error; + + ovs_header->dp_ifindex = get_dpifindex(dp); + + err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp)); + if (err) + goto nla_put_failure; + + get_dp_stats(dp, &dp_stats, &dp_megaflow_stats); + if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), + &dp_stats)) + goto nla_put_failure; + + if (nla_put(skb, OVS_DP_ATTR_MEGAFLOW_STATS, + sizeof(struct ovs_dp_megaflow_stats), + &dp_megaflow_stats)) + goto nla_put_failure; + + if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) + goto nla_put_failure; + + genlmsg_end(skb, ovs_header); + return 0; + +nla_put_failure: + genlmsg_cancel(skb, ovs_header); +error: + return -EMSGSIZE; +} + +static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info) +{ + return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL); +} + +/* Called with rcu_read_lock or ovs_mutex. */ +static struct datapath *lookup_datapath(struct net *net, + const struct ovs_header *ovs_header, + struct nlattr *a[OVS_DP_ATTR_MAX + 1]) +{ + struct datapath *dp; + + if (!a[OVS_DP_ATTR_NAME]) + dp = get_dp(net, ovs_header->dp_ifindex); + else { + struct vport *vport; + + vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME])); + dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL; + } + return dp ? dp : ERR_PTR(-ENODEV); +} + +static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info) +{ + struct datapath *dp; + + dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + if (IS_ERR(dp)) + return; + + WARN(dp->user_features, "Dropping previously announced user features\n"); + dp->user_features = 0; +} + +static void ovs_dp_change(struct datapath *dp, struct nlattr *a[]) +{ + if (a[OVS_DP_ATTR_USER_FEATURES]) + dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); +} + +static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct vport_parms parms; + struct sk_buff *reply; + struct datapath *dp; + struct vport *vport; + struct ovs_net *ovs_net; + int err, i; + + err = -EINVAL; + if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) + goto err; + + reply = ovs_dp_cmd_alloc_info(info); + if (!reply) + return -ENOMEM; + + err = -ENOMEM; + dp = kzalloc(sizeof(*dp), GFP_KERNEL); + if (dp == NULL) + goto err_free_reply; + + ovs_dp_set_net(dp, sock_net(skb->sk)); + + /* Allocate table. */ + err = ovs_flow_tbl_init(&dp->table); + if (err) + goto err_free_dp; + + dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); + if (!dp->stats_percpu) { + err = -ENOMEM; + goto err_destroy_table; + } + + dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!dp->ports) { + err = -ENOMEM; + goto err_destroy_percpu; + } + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(&dp->ports[i]); + + /* Set up our datapath device. */ + parms.name = nla_data(a[OVS_DP_ATTR_NAME]); + parms.type = OVS_VPORT_TYPE_INTERNAL; + parms.options = NULL; + parms.dp = dp; + parms.port_no = OVSP_LOCAL; + parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID]; + + ovs_dp_change(dp, a); + + /* So far only local changes have been made, now need the lock. */ + ovs_lock(); + + vport = new_vport(&parms); + if (IS_ERR(vport)) { + err = PTR_ERR(vport); + if (err == -EBUSY) + err = -EEXIST; + + if (err == -EEXIST) { + /* An outdated user space instance that does not understand + * the concept of user_features has attempted to create a new + * datapath and is likely to reuse it. Drop all user features. + */ + if (info->genlhdr->version < OVS_DP_VER_FEATURES) + ovs_dp_reset_user_features(skb, info); + } + + goto err_destroy_ports_array; + } + + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_NEW); + BUG_ON(err < 0); + + ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id); + list_add_tail_rcu(&dp->list_node, &ovs_net->dps); + + ovs_unlock(); + + ovs_notify(&dp_datapath_genl_family, reply, info); + return 0; + +err_destroy_ports_array: + ovs_unlock(); + kfree(dp->ports); +err_destroy_percpu: + free_percpu(dp->stats_percpu); +err_destroy_table: + ovs_flow_tbl_destroy(&dp->table); +err_free_dp: + kfree(dp); +err_free_reply: + kfree_skb(reply); +err: + return err; +} + +/* Called with ovs_mutex. */ +static void __dp_destroy(struct datapath *dp) +{ + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + struct vport *vport; + struct hlist_node *n; + + hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) + if (vport->port_no != OVSP_LOCAL) + ovs_dp_detach_port(vport); + } + + list_del_rcu(&dp->list_node); + + /* OVSP_LOCAL is datapath internal port. We need to make sure that + * all ports in datapath are destroyed first before freeing datapath. + */ + ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); + + /* RCU destroy the flow table */ + call_rcu(&dp->rcu, destroy_dp_rcu); +} + +static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *reply; + struct datapath *dp; + int err; + + reply = ovs_dp_cmd_alloc_info(info); + if (!reply) + return -ENOMEM; + + ovs_lock(); + dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + err = PTR_ERR(dp); + if (IS_ERR(dp)) + goto err_unlock_free; + + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_DEL); + BUG_ON(err < 0); + + __dp_destroy(dp); + ovs_unlock(); + + ovs_notify(&dp_datapath_genl_family, reply, info); + + return 0; + +err_unlock_free: + ovs_unlock(); + kfree_skb(reply); + return err; +} + +static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *reply; + struct datapath *dp; + int err; + + reply = ovs_dp_cmd_alloc_info(info); + if (!reply) + return -ENOMEM; + + ovs_lock(); + dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + err = PTR_ERR(dp); + if (IS_ERR(dp)) + goto err_unlock_free; + + ovs_dp_change(dp, info->attrs); + + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_NEW); + BUG_ON(err < 0); + + ovs_unlock(); + ovs_notify(&dp_datapath_genl_family, reply, info); + + return 0; + +err_unlock_free: + ovs_unlock(); + kfree_skb(reply); + return err; +} + +static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *reply; + struct datapath *dp; + int err; + + reply = ovs_dp_cmd_alloc_info(info); + if (!reply) + return -ENOMEM; + + ovs_lock(); + dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + if (IS_ERR(dp)) { + err = PTR_ERR(dp); + goto err_unlock_free; + } + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_NEW); + BUG_ON(err < 0); + ovs_unlock(); + + return genlmsg_reply(reply, info); + +err_unlock_free: + ovs_unlock(); + kfree_skb(reply); + return err; +} + +static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id); + struct datapath *dp; + int skip = cb->args[0]; + int i = 0; + + ovs_lock(); + list_for_each_entry(dp, &ovs_net->dps, list_node) { + if (i >= skip && + ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + OVS_DP_CMD_NEW) < 0) + break; + i++; + } + ovs_unlock(); + + cb->args[0] = i; + + return skb->len; +} + +static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { + [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, + [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, + [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, +}; + +static const struct genl_ops dp_datapath_genl_ops[] = { + { .cmd = OVS_DP_CMD_NEW, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = datapath_policy, + .doit = ovs_dp_cmd_new + }, + { .cmd = OVS_DP_CMD_DEL, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = datapath_policy, + .doit = ovs_dp_cmd_del + }, + { .cmd = OVS_DP_CMD_GET, + .flags = 0, /* OK for unprivileged users. */ + .policy = datapath_policy, + .doit = ovs_dp_cmd_get, + .dumpit = ovs_dp_cmd_dump + }, + { .cmd = OVS_DP_CMD_SET, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = datapath_policy, + .doit = ovs_dp_cmd_set, + }, +}; + +static struct genl_family dp_datapath_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_DATAPATH_FAMILY, + .version = OVS_DATAPATH_VERSION, + .maxattr = OVS_DP_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .ops = dp_datapath_genl_ops, + .n_ops = ARRAY_SIZE(dp_datapath_genl_ops), + .mcgrps = &ovs_dp_datapath_multicast_group, + .n_mcgrps = 1, +}; + +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, + u32 portid, u32 seq, u32 flags, u8 cmd) +{ + struct ovs_header *ovs_header; + struct ovs_vport_stats vport_stats; + int err; + + ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family, + flags, cmd); + if (!ovs_header) + return -EMSGSIZE; + + ovs_header->dp_ifindex = get_dpifindex(vport->dp); + + if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || + nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || + nla_put_string(skb, OVS_VPORT_ATTR_NAME, + ovs_vport_name(vport))) + goto nla_put_failure; + + ovs_vport_get_stats(vport, &vport_stats); + if (nla_put(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats), + &vport_stats)) + goto nla_put_failure; + + if (ovs_vport_get_upcall_portids(vport, skb)) + goto nla_put_failure; + + err = ovs_vport_get_options(vport, skb); + if (err == -EMSGSIZE) + goto error; + + genlmsg_end(skb, ovs_header); + return 0; + +nla_put_failure: + err = -EMSGSIZE; +error: + genlmsg_cancel(skb, ovs_header); + return err; +} + +static struct sk_buff *ovs_vport_cmd_alloc_info(void) +{ + return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +} + +/* Called with ovs_mutex, only via ovs_dp_notify_wq(). */ +struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid, + u32 seq, u8 cmd) +{ + struct sk_buff *skb; + int retval; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return ERR_PTR(-ENOMEM); + + retval = ovs_vport_cmd_fill_info(vport, skb, portid, seq, 0, cmd); + BUG_ON(retval < 0); + + return skb; +} + +/* Called with ovs_mutex or RCU read lock. */ +static struct vport *lookup_vport(struct net *net, + const struct ovs_header *ovs_header, + struct nlattr *a[OVS_VPORT_ATTR_MAX + 1]) +{ + struct datapath *dp; + struct vport *vport; + + if (a[OVS_VPORT_ATTR_NAME]) { + vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME])); + if (!vport) + return ERR_PTR(-ENODEV); + if (ovs_header->dp_ifindex && + ovs_header->dp_ifindex != get_dpifindex(vport->dp)) + return ERR_PTR(-ENODEV); + return vport; + } else if (a[OVS_VPORT_ATTR_PORT_NO]) { + u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]); + + if (port_no >= DP_MAX_PORTS) + return ERR_PTR(-EFBIG); + + dp = get_dp(net, ovs_header->dp_ifindex); + if (!dp) + return ERR_PTR(-ENODEV); + + vport = ovs_vport_ovsl_rcu(dp, port_no); + if (!vport) + return ERR_PTR(-ENODEV); + return vport; + } else + return ERR_PTR(-EINVAL); +} + +static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct vport_parms parms; + struct sk_buff *reply; + struct vport *vport; + struct datapath *dp; + u32 port_no; + int err; + + if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || + !a[OVS_VPORT_ATTR_UPCALL_PID]) + return -EINVAL; + + port_no = a[OVS_VPORT_ATTR_PORT_NO] + ? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0; + if (port_no >= DP_MAX_PORTS) + return -EFBIG; + + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + + ovs_lock(); +restart: + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + err = -ENODEV; + if (!dp) + goto exit_unlock_free; + + if (port_no) { + vport = ovs_vport_ovsl(dp, port_no); + err = -EBUSY; + if (vport) + goto exit_unlock_free; + } else { + for (port_no = 1; ; port_no++) { + if (port_no >= DP_MAX_PORTS) { + err = -EFBIG; + goto exit_unlock_free; + } + vport = ovs_vport_ovsl(dp, port_no); + if (!vport) + break; + } + } + + parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]); + parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]); + parms.options = a[OVS_VPORT_ATTR_OPTIONS]; + parms.dp = dp; + parms.port_no = port_no; + parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID]; + + vport = new_vport(&parms); + err = PTR_ERR(vport); + if (IS_ERR(vport)) { + if (err == -EAGAIN) + goto restart; + goto exit_unlock_free; + } + + err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, + info->snd_seq, 0, OVS_VPORT_CMD_NEW); + BUG_ON(err < 0); + ovs_unlock(); + + ovs_notify(&dp_vport_genl_family, reply, info); + return 0; + +exit_unlock_free: + ovs_unlock(); + kfree_skb(reply); + return err; +} + +static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct sk_buff *reply; + struct vport *vport; + int err; + + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + + ovs_lock(); + vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); + err = PTR_ERR(vport); + if (IS_ERR(vport)) + goto exit_unlock_free; + + if (a[OVS_VPORT_ATTR_TYPE] && + nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) { + err = -EINVAL; + goto exit_unlock_free; + } + + if (a[OVS_VPORT_ATTR_OPTIONS]) { + err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]); + if (err) + goto exit_unlock_free; + } + + + if (a[OVS_VPORT_ATTR_UPCALL_PID]) { + struct nlattr *ids = a[OVS_VPORT_ATTR_UPCALL_PID]; + + err = ovs_vport_set_upcall_portids(vport, ids); + if (err) + goto exit_unlock_free; + } + + err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, + info->snd_seq, 0, OVS_VPORT_CMD_NEW); + BUG_ON(err < 0); + + ovs_unlock(); + ovs_notify(&dp_vport_genl_family, reply, info); + return 0; + +exit_unlock_free: + ovs_unlock(); + kfree_skb(reply); + return err; +} + +static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct sk_buff *reply; + struct vport *vport; + int err; + + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + + ovs_lock(); + vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); + err = PTR_ERR(vport); + if (IS_ERR(vport)) + goto exit_unlock_free; + + if (vport->port_no == OVSP_LOCAL) { + err = -EINVAL; + goto exit_unlock_free; + } + + err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, + info->snd_seq, 0, OVS_VPORT_CMD_DEL); + BUG_ON(err < 0); + ovs_dp_detach_port(vport); + ovs_unlock(); + + ovs_notify(&dp_vport_genl_family, reply, info); + return 0; + +exit_unlock_free: + ovs_unlock(); + kfree_skb(reply); + return err; +} + +static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = info->userhdr; + struct sk_buff *reply; + struct vport *vport; + int err; + + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + + rcu_read_lock(); + vport = lookup_vport(sock_net(skb->sk), ovs_header, a); + err = PTR_ERR(vport); + if (IS_ERR(vport)) + goto exit_unlock_free; + err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, + info->snd_seq, 0, OVS_VPORT_CMD_NEW); + BUG_ON(err < 0); + rcu_read_unlock(); + + return genlmsg_reply(reply, info); + +exit_unlock_free: + rcu_read_unlock(); + kfree_skb(reply); + return err; +} + +static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); + struct datapath *dp; + int bucket = cb->args[0], skip = cb->args[1]; + int i, j = 0; + + rcu_read_lock(); + dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + rcu_read_unlock(); + return -ENODEV; + } + for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) { + struct vport *vport; + + j = 0; + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { + if (j >= skip && + ovs_vport_cmd_fill_info(vport, skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + OVS_VPORT_CMD_NEW) < 0) + goto out; + + j++; + } + skip = 0; + } +out: + rcu_read_unlock(); + + cb->args[0] = i; + cb->args[1] = j; + + return skb->len; +} + +static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { + [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, + [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, + [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_UNSPEC }, + [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, +}; + +static const struct genl_ops dp_vport_genl_ops[] = { + { .cmd = OVS_VPORT_CMD_NEW, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = vport_policy, + .doit = ovs_vport_cmd_new + }, + { .cmd = OVS_VPORT_CMD_DEL, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = vport_policy, + .doit = ovs_vport_cmd_del + }, + { .cmd = OVS_VPORT_CMD_GET, + .flags = 0, /* OK for unprivileged users. */ + .policy = vport_policy, + .doit = ovs_vport_cmd_get, + .dumpit = ovs_vport_cmd_dump + }, + { .cmd = OVS_VPORT_CMD_SET, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .policy = vport_policy, + .doit = ovs_vport_cmd_set, + }, +}; + +struct genl_family dp_vport_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = sizeof(struct ovs_header), + .name = OVS_VPORT_FAMILY, + .version = OVS_VPORT_VERSION, + .maxattr = OVS_VPORT_ATTR_MAX, + .netnsok = true, + .parallel_ops = true, + .ops = dp_vport_genl_ops, + .n_ops = ARRAY_SIZE(dp_vport_genl_ops), + .mcgrps = &ovs_dp_vport_multicast_group, + .n_mcgrps = 1, +}; + +static struct genl_family * const dp_genl_families[] = { + &dp_datapath_genl_family, + &dp_vport_genl_family, + &dp_flow_genl_family, + &dp_packet_genl_family, +}; + +static void dp_unregister_genl(int n_families) +{ + int i; + + for (i = 0; i < n_families; i++) + genl_unregister_family(dp_genl_families[i]); +} + +static int dp_register_genl(void) +{ + int err; + int i; + + for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) { + + err = genl_register_family(dp_genl_families[i]); + if (err) + goto error; + } + + return 0; + +error: + dp_unregister_genl(i); + return err; +} + +static int __net_init ovs_init_net(struct net *net) +{ + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + INIT_LIST_HEAD(&ovs_net->dps); + INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); + ovs_ct_init(net); + return 0; +} + +static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, + struct list_head *head) +{ + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + struct datapath *dp; + + list_for_each_entry(dp, &ovs_net->dps, list_node) { + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + struct vport *vport; + + hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) { + if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL) + continue; + + if (dev_net(vport->dev) == dnet) + list_add(&vport->detach_list, head); + } + } + } +} + +static void __net_exit ovs_exit_net(struct net *dnet) +{ + struct datapath *dp, *dp_next; + struct ovs_net *ovs_net = net_generic(dnet, ovs_net_id); + struct vport *vport, *vport_next; + struct net *net; + LIST_HEAD(head); + + ovs_ct_exit(dnet); + ovs_lock(); + list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) + __dp_destroy(dp); + + rtnl_lock(); + for_each_net(net) + list_vports_from_net(net, dnet, &head); + rtnl_unlock(); + + /* Detach all vports from given namespace. */ + list_for_each_entry_safe(vport, vport_next, &head, detach_list) { + list_del(&vport->detach_list); + ovs_dp_detach_port(vport); + } + + ovs_unlock(); + + cancel_work_sync(&ovs_net->dp_notify_work); +} + +static struct pernet_operations ovs_net_ops = { + .init = ovs_init_net, + .exit = ovs_exit_net, + .id = &ovs_net_id, + .size = sizeof(struct ovs_net), +}; + +static int __init dp_init(void) +{ + int err; + + BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); + + pr_info("Open vSwitch switching datapath\n"); + + err = action_fifos_init(); + if (err) + goto error; + + err = ovs_internal_dev_rtnl_link_register(); + if (err) + goto error_action_fifos_exit; + + err = ovs_flow_init(); + if (err) + goto error_unreg_rtnl_link; + + err = ovs_vport_init(); + if (err) + goto error_flow_exit; + + err = register_pernet_device(&ovs_net_ops); + if (err) + goto error_vport_exit; + + err = register_netdevice_notifier(&ovs_dp_device_notifier); + if (err) + goto error_netns_exit; + + err = ovs_netdev_init(); + if (err) + goto error_unreg_notifier; + + err = dp_register_genl(); + if (err < 0) + goto error_unreg_netdev; + + return 0; + +error_unreg_netdev: + ovs_netdev_exit(); +error_unreg_notifier: + unregister_netdevice_notifier(&ovs_dp_device_notifier); +error_netns_exit: + unregister_pernet_device(&ovs_net_ops); +error_vport_exit: + ovs_vport_exit(); +error_flow_exit: + ovs_flow_exit(); +error_unreg_rtnl_link: + ovs_internal_dev_rtnl_link_unregister(); +error_action_fifos_exit: + action_fifos_exit(); +error: + return err; +} + +static void dp_cleanup(void) +{ + dp_unregister_genl(ARRAY_SIZE(dp_genl_families)); + ovs_netdev_exit(); + unregister_netdevice_notifier(&ovs_dp_device_notifier); + unregister_pernet_device(&ovs_net_ops); + rcu_barrier(); + ovs_vport_exit(); + ovs_flow_exit(); + ovs_internal_dev_rtnl_link_unregister(); + action_fifos_exit(); +} + +module_init(dp_init); +module_exit(dp_cleanup); + +MODULE_DESCRIPTION("Open vSwitch switching datapath"); +MODULE_LICENSE("GPL"); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h new file mode 100644 index 000000000000..67bdecd9fdc1 --- /dev/null +++ b/net/openvswitch/datapath.h @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef DATAPATH_H +#define DATAPATH_H 1 + +#include <asm/page.h> +#include <linux/kernel.h> +#include <linux/mutex.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <linux/u64_stats_sync.h> +#include <net/ip_tunnels.h> + +#include "conntrack.h" +#include "flow.h" +#include "flow_table.h" + +#define DP_MAX_PORTS USHRT_MAX +#define DP_VPORT_HASH_BUCKETS 1024 + +#define SAMPLE_ACTION_DEPTH 3 + +/** + * struct dp_stats_percpu - per-cpu packet processing statistics for a given + * datapath. + * @n_hit: Number of received packets for which a matching flow was found in + * the flow table. + * @n_miss: Number of received packets that had no matching flow in the flow + * table. The sum of @n_hit and @n_miss is the number of packets that have + * been received by the datapath. + * @n_lost: Number of received packets that had no matching flow in the flow + * table that could not be sent to userspace (normally due to an overflow in + * one of the datapath's queues). + * @n_mask_hit: Number of masks looked up for flow match. + * @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked + * up per packet. + */ +struct dp_stats_percpu { + u64 n_hit; + u64 n_missed; + u64 n_lost; + u64 n_mask_hit; + struct u64_stats_sync syncp; +}; + +/** + * struct datapath - datapath for flow-based packet switching + * @rcu: RCU callback head for deferred destruction. + * @list_node: Element in global 'dps' list. + * @table: flow table. + * @ports: Hash table for ports. %OVSP_LOCAL port always exists. Protected by + * ovs_mutex and RCU. + * @stats_percpu: Per-CPU datapath statistics. + * @net: Reference to net namespace. + * + * Context: See the comment on locking at the top of datapath.c for additional + * locking information. + */ +struct datapath { + struct rcu_head rcu; + struct list_head list_node; + + /* Flow table. */ + struct flow_table table; + + /* Switch ports. */ + struct hlist_head *ports; + + /* Stats. */ + struct dp_stats_percpu __percpu *stats_percpu; + + /* Network namespace ref. */ + possible_net_t net; + + u32 user_features; +}; + +/** + * struct ovs_skb_cb - OVS data in skb CB + * @input_vport: The original vport packet came in on. This value is cached + * when a packet is received by OVS. + * @mru: The maximum received fragement size; 0 if the packet is not + * fragmented. + */ +struct ovs_skb_cb { + struct vport *input_vport; + u16 mru; +}; +#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) + +/** + * struct dp_upcall - metadata to include with a packet to send to userspace + * @cmd: One of %OVS_PACKET_CMD_*. + * @userdata: If nonnull, its variable-length value is passed to userspace as + * %OVS_PACKET_ATTR_USERDATA. + * @portid: Netlink portid to which packet should be sent. If @portid is 0 + * then no packet is sent and the packet is accounted in the datapath's @n_lost + * counter. + * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY. + * @mru: If not zero, Maximum received IP fragment size. + */ +struct dp_upcall_info { + struct ip_tunnel_info *egress_tun_info; + const struct nlattr *userdata; + const struct nlattr *actions; + int actions_len; + u32 portid; + u8 cmd; + u16 mru; +}; + +/** + * struct ovs_net - Per net-namespace data for ovs. + * @dps: List of datapaths to enable dumping them all out. + * Protected by genl_mutex. + */ +struct ovs_net { + struct list_head dps; + struct work_struct dp_notify_work; + + /* Module reference for configuring conntrack. */ + bool xt_label; +}; + +extern int ovs_net_id; +void ovs_lock(void); +void ovs_unlock(void); + +#ifdef CONFIG_LOCKDEP +int lockdep_ovsl_is_held(void); +#else +#define lockdep_ovsl_is_held() 1 +#endif + +#define ASSERT_OVSL() WARN_ON(!lockdep_ovsl_is_held()) +#define ovsl_dereference(p) \ + rcu_dereference_protected(p, lockdep_ovsl_is_held()) +#define rcu_dereference_ovsl(p) \ + rcu_dereference_check(p, lockdep_ovsl_is_held()) + +static inline struct net *ovs_dp_get_net(const struct datapath *dp) +{ + return read_pnet(&dp->net); +} + +static inline void ovs_dp_set_net(struct datapath *dp, struct net *net) +{ + write_pnet(&dp->net, net); +} + +struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no); + +static inline struct vport *ovs_vport_rcu(const struct datapath *dp, int port_no) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return ovs_lookup_vport(dp, port_no); +} + +static inline struct vport *ovs_vport_ovsl_rcu(const struct datapath *dp, int port_no) +{ + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); + return ovs_lookup_vport(dp, port_no); +} + +static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_no) +{ + ASSERT_OVSL(); + return ovs_lookup_vport(dp, port_no); +} + +extern struct notifier_block ovs_dp_device_notifier; +extern struct genl_family dp_vport_genl_family; + +void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key); +void ovs_dp_detach_port(struct vport *); +int ovs_dp_upcall(struct datapath *, struct sk_buff *, + const struct sw_flow_key *, const struct dp_upcall_info *); + +const char *ovs_dp_name(const struct datapath *dp); +struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, + u8 cmd); + +int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_actions *, struct sw_flow_key *); + +void ovs_dp_notify_wq(struct work_struct *work); + +int action_fifos_init(void); +void action_fifos_exit(void); + +/* 'KEY' must not have any bits set outside of the 'MASK' */ +#define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK))) +#define OVS_SET_MASKED(OLD, KEY, MASK) ((OLD) = OVS_MASKED(OLD, KEY, MASK)) + +#define OVS_NLERR(logging_allowed, fmt, ...) \ +do { \ + if (logging_allowed && net_ratelimit()) \ + pr_info("netlink: " fmt "\n", ##__VA_ARGS__); \ +} while (0) +#endif /* datapath.h */ diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c new file mode 100644 index 000000000000..653d073bae45 --- /dev/null +++ b/net/openvswitch/dp_notify.c @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include <linux/netdevice.h> +#include <net/genetlink.h> +#include <net/netns/generic.h> + +#include "datapath.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + +static void dp_detach_port_notify(struct vport *vport) +{ + struct sk_buff *notify; + struct datapath *dp; + + dp = vport->dp; + notify = ovs_vport_cmd_build_info(vport, 0, 0, + OVS_VPORT_CMD_DEL); + ovs_dp_detach_port(vport); + if (IS_ERR(notify)) { + genl_set_err(&dp_vport_genl_family, ovs_dp_get_net(dp), 0, + 0, PTR_ERR(notify)); + return; + } + + genlmsg_multicast_netns(&dp_vport_genl_family, + ovs_dp_get_net(dp), notify, 0, + 0, GFP_KERNEL); +} + +void ovs_dp_notify_wq(struct work_struct *work) +{ + struct ovs_net *ovs_net = container_of(work, struct ovs_net, dp_notify_work); + struct datapath *dp; + + ovs_lock(); + list_for_each_entry(dp, &ovs_net->dps, list_node) { + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + struct vport *vport; + struct hlist_node *n; + + hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { + if (vport->ops->type == OVS_VPORT_TYPE_INTERNAL) + continue; + + if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH)) + dp_detach_port_notify(vport); + } + } + } + ovs_unlock(); +} + +static int dp_device_event(struct notifier_block *unused, unsigned long event, + void *ptr) +{ + struct ovs_net *ovs_net; + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct vport *vport = NULL; + + if (!ovs_is_internal_dev(dev)) + vport = ovs_netdev_get_vport(dev); + + if (!vport) + return NOTIFY_DONE; + + if (event == NETDEV_UNREGISTER) { + /* upper_dev_unlink and decrement promisc immediately */ + ovs_netdev_detach_dev(vport); + + /* schedule vport destroy, dev_put and genl notification */ + ovs_net = net_generic(dev_net(dev), ovs_net_id); + queue_work(system_wq, &ovs_net->dp_notify_work); + } + + return NOTIFY_DONE; +} + +struct notifier_block ovs_dp_device_notifier = { + .notifier_call = dp_device_event +}; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c new file mode 100644 index 000000000000..0ea128eeeab2 --- /dev/null +++ b/net/openvswitch/flow.c @@ -0,0 +1,745 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include <linux/uaccess.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <net/llc_pdu.h> +#include <linux/kernel.h> +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/llc.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/rcupdate.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/mpls.h> +#include <linux/sctp.h> +#include <linux/smp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> +#include <linux/rculist.h> +#include <net/ip.h> +#include <net/ip_tunnels.h> +#include <net/ipv6.h> +#include <net/mpls.h> +#include <net/ndisc.h> + +#include "conntrack.h" +#include "datapath.h" +#include "flow.h" +#include "flow_netlink.h" +#include "vport.h" + +u64 ovs_flow_used_time(unsigned long flow_jiffies) +{ + struct timespec cur_ts; + u64 cur_ms, idle_ms; + + ktime_get_ts(&cur_ts); + idle_ms = jiffies_to_msecs(jiffies - flow_jiffies); + cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC + + cur_ts.tv_nsec / NSEC_PER_MSEC; + + return cur_ms - idle_ms; +} + +#define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF)) + +void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, + const struct sk_buff *skb) +{ + struct flow_stats *stats; + int node = numa_node_id(); + int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); + + stats = rcu_dereference(flow->stats[node]); + + /* Check if already have node-specific stats. */ + if (likely(stats)) { + spin_lock(&stats->lock); + /* Mark if we write on the pre-allocated stats. */ + if (node == 0 && unlikely(flow->stats_last_writer != node)) + flow->stats_last_writer = node; + } else { + stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */ + spin_lock(&stats->lock); + + /* If the current NUMA-node is the only writer on the + * pre-allocated stats keep using them. + */ + if (unlikely(flow->stats_last_writer != node)) { + /* A previous locker may have already allocated the + * stats, so we need to check again. If node-specific + * stats were already allocated, we update the pre- + * allocated stats as we have already locked them. + */ + if (likely(flow->stats_last_writer != NUMA_NO_NODE) + && likely(!rcu_access_pointer(flow->stats[node]))) { + /* Try to allocate node-specific stats. */ + struct flow_stats *new_stats; + + new_stats = + kmem_cache_alloc_node(flow_stats_cache, + GFP_NOWAIT | + __GFP_THISNODE | + __GFP_NOWARN | + __GFP_NOMEMALLOC, + node); + if (likely(new_stats)) { + new_stats->used = jiffies; + new_stats->packet_count = 1; + new_stats->byte_count = len; + new_stats->tcp_flags = tcp_flags; + spin_lock_init(&new_stats->lock); + + rcu_assign_pointer(flow->stats[node], + new_stats); + goto unlock; + } + } + flow->stats_last_writer = node; + } + } + + stats->used = jiffies; + stats->packet_count++; + stats->byte_count += len; + stats->tcp_flags |= tcp_flags; +unlock: + spin_unlock(&stats->lock); +} + +/* Must be called with rcu_read_lock or ovs_mutex. */ +void ovs_flow_stats_get(const struct sw_flow *flow, + struct ovs_flow_stats *ovs_stats, + unsigned long *used, __be16 *tcp_flags) +{ + int node; + + *used = 0; + *tcp_flags = 0; + memset(ovs_stats, 0, sizeof(*ovs_stats)); + + for_each_node(node) { + struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[node]); + + if (stats) { + /* Local CPU may write on non-local stats, so we must + * block bottom-halves here. + */ + spin_lock_bh(&stats->lock); + if (!*used || time_after(stats->used, *used)) + *used = stats->used; + *tcp_flags |= stats->tcp_flags; + ovs_stats->n_packets += stats->packet_count; + ovs_stats->n_bytes += stats->byte_count; + spin_unlock_bh(&stats->lock); + } + } +} + +/* Called with ovs_mutex. */ +void ovs_flow_stats_clear(struct sw_flow *flow) +{ + int node; + + for_each_node(node) { + struct flow_stats *stats = ovsl_dereference(flow->stats[node]); + + if (stats) { + spin_lock_bh(&stats->lock); + stats->used = 0; + stats->packet_count = 0; + stats->byte_count = 0; + stats->tcp_flags = 0; + spin_unlock_bh(&stats->lock); + } + } +} + +static int check_header(struct sk_buff *skb, int len) +{ + if (unlikely(skb->len < len)) + return -EINVAL; + if (unlikely(!pskb_may_pull(skb, len))) + return -ENOMEM; + return 0; +} + +static bool arphdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_network_offset(skb) + + sizeof(struct arp_eth_header)); +} + +static int check_iphdr(struct sk_buff *skb) +{ + unsigned int nh_ofs = skb_network_offset(skb); + unsigned int ip_len; + int err; + + err = check_header(skb, nh_ofs + sizeof(struct iphdr)); + if (unlikely(err)) + return err; + + ip_len = ip_hdrlen(skb); + if (unlikely(ip_len < sizeof(struct iphdr) || + skb->len < nh_ofs + ip_len)) + return -EINVAL; + + skb_set_transport_header(skb, nh_ofs + ip_len); + return 0; +} + +static bool tcphdr_ok(struct sk_buff *skb) +{ + int th_ofs = skb_transport_offset(skb); + int tcp_len; + + if (unlikely(!pskb_may_pull(skb, th_ofs + sizeof(struct tcphdr)))) + return false; + + tcp_len = tcp_hdrlen(skb); + if (unlikely(tcp_len < sizeof(struct tcphdr) || + skb->len < th_ofs + tcp_len)) + return false; + + return true; +} + +static bool udphdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct udphdr)); +} + +static bool sctphdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct sctphdr)); +} + +static bool icmphdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct icmphdr)); +} + +static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) +{ + unsigned int nh_ofs = skb_network_offset(skb); + unsigned int nh_len; + int payload_ofs; + struct ipv6hdr *nh; + uint8_t nexthdr; + __be16 frag_off; + int err; + + err = check_header(skb, nh_ofs + sizeof(*nh)); + if (unlikely(err)) + return err; + + nh = ipv6_hdr(skb); + nexthdr = nh->nexthdr; + payload_ofs = (u8 *)(nh + 1) - skb->data; + + key->ip.proto = NEXTHDR_NONE; + key->ip.tos = ipv6_get_dsfield(nh); + key->ip.ttl = nh->hop_limit; + key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); + key->ipv6.addr.src = nh->saddr; + key->ipv6.addr.dst = nh->daddr; + + payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off); + + if (frag_off) { + if (frag_off & htons(~0x7)) + key->ip.frag = OVS_FRAG_TYPE_LATER; + else + key->ip.frag = OVS_FRAG_TYPE_FIRST; + } else { + key->ip.frag = OVS_FRAG_TYPE_NONE; + } + + /* Delayed handling of error in ipv6_skip_exthdr() as it + * always sets frag_off to a valid value which may be + * used to set key->ip.frag above. + */ + if (unlikely(payload_ofs < 0)) + return -EPROTO; + + nh_len = payload_ofs - nh_ofs; + skb_set_transport_header(skb, nh_ofs + nh_len); + key->ip.proto = nexthdr; + return nh_len; +} + +static bool icmp6hdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct icmp6hdr)); +} + +static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) +{ + struct qtag_prefix { + __be16 eth_type; /* ETH_P_8021Q */ + __be16 tci; + }; + struct qtag_prefix *qp; + + if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16))) + return 0; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) + + sizeof(__be16)))) + return -ENOMEM; + + qp = (struct qtag_prefix *) skb->data; + key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT); + __skb_pull(skb, sizeof(struct qtag_prefix)); + + return 0; +} + +static __be16 parse_ethertype(struct sk_buff *skb) +{ + struct llc_snap_hdr { + u8 dsap; /* Always 0xAA */ + u8 ssap; /* Always 0xAA */ + u8 ctrl; + u8 oui[3]; + __be16 ethertype; + }; + struct llc_snap_hdr *llc; + __be16 proto; + + proto = *(__be16 *) skb->data; + __skb_pull(skb, sizeof(__be16)); + + if (eth_proto_is_802_3(proto)) + return proto; + + if (skb->len < sizeof(struct llc_snap_hdr)) + return htons(ETH_P_802_2); + + if (unlikely(!pskb_may_pull(skb, sizeof(struct llc_snap_hdr)))) + return htons(0); + + llc = (struct llc_snap_hdr *) skb->data; + if (llc->dsap != LLC_SAP_SNAP || + llc->ssap != LLC_SAP_SNAP || + (llc->oui[0] | llc->oui[1] | llc->oui[2]) != 0) + return htons(ETH_P_802_2); + + __skb_pull(skb, sizeof(struct llc_snap_hdr)); + + if (eth_proto_is_802_3(llc->ethertype)) + return llc->ethertype; + + return htons(ETH_P_802_2); +} + +static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, + int nh_len) +{ + struct icmp6hdr *icmp = icmp6_hdr(skb); + + /* The ICMPv6 type and code fields use the 16-bit transport port + * fields, so we need to store them in 16-bit network byte order. + */ + key->tp.src = htons(icmp->icmp6_type); + key->tp.dst = htons(icmp->icmp6_code); + memset(&key->ipv6.nd, 0, sizeof(key->ipv6.nd)); + + if (icmp->icmp6_code == 0 && + (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || + icmp->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT)) { + int icmp_len = skb->len - skb_transport_offset(skb); + struct nd_msg *nd; + int offset; + + /* In order to process neighbor discovery options, we need the + * entire packet. + */ + if (unlikely(icmp_len < sizeof(*nd))) + return 0; + + if (unlikely(skb_linearize(skb))) + return -ENOMEM; + + nd = (struct nd_msg *)skb_transport_header(skb); + key->ipv6.nd.target = nd->target; + + icmp_len -= sizeof(*nd); + offset = 0; + while (icmp_len >= 8) { + struct nd_opt_hdr *nd_opt = + (struct nd_opt_hdr *)(nd->opt + offset); + int opt_len = nd_opt->nd_opt_len * 8; + + if (unlikely(!opt_len || opt_len > icmp_len)) + return 0; + + /* Store the link layer address if the appropriate + * option is provided. It is considered an error if + * the same link layer option is specified twice. + */ + if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LL_ADDR + && opt_len == 8) { + if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll))) + goto invalid; + ether_addr_copy(key->ipv6.nd.sll, + &nd->opt[offset+sizeof(*nd_opt)]); + } else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR + && opt_len == 8) { + if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll))) + goto invalid; + ether_addr_copy(key->ipv6.nd.tll, + &nd->opt[offset+sizeof(*nd_opt)]); + } + + icmp_len -= opt_len; + offset += opt_len; + } + } + + return 0; + +invalid: + memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target)); + memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll)); + memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll)); + + return 0; +} + +/** + * key_extract - extracts a flow key from an Ethernet frame. + * @skb: sk_buff that contains the frame, with skb->data pointing to the + * Ethernet header + * @key: output flow key + * + * The caller must ensure that skb->len >= ETH_HLEN. + * + * Returns 0 if successful, otherwise a negative errno value. + * + * Initializes @skb header pointers as follows: + * + * - skb->mac_header: the Ethernet header. + * + * - skb->network_header: just past the Ethernet header, or just past the + * VLAN header, to the first byte of the Ethernet payload. + * + * - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6 + * on output, then just past the IP header, if one is present and + * of a correct length, otherwise the same as skb->network_header. + * For other key->eth.type values it is left untouched. + */ +static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) +{ + int error; + struct ethhdr *eth; + + /* Flags are always used as part of stats */ + key->tp.flags = 0; + + skb_reset_mac_header(skb); + + /* Link layer. We are guaranteed to have at least the 14 byte Ethernet + * header in the linear data area. + */ + eth = eth_hdr(skb); + ether_addr_copy(key->eth.src, eth->h_source); + ether_addr_copy(key->eth.dst, eth->h_dest); + + __skb_pull(skb, 2 * ETH_ALEN); + /* We are going to push all headers that we pull, so no need to + * update skb->csum here. + */ + + key->eth.tci = 0; + if (skb_vlan_tag_present(skb)) + key->eth.tci = htons(skb->vlan_tci); + else if (eth->h_proto == htons(ETH_P_8021Q)) + if (unlikely(parse_vlan(skb, key))) + return -ENOMEM; + + key->eth.type = parse_ethertype(skb); + if (unlikely(key->eth.type == htons(0))) + return -ENOMEM; + + skb_reset_network_header(skb); + skb_reset_mac_len(skb); + __skb_push(skb, skb->data - skb_mac_header(skb)); + + /* Network layer. */ + if (key->eth.type == htons(ETH_P_IP)) { + struct iphdr *nh; + __be16 offset; + + error = check_iphdr(skb); + if (unlikely(error)) { + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv4, 0, sizeof(key->ipv4)); + if (error == -EINVAL) { + skb->transport_header = skb->network_header; + error = 0; + } + return error; + } + + nh = ip_hdr(skb); + key->ipv4.addr.src = nh->saddr; + key->ipv4.addr.dst = nh->daddr; + + key->ip.proto = nh->protocol; + key->ip.tos = nh->tos; + key->ip.ttl = nh->ttl; + + offset = nh->frag_off & htons(IP_OFFSET); + if (offset) { + key->ip.frag = OVS_FRAG_TYPE_LATER; + return 0; + } + if (nh->frag_off & htons(IP_MF) || + skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + key->ip.frag = OVS_FRAG_TYPE_FIRST; + else + key->ip.frag = OVS_FRAG_TYPE_NONE; + + /* Transport layer. */ + if (key->ip.proto == IPPROTO_TCP) { + if (tcphdr_ok(skb)) { + struct tcphdr *tcp = tcp_hdr(skb); + key->tp.src = tcp->source; + key->tp.dst = tcp->dest; + key->tp.flags = TCP_FLAGS_BE16(tcp); + } else { + memset(&key->tp, 0, sizeof(key->tp)); + } + + } else if (key->ip.proto == IPPROTO_UDP) { + if (udphdr_ok(skb)) { + struct udphdr *udp = udp_hdr(skb); + key->tp.src = udp->source; + key->tp.dst = udp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); + } + } else if (key->ip.proto == IPPROTO_SCTP) { + if (sctphdr_ok(skb)) { + struct sctphdr *sctp = sctp_hdr(skb); + key->tp.src = sctp->source; + key->tp.dst = sctp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); + } + } else if (key->ip.proto == IPPROTO_ICMP) { + if (icmphdr_ok(skb)) { + struct icmphdr *icmp = icmp_hdr(skb); + /* The ICMP type and code fields use the 16-bit + * transport port fields, so we need to store + * them in 16-bit network byte order. */ + key->tp.src = htons(icmp->type); + key->tp.dst = htons(icmp->code); + } else { + memset(&key->tp, 0, sizeof(key->tp)); + } + } + + } else if (key->eth.type == htons(ETH_P_ARP) || + key->eth.type == htons(ETH_P_RARP)) { + struct arp_eth_header *arp; + bool arp_available = arphdr_ok(skb); + + arp = (struct arp_eth_header *)skb_network_header(skb); + + if (arp_available && + arp->ar_hrd == htons(ARPHRD_ETHER) && + arp->ar_pro == htons(ETH_P_IP) && + arp->ar_hln == ETH_ALEN && + arp->ar_pln == 4) { + + /* We only match on the lower 8 bits of the opcode. */ + if (ntohs(arp->ar_op) <= 0xff) + key->ip.proto = ntohs(arp->ar_op); + else + key->ip.proto = 0; + + memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src)); + memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst)); + ether_addr_copy(key->ipv4.arp.sha, arp->ar_sha); + ether_addr_copy(key->ipv4.arp.tha, arp->ar_tha); + } else { + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv4, 0, sizeof(key->ipv4)); + } + } else if (eth_p_mpls(key->eth.type)) { + size_t stack_len = MPLS_HLEN; + + /* In the presence of an MPLS label stack the end of the L2 + * header and the beginning of the L3 header differ. + * + * Advance network_header to the beginning of the L3 + * header. mac_len corresponds to the end of the L2 header. + */ + while (1) { + __be32 lse; + + error = check_header(skb, skb->mac_len + stack_len); + if (unlikely(error)) + return 0; + + memcpy(&lse, skb_network_header(skb), MPLS_HLEN); + + if (stack_len == MPLS_HLEN) + memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN); + + skb_set_network_header(skb, skb->mac_len + stack_len); + if (lse & htonl(MPLS_LS_S_MASK)) + break; + + stack_len += MPLS_HLEN; + } + } else if (key->eth.type == htons(ETH_P_IPV6)) { + int nh_len; /* IPv6 Header + Extensions */ + + nh_len = parse_ipv6hdr(skb, key); + if (unlikely(nh_len < 0)) { + switch (nh_len) { + case -EINVAL: + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr)); + /* fall-through */ + case -EPROTO: + skb->transport_header = skb->network_header; + error = 0; + break; + default: + error = nh_len; + } + return error; + } + + if (key->ip.frag == OVS_FRAG_TYPE_LATER) + return 0; + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + key->ip.frag = OVS_FRAG_TYPE_FIRST; + + /* Transport layer. */ + if (key->ip.proto == NEXTHDR_TCP) { + if (tcphdr_ok(skb)) { + struct tcphdr *tcp = tcp_hdr(skb); + key->tp.src = tcp->source; + key->tp.dst = tcp->dest; + key->tp.flags = TCP_FLAGS_BE16(tcp); + } else { + memset(&key->tp, 0, sizeof(key->tp)); + } + } else if (key->ip.proto == NEXTHDR_UDP) { + if (udphdr_ok(skb)) { + struct udphdr *udp = udp_hdr(skb); + key->tp.src = udp->source; + key->tp.dst = udp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); + } + } else if (key->ip.proto == NEXTHDR_SCTP) { + if (sctphdr_ok(skb)) { + struct sctphdr *sctp = sctp_hdr(skb); + key->tp.src = sctp->source; + key->tp.dst = sctp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); + } + } else if (key->ip.proto == NEXTHDR_ICMP) { + if (icmp6hdr_ok(skb)) { + error = parse_icmpv6(skb, key, nh_len); + if (error) + return error; + } else { + memset(&key->tp, 0, sizeof(key->tp)); + } + } + } + return 0; +} + +int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) +{ + return key_extract(skb, key); +} + +int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, + struct sk_buff *skb, struct sw_flow_key *key) +{ + /* Extract metadata from packet. */ + if (tun_info) { + key->tun_proto = ip_tunnel_info_af(tun_info); + memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key)); + + if (tun_info->options_len) { + BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) * + 8)) - 1 + > sizeof(key->tun_opts)); + + ip_tunnel_info_opts_get(TUN_METADATA_OPTS(key, tun_info->options_len), + tun_info); + key->tun_opts_len = tun_info->options_len; + } else { + key->tun_opts_len = 0; + } + } else { + key->tun_proto = 0; + key->tun_opts_len = 0; + memset(&key->tun_key, 0, sizeof(key->tun_key)); + } + + key->phy.priority = skb->priority; + key->phy.in_port = OVS_CB(skb)->input_vport->port_no; + key->phy.skb_mark = skb->mark; + ovs_ct_fill_key(skb, key); + key->ovs_flow_hash = 0; + key->recirc_id = 0; + + return key_extract(skb, key); +} + +int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, + struct sk_buff *skb, + struct sw_flow_key *key, bool log) +{ + int err; + + memset(key, 0, OVS_SW_FLOW_KEY_METADATA_SIZE); + + /* Extract metadata from netlink attributes. */ + err = ovs_nla_get_flow_metadata(net, attr, key, log); + if (err) + return err; + + return key_extract(skb, key); +} diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h new file mode 100644 index 000000000000..1d055c559eaf --- /dev/null +++ b/net/openvswitch/flow.h @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef FLOW_H +#define FLOW_H 1 + +#include <linux/cache.h> +#include <linux/kernel.h> +#include <linux/netlink.h> +#include <linux/openvswitch.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/if_ether.h> +#include <linux/in6.h> +#include <linux/jiffies.h> +#include <linux/time.h> +#include <linux/flex_array.h> +#include <net/inet_ecn.h> +#include <net/ip_tunnels.h> +#include <net/dst_metadata.h> + +struct sk_buff; + +/* Store options at the end of the array if they are less than the + * maximum size. This allows us to get the benefits of variable length + * matching for small options. + */ +#define TUN_METADATA_OFFSET(opt_len) \ + (FIELD_SIZEOF(struct sw_flow_key, tun_opts) - opt_len) +#define TUN_METADATA_OPTS(flow_key, opt_len) \ + ((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len))) + +struct ovs_tunnel_info { + struct metadata_dst *tun_dst; +}; + +#define OVS_SW_FLOW_KEY_METADATA_SIZE \ + (offsetof(struct sw_flow_key, recirc_id) + \ + FIELD_SIZEOF(struct sw_flow_key, recirc_id)) + +struct sw_flow_key { + u8 tun_opts[255]; + u8 tun_opts_len; + struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */ + struct { + u32 priority; /* Packet QoS priority. */ + u32 skb_mark; /* SKB mark. */ + u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ + } __packed phy; /* Safe when right after 'tun_key'. */ + u8 tun_proto; /* Protocol of encapsulating tunnel. */ + u32 ovs_flow_hash; /* Datapath computed hash value. */ + u32 recirc_id; /* Recirculation ID. */ + struct { + u8 src[ETH_ALEN]; /* Ethernet source address. */ + u8 dst[ETH_ALEN]; /* Ethernet destination address. */ + __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ + __be16 type; /* Ethernet frame type. */ + } eth; + union { + struct { + __be32 top_lse; /* top label stack entry */ + } mpls; + struct { + u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ + u8 tos; /* IP ToS. */ + u8 ttl; /* IP TTL/hop limit. */ + u8 frag; /* One of OVS_FRAG_TYPE_*. */ + } ip; + }; + struct { + __be16 src; /* TCP/UDP/SCTP source port. */ + __be16 dst; /* TCP/UDP/SCTP destination port. */ + __be16 flags; /* TCP flags. */ + } tp; + union { + struct { + struct { + __be32 src; /* IP source address. */ + __be32 dst; /* IP destination address. */ + } addr; + struct { + u8 sha[ETH_ALEN]; /* ARP source hardware address. */ + u8 tha[ETH_ALEN]; /* ARP target hardware address. */ + } arp; + } ipv4; + struct { + struct { + struct in6_addr src; /* IPv6 source address. */ + struct in6_addr dst; /* IPv6 destination address. */ + } addr; + __be32 label; /* IPv6 flow label. */ + struct { + struct in6_addr target; /* ND target address. */ + u8 sll[ETH_ALEN]; /* ND source link layer address. */ + u8 tll[ETH_ALEN]; /* ND target link layer address. */ + } nd; + } ipv6; + }; + struct { + /* Connection tracking fields. */ + u16 zone; + u32 mark; + u8 state; + struct ovs_key_ct_labels labels; + } ct; + +} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ + +struct sw_flow_key_range { + unsigned short int start; + unsigned short int end; +}; + +struct sw_flow_mask { + int ref_count; + struct rcu_head rcu; + struct list_head list; + struct sw_flow_key_range range; + struct sw_flow_key key; +}; + +struct sw_flow_match { + struct sw_flow_key *key; + struct sw_flow_key_range range; + struct sw_flow_mask *mask; +}; + +#define MAX_UFID_LENGTH 16 /* 128 bits */ + +struct sw_flow_id { + u32 ufid_len; + union { + u32 ufid[MAX_UFID_LENGTH / 4]; + struct sw_flow_key *unmasked_key; + }; +}; + +struct sw_flow_actions { + struct rcu_head rcu; + size_t orig_len; /* From flow_cmd_new netlink actions size */ + u32 actions_len; + struct nlattr actions[]; +}; + +struct flow_stats { + u64 packet_count; /* Number of packets matched. */ + u64 byte_count; /* Number of bytes matched. */ + unsigned long used; /* Last used time (in jiffies). */ + spinlock_t lock; /* Lock for atomic stats update. */ + __be16 tcp_flags; /* Union of seen TCP flags. */ +}; + +struct sw_flow { + struct rcu_head rcu; + struct { + struct hlist_node node[2]; + u32 hash; + } flow_table, ufid_table; + int stats_last_writer; /* NUMA-node id of the last writer on + * 'stats[0]'. + */ + struct sw_flow_key key; + struct sw_flow_id id; + struct sw_flow_mask *mask; + struct sw_flow_actions __rcu *sf_acts; + struct flow_stats __rcu *stats[]; /* One for each NUMA node. First one + * is allocated at flow creation time, + * the rest are allocated on demand + * while holding the 'stats[0].lock'. + */ +}; + +struct arp_eth_header { + __be16 ar_hrd; /* format of hardware address */ + __be16 ar_pro; /* format of protocol address */ + unsigned char ar_hln; /* length of hardware address */ + unsigned char ar_pln; /* length of protocol address */ + __be16 ar_op; /* ARP opcode (command) */ + + /* Ethernet+IPv4 specific members. */ + unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */ + unsigned char ar_sip[4]; /* sender IP address */ + unsigned char ar_tha[ETH_ALEN]; /* target hardware address */ + unsigned char ar_tip[4]; /* target IP address */ +} __packed; + +static inline bool ovs_identifier_is_ufid(const struct sw_flow_id *sfid) +{ + return sfid->ufid_len; +} + +static inline bool ovs_identifier_is_key(const struct sw_flow_id *sfid) +{ + return !ovs_identifier_is_ufid(sfid); +} + +void ovs_flow_stats_update(struct sw_flow *, __be16 tcp_flags, + const struct sk_buff *); +void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, + unsigned long *used, __be16 *tcp_flags); +void ovs_flow_stats_clear(struct sw_flow *); +u64 ovs_flow_used_time(unsigned long flow_jiffies); + +int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); +int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, + struct sk_buff *skb, + struct sw_flow_key *key); +/* Extract key from packet coming from userspace. */ +int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, + struct sk_buff *skb, + struct sw_flow_key *key, bool log); + +#endif /* flow.h */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c new file mode 100644 index 000000000000..9b676f8fc16f --- /dev/null +++ b/net/openvswitch/flow_netlink.c @@ -0,0 +1,2514 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "flow.h" +#include "datapath.h" +#include <linux/uaccess.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <net/llc_pdu.h> +#include <linux/kernel.h> +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/llc.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/rcupdate.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/sctp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> +#include <linux/rculist.h> +#include <net/geneve.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/mpls.h> +#include <net/vxlan.h> + +#include "flow_netlink.h" + +struct ovs_len_tbl { + int len; + const struct ovs_len_tbl *next; +}; + +#define OVS_ATTR_NESTED -1 +#define OVS_ATTR_VARIABLE -2 + +static void update_range(struct sw_flow_match *match, + size_t offset, size_t size, bool is_mask) +{ + struct sw_flow_key_range *range; + size_t start = rounddown(offset, sizeof(long)); + size_t end = roundup(offset + size, sizeof(long)); + + if (!is_mask) + range = &match->range; + else + range = &match->mask->range; + + if (range->start == range->end) { + range->start = start; + range->end = end; + return; + } + + if (range->start > start) + range->start = start; + + if (range->end < end) + range->end = end; +} + +#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \ + do { \ + update_range(match, offsetof(struct sw_flow_key, field), \ + sizeof((match)->key->field), is_mask); \ + if (is_mask) \ + (match)->mask->key.field = value; \ + else \ + (match)->key->field = value; \ + } while (0) + +#define SW_FLOW_KEY_MEMCPY_OFFSET(match, offset, value_p, len, is_mask) \ + do { \ + update_range(match, offset, len, is_mask); \ + if (is_mask) \ + memcpy((u8 *)&(match)->mask->key + offset, value_p, \ + len); \ + else \ + memcpy((u8 *)(match)->key + offset, value_p, len); \ + } while (0) + +#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ + SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \ + value_p, len, is_mask) + +#define SW_FLOW_KEY_MEMSET_FIELD(match, field, value, is_mask) \ + do { \ + update_range(match, offsetof(struct sw_flow_key, field), \ + sizeof((match)->key->field), is_mask); \ + if (is_mask) \ + memset((u8 *)&(match)->mask->key.field, value, \ + sizeof((match)->mask->key.field)); \ + else \ + memset((u8 *)&(match)->key->field, value, \ + sizeof((match)->key->field)); \ + } while (0) + +static bool match_validate(const struct sw_flow_match *match, + u64 key_attrs, u64 mask_attrs, bool log) +{ + u64 key_expected = 1 << OVS_KEY_ATTR_ETHERNET; + u64 mask_allowed = key_attrs; /* At most allow all key attributes */ + + /* The following mask attributes allowed only if they + * pass the validation tests. */ + mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4) + | (1 << OVS_KEY_ATTR_IPV6) + | (1 << OVS_KEY_ATTR_TCP) + | (1 << OVS_KEY_ATTR_TCP_FLAGS) + | (1 << OVS_KEY_ATTR_UDP) + | (1 << OVS_KEY_ATTR_SCTP) + | (1 << OVS_KEY_ATTR_ICMP) + | (1 << OVS_KEY_ATTR_ICMPV6) + | (1 << OVS_KEY_ATTR_ARP) + | (1 << OVS_KEY_ATTR_ND) + | (1 << OVS_KEY_ATTR_MPLS)); + + /* Always allowed mask fields. */ + mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) + | (1 << OVS_KEY_ATTR_IN_PORT) + | (1 << OVS_KEY_ATTR_ETHERTYPE)); + + /* Check key attributes. */ + if (match->key->eth.type == htons(ETH_P_ARP) + || match->key->eth.type == htons(ETH_P_RARP)) { + key_expected |= 1 << OVS_KEY_ATTR_ARP; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_ARP; + } + + if (eth_p_mpls(match->key->eth.type)) { + key_expected |= 1 << OVS_KEY_ATTR_MPLS; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_MPLS; + } + + if (match->key->eth.type == htons(ETH_P_IP)) { + key_expected |= 1 << OVS_KEY_ATTR_IPV4; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_IPV4; + + if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { + if (match->key->ip.proto == IPPROTO_UDP) { + key_expected |= 1 << OVS_KEY_ATTR_UDP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_UDP; + } + + if (match->key->ip.proto == IPPROTO_SCTP) { + key_expected |= 1 << OVS_KEY_ATTR_SCTP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; + } + + if (match->key->ip.proto == IPPROTO_TCP) { + key_expected |= 1 << OVS_KEY_ATTR_TCP; + key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + if (match->mask && (match->mask->key.ip.proto == 0xff)) { + mask_allowed |= 1 << OVS_KEY_ATTR_TCP; + mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + } + } + + if (match->key->ip.proto == IPPROTO_ICMP) { + key_expected |= 1 << OVS_KEY_ATTR_ICMP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_ICMP; + } + } + } + + if (match->key->eth.type == htons(ETH_P_IPV6)) { + key_expected |= 1 << OVS_KEY_ATTR_IPV6; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_IPV6; + + if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { + if (match->key->ip.proto == IPPROTO_UDP) { + key_expected |= 1 << OVS_KEY_ATTR_UDP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_UDP; + } + + if (match->key->ip.proto == IPPROTO_SCTP) { + key_expected |= 1 << OVS_KEY_ATTR_SCTP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; + } + + if (match->key->ip.proto == IPPROTO_TCP) { + key_expected |= 1 << OVS_KEY_ATTR_TCP; + key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + if (match->mask && (match->mask->key.ip.proto == 0xff)) { + mask_allowed |= 1 << OVS_KEY_ATTR_TCP; + mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + } + } + + if (match->key->ip.proto == IPPROTO_ICMPV6) { + key_expected |= 1 << OVS_KEY_ATTR_ICMPV6; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6; + + if (match->key->tp.src == + htons(NDISC_NEIGHBOUR_SOLICITATION) || + match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { + key_expected |= 1 << OVS_KEY_ATTR_ND; + if (match->mask && (match->mask->key.tp.src == htons(0xff))) + mask_allowed |= 1 << OVS_KEY_ATTR_ND; + } + } + } + } + + if ((key_attrs & key_expected) != key_expected) { + /* Key attributes check failed. */ + OVS_NLERR(log, "Missing key (keys=%llx, expected=%llx)", + (unsigned long long)key_attrs, + (unsigned long long)key_expected); + return false; + } + + if ((mask_attrs & mask_allowed) != mask_attrs) { + /* Mask attributes check failed. */ + OVS_NLERR(log, "Unexpected mask (mask=%llx, allowed=%llx)", + (unsigned long long)mask_attrs, + (unsigned long long)mask_allowed); + return false; + } + + return true; +} + +size_t ovs_tun_key_attr_size(void) +{ + /* Whenever adding new OVS_TUNNEL_KEY_ FIELDS, we should consider + * updating this function. + */ + return nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */ + + nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_SRC */ + + nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_DST */ + + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ + + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ + /* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS is mutually exclusive with + * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it. + */ + + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */ + + nla_total_size(2); /* OVS_TUNNEL_KEY_ATTR_TP_DST */ +} + +size_t ovs_key_attr_size(void) +{ + /* Whenever adding new OVS_KEY_ FIELDS, we should consider + * updating this function. + */ + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 26); + + return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ + + ovs_tun_key_attr_size() + + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + + nla_total_size(4) /* OVS_KEY_ATTR_DP_HASH */ + + nla_total_size(4) /* OVS_KEY_ATTR_RECIRC_ID */ + + nla_total_size(4) /* OVS_KEY_ATTR_CT_STATE */ + + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */ + + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ + + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */ + + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ + + nla_total_size(0) /* OVS_KEY_ATTR_ENCAP */ + + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + + nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */ + + nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */ + + nla_total_size(28); /* OVS_KEY_ATTR_ND */ +} + +static const struct ovs_len_tbl ovs_vxlan_ext_key_lens[OVS_VXLAN_EXT_MAX + 1] = { + [OVS_VXLAN_EXT_GBP] = { .len = sizeof(u32) }, +}; + +static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { + [OVS_TUNNEL_KEY_ATTR_ID] = { .len = sizeof(u64) }, + [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = { .len = sizeof(u32) }, + [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = { .len = sizeof(u32) }, + [OVS_TUNNEL_KEY_ATTR_TOS] = { .len = 1 }, + [OVS_TUNNEL_KEY_ATTR_TTL] = { .len = 1 }, + [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = { .len = 0 }, + [OVS_TUNNEL_KEY_ATTR_CSUM] = { .len = 0 }, + [OVS_TUNNEL_KEY_ATTR_TP_SRC] = { .len = sizeof(u16) }, + [OVS_TUNNEL_KEY_ATTR_TP_DST] = { .len = sizeof(u16) }, + [OVS_TUNNEL_KEY_ATTR_OAM] = { .len = 0 }, + [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_VARIABLE }, + [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED, + .next = ovs_vxlan_ext_key_lens }, + [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, +}; + +/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ +static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { + [OVS_KEY_ATTR_ENCAP] = { .len = OVS_ATTR_NESTED }, + [OVS_KEY_ATTR_PRIORITY] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_IN_PORT] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_SKB_MARK] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_ETHERNET] = { .len = sizeof(struct ovs_key_ethernet) }, + [OVS_KEY_ATTR_VLAN] = { .len = sizeof(__be16) }, + [OVS_KEY_ATTR_ETHERTYPE] = { .len = sizeof(__be16) }, + [OVS_KEY_ATTR_IPV4] = { .len = sizeof(struct ovs_key_ipv4) }, + [OVS_KEY_ATTR_IPV6] = { .len = sizeof(struct ovs_key_ipv6) }, + [OVS_KEY_ATTR_TCP] = { .len = sizeof(struct ovs_key_tcp) }, + [OVS_KEY_ATTR_TCP_FLAGS] = { .len = sizeof(__be16) }, + [OVS_KEY_ATTR_UDP] = { .len = sizeof(struct ovs_key_udp) }, + [OVS_KEY_ATTR_SCTP] = { .len = sizeof(struct ovs_key_sctp) }, + [OVS_KEY_ATTR_ICMP] = { .len = sizeof(struct ovs_key_icmp) }, + [OVS_KEY_ATTR_ICMPV6] = { .len = sizeof(struct ovs_key_icmpv6) }, + [OVS_KEY_ATTR_ARP] = { .len = sizeof(struct ovs_key_arp) }, + [OVS_KEY_ATTR_ND] = { .len = sizeof(struct ovs_key_nd) }, + [OVS_KEY_ATTR_RECIRC_ID] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_DP_HASH] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED, + .next = ovs_tunnel_key_lens, }, + [OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) }, + [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, + [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) }, +}; + +static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) +{ + return expected_len == attr_len || + expected_len == OVS_ATTR_NESTED || + expected_len == OVS_ATTR_VARIABLE; +} + +static bool is_all_zero(const u8 *fp, size_t size) +{ + int i; + + if (!fp) + return false; + + for (i = 0; i < size; i++) + if (fp[i]) + return false; + + return true; +} + +static int __parse_flow_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], + u64 *attrsp, bool log, bool nz) +{ + const struct nlattr *nla; + u64 attrs; + int rem; + + attrs = *attrsp; + nla_for_each_nested(nla, attr, rem) { + u16 type = nla_type(nla); + int expected_len; + + if (type > OVS_KEY_ATTR_MAX) { + OVS_NLERR(log, "Key type %d is out of range max %d", + type, OVS_KEY_ATTR_MAX); + return -EINVAL; + } + + if (attrs & (1 << type)) { + OVS_NLERR(log, "Duplicate key (type %d).", type); + return -EINVAL; + } + + expected_len = ovs_key_lens[type].len; + if (!check_attr_len(nla_len(nla), expected_len)) { + OVS_NLERR(log, "Key %d has unexpected len %d expected %d", + type, nla_len(nla), expected_len); + return -EINVAL; + } + + if (!nz || !is_all_zero(nla_data(nla), nla_len(nla))) { + attrs |= 1 << type; + a[type] = nla; + } + } + if (rem) { + OVS_NLERR(log, "Message has %d unknown bytes.", rem); + return -EINVAL; + } + + *attrsp = attrs; + return 0; +} + +static int parse_flow_mask_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], u64 *attrsp, + bool log) +{ + return __parse_flow_nlattrs(attr, a, attrsp, log, true); +} + +static int parse_flow_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], u64 *attrsp, + bool log) +{ + return __parse_flow_nlattrs(attr, a, attrsp, log, false); +} + +static int genev_tun_opt_from_nlattr(const struct nlattr *a, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + unsigned long opt_key_offset; + + if (nla_len(a) > sizeof(match->key->tun_opts)) { + OVS_NLERR(log, "Geneve option length err (len %d, max %zu).", + nla_len(a), sizeof(match->key->tun_opts)); + return -EINVAL; + } + + if (nla_len(a) % 4 != 0) { + OVS_NLERR(log, "Geneve opt len %d is not a multiple of 4.", + nla_len(a)); + return -EINVAL; + } + + /* We need to record the length of the options passed + * down, otherwise packets with the same format but + * additional options will be silently matched. + */ + if (!is_mask) { + SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a), + false); + } else { + /* This is somewhat unusual because it looks at + * both the key and mask while parsing the + * attributes (and by extension assumes the key + * is parsed first). Normally, we would verify + * that each is the correct length and that the + * attributes line up in the validate function. + * However, that is difficult because this is + * variable length and we won't have the + * information later. + */ + if (match->key->tun_opts_len != nla_len(a)) { + OVS_NLERR(log, "Geneve option len %d != mask len %d", + match->key->tun_opts_len, nla_len(a)); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true); + } + + opt_key_offset = TUN_METADATA_OFFSET(nla_len(a)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, nla_data(a), + nla_len(a), is_mask); + return 0; +} + +static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + struct nlattr *a; + int rem; + unsigned long opt_key_offset; + struct vxlan_metadata opts; + + BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); + + memset(&opts, 0, sizeof(opts)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + + if (type > OVS_VXLAN_EXT_MAX) { + OVS_NLERR(log, "VXLAN extension %d out of range max %d", + type, OVS_VXLAN_EXT_MAX); + return -EINVAL; + } + + if (!check_attr_len(nla_len(a), + ovs_vxlan_ext_key_lens[type].len)) { + OVS_NLERR(log, "VXLAN extension %d has unexpected len %d expected %d", + type, nla_len(a), + ovs_vxlan_ext_key_lens[type].len); + return -EINVAL; + } + + switch (type) { + case OVS_VXLAN_EXT_GBP: + opts.gbp = nla_get_u32(a); + break; + default: + OVS_NLERR(log, "Unknown VXLAN extension attribute %d", + type); + return -EINVAL; + } + } + if (rem) { + OVS_NLERR(log, "VXLAN extension message has %d unknown bytes.", + rem); + return -EINVAL; + } + + if (!is_mask) + SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false); + else + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true); + + opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts), + is_mask); + return 0; +} + +static int ip_tun_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + bool ttl = false, ipv4 = false, ipv6 = false; + __be16 tun_flags = 0; + int opts_type = 0; + struct nlattr *a; + int rem; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + int err; + + if (type > OVS_TUNNEL_KEY_ATTR_MAX) { + OVS_NLERR(log, "Tunnel attr %d out of range max %d", + type, OVS_TUNNEL_KEY_ATTR_MAX); + return -EINVAL; + } + + if (!check_attr_len(nla_len(a), + ovs_tunnel_key_lens[type].len)) { + OVS_NLERR(log, "Tunnel attr %d has unexpected len %d expected %d", + type, nla_len(a), ovs_tunnel_key_lens[type].len); + return -EINVAL; + } + + switch (type) { + case OVS_TUNNEL_KEY_ATTR_ID: + SW_FLOW_KEY_PUT(match, tun_key.tun_id, + nla_get_be64(a), is_mask); + tun_flags |= TUNNEL_KEY; + break; + case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: + SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.src, + nla_get_in_addr(a), is_mask); + ipv4 = true; + break; + case OVS_TUNNEL_KEY_ATTR_IPV4_DST: + SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.dst, + nla_get_in_addr(a), is_mask); + ipv4 = true; + break; + case OVS_TUNNEL_KEY_ATTR_IPV6_SRC: + SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.src, + nla_get_in6_addr(a), is_mask); + ipv6 = true; + break; + case OVS_TUNNEL_KEY_ATTR_IPV6_DST: + SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst, + nla_get_in6_addr(a), is_mask); + ipv6 = true; + break; + case OVS_TUNNEL_KEY_ATTR_TOS: + SW_FLOW_KEY_PUT(match, tun_key.tos, + nla_get_u8(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_TTL: + SW_FLOW_KEY_PUT(match, tun_key.ttl, + nla_get_u8(a), is_mask); + ttl = true; + break; + case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT: + tun_flags |= TUNNEL_DONT_FRAGMENT; + break; + case OVS_TUNNEL_KEY_ATTR_CSUM: + tun_flags |= TUNNEL_CSUM; + break; + case OVS_TUNNEL_KEY_ATTR_TP_SRC: + SW_FLOW_KEY_PUT(match, tun_key.tp_src, + nla_get_be16(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_TP_DST: + SW_FLOW_KEY_PUT(match, tun_key.tp_dst, + nla_get_be16(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_OAM: + tun_flags |= TUNNEL_OAM; + break; + case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: + if (opts_type) { + OVS_NLERR(log, "Multiple metadata blocks provided"); + return -EINVAL; + } + + err = genev_tun_opt_from_nlattr(a, match, is_mask, log); + if (err) + return err; + + tun_flags |= TUNNEL_GENEVE_OPT; + opts_type = type; + break; + case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: + if (opts_type) { + OVS_NLERR(log, "Multiple metadata blocks provided"); + return -EINVAL; + } + + err = vxlan_tun_opt_from_nlattr(a, match, is_mask, log); + if (err) + return err; + + tun_flags |= TUNNEL_VXLAN_OPT; + opts_type = type; + break; + default: + OVS_NLERR(log, "Unknown IP tunnel attribute %d", + type); + return -EINVAL; + } + } + + SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); + if (is_mask) + SW_FLOW_KEY_MEMSET_FIELD(match, tun_proto, 0xff, true); + else + SW_FLOW_KEY_PUT(match, tun_proto, ipv6 ? AF_INET6 : AF_INET, + false); + + if (rem > 0) { + OVS_NLERR(log, "IP tunnel attribute has %d unknown bytes.", + rem); + return -EINVAL; + } + + if (ipv4 && ipv6) { + OVS_NLERR(log, "Mixed IPv4 and IPv6 tunnel attributes"); + return -EINVAL; + } + + if (!is_mask) { + if (!ipv4 && !ipv6) { + OVS_NLERR(log, "IP tunnel dst address not specified"); + return -EINVAL; + } + if (ipv4 && !match->key->tun_key.u.ipv4.dst) { + OVS_NLERR(log, "IPv4 tunnel dst address is zero"); + return -EINVAL; + } + if (ipv6 && ipv6_addr_any(&match->key->tun_key.u.ipv6.dst)) { + OVS_NLERR(log, "IPv6 tunnel dst address is zero"); + return -EINVAL; + } + + if (!ttl) { + OVS_NLERR(log, "IP tunnel TTL not specified."); + return -EINVAL; + } + } + + return opts_type; +} + +static int vxlan_opt_to_nlattr(struct sk_buff *skb, + const void *tun_opts, int swkey_tun_opts_len) +{ + const struct vxlan_metadata *opts = tun_opts; + struct nlattr *nla; + + nla = nla_nest_start(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS); + if (!nla) + return -EMSGSIZE; + + if (nla_put_u32(skb, OVS_VXLAN_EXT_GBP, opts->gbp) < 0) + return -EMSGSIZE; + + nla_nest_end(skb, nla); + return 0; +} + +static int __ip_tun_to_nlattr(struct sk_buff *skb, + const struct ip_tunnel_key *output, + const void *tun_opts, int swkey_tun_opts_len, + unsigned short tun_proto) +{ + if (output->tun_flags & TUNNEL_KEY && + nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) + return -EMSGSIZE; + switch (tun_proto) { + case AF_INET: + if (output->u.ipv4.src && + nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, + output->u.ipv4.src)) + return -EMSGSIZE; + if (output->u.ipv4.dst && + nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, + output->u.ipv4.dst)) + return -EMSGSIZE; + break; + case AF_INET6: + if (!ipv6_addr_any(&output->u.ipv6.src) && + nla_put_in6_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV6_SRC, + &output->u.ipv6.src)) + return -EMSGSIZE; + if (!ipv6_addr_any(&output->u.ipv6.dst) && + nla_put_in6_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV6_DST, + &output->u.ipv6.dst)) + return -EMSGSIZE; + break; + } + if (output->tos && + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->tos)) + return -EMSGSIZE; + if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ttl)) + return -EMSGSIZE; + if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) + return -EMSGSIZE; + if ((output->tun_flags & TUNNEL_CSUM) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) + return -EMSGSIZE; + if (output->tp_src && + nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_SRC, output->tp_src)) + return -EMSGSIZE; + if (output->tp_dst && + nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_DST, output->tp_dst)) + return -EMSGSIZE; + if ((output->tun_flags & TUNNEL_OAM) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) + return -EMSGSIZE; + if (swkey_tun_opts_len) { + if (output->tun_flags & TUNNEL_GENEVE_OPT && + nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, + swkey_tun_opts_len, tun_opts)) + return -EMSGSIZE; + else if (output->tun_flags & TUNNEL_VXLAN_OPT && + vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) + return -EMSGSIZE; + } + + return 0; +} + +static int ip_tun_to_nlattr(struct sk_buff *skb, + const struct ip_tunnel_key *output, + const void *tun_opts, int swkey_tun_opts_len, + unsigned short tun_proto) +{ + struct nlattr *nla; + int err; + + nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); + if (!nla) + return -EMSGSIZE; + + err = __ip_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len, + tun_proto); + if (err) + return err; + + nla_nest_end(skb, nla); + return 0; +} + +int ovs_nla_put_tunnel_info(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + return __ip_tun_to_nlattr(skb, &tun_info->key, + ip_tunnel_info_opts(tun_info), + tun_info->options_len, + ip_tunnel_info_af(tun_info)); +} + +static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, + u64 *attrs, const struct nlattr **a, + bool is_mask, bool log) +{ + if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) { + u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]); + + SW_FLOW_KEY_PUT(match, ovs_flow_hash, hash_val, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_DP_HASH); + } + + if (*attrs & (1 << OVS_KEY_ATTR_RECIRC_ID)) { + u32 recirc_id = nla_get_u32(a[OVS_KEY_ATTR_RECIRC_ID]); + + SW_FLOW_KEY_PUT(match, recirc_id, recirc_id, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_RECIRC_ID); + } + + if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { + SW_FLOW_KEY_PUT(match, phy.priority, + nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY); + } + + if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { + u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); + + if (is_mask) { + in_port = 0xffffffff; /* Always exact match in_port. */ + } else if (in_port >= DP_MAX_PORTS) { + OVS_NLERR(log, "Port %d exceeds max allowable %d", + in_port, DP_MAX_PORTS); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); + } else if (!is_mask) { + SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask); + } + + if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) { + uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]); + + SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); + } + if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { + if (ip_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, + is_mask, log) < 0) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); + } + + if (*attrs & (1 << OVS_KEY_ATTR_CT_STATE) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_STATE)) { + u32 ct_state = nla_get_u32(a[OVS_KEY_ATTR_CT_STATE]); + + if (ct_state & ~CT_SUPPORTED_MASK) { + OVS_NLERR(log, "ct_state flags %08x unsupported", + ct_state); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE); + } + if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) { + u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]); + + SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE); + } + if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_MARK)) { + u32 mark = nla_get_u32(a[OVS_KEY_ATTR_CT_MARK]); + + SW_FLOW_KEY_PUT(match, ct.mark, mark, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_MARK); + } + if (*attrs & (1 << OVS_KEY_ATTR_CT_LABELS) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_LABELS)) { + const struct ovs_key_ct_labels *cl; + + cl = nla_data(a[OVS_KEY_ATTR_CT_LABELS]); + SW_FLOW_KEY_MEMCPY(match, ct.labels, cl->ct_labels, + sizeof(*cl), is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS); + } + return 0; +} + +static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, + u64 attrs, const struct nlattr **a, + bool is_mask, bool log) +{ + int err; + + err = metadata_from_nlattrs(net, match, &attrs, a, is_mask, log); + if (err) + return err; + + if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) { + const struct ovs_key_ethernet *eth_key; + + eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]); + SW_FLOW_KEY_MEMCPY(match, eth.src, + eth_key->eth_src, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, eth.dst, + eth_key->eth_dst, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); + } + + if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { + __be16 tci; + + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + if (!(tci & htons(VLAN_TAG_PRESENT))) { + if (is_mask) + OVS_NLERR(log, "VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit."); + else + OVS_NLERR(log, "VLAN TCI does not have VLAN_TAG_PRESENT bit set."); + + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, eth.tci, tci, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_VLAN); + } + + if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { + __be16 eth_type; + + eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + if (is_mask) { + /* Always exact match EtherType. */ + eth_type = htons(0xffff); + } else if (!eth_proto_is_802_3(eth_type)) { + OVS_NLERR(log, "EtherType %x is less than min %x", + ntohs(eth_type), ETH_P_802_3_MIN); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + } else if (!is_mask) { + SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask); + } + + if (attrs & (1 << OVS_KEY_ATTR_IPV4)) { + const struct ovs_key_ipv4 *ipv4_key; + + ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]); + if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) { + OVS_NLERR(log, "IPv4 frag type %d is out of range max %d", + ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX); + return -EINVAL; + } + SW_FLOW_KEY_PUT(match, ip.proto, + ipv4_key->ipv4_proto, is_mask); + SW_FLOW_KEY_PUT(match, ip.tos, + ipv4_key->ipv4_tos, is_mask); + SW_FLOW_KEY_PUT(match, ip.ttl, + ipv4_key->ipv4_ttl, is_mask); + SW_FLOW_KEY_PUT(match, ip.frag, + ipv4_key->ipv4_frag, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.src, + ipv4_key->ipv4_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.dst, + ipv4_key->ipv4_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_IPV4); + } + + if (attrs & (1 << OVS_KEY_ATTR_IPV6)) { + const struct ovs_key_ipv6 *ipv6_key; + + ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]); + if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) { + OVS_NLERR(log, "IPv6 frag type %d is out of range max %d", + ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX); + return -EINVAL; + } + + if (!is_mask && ipv6_key->ipv6_label & htonl(0xFFF00000)) { + OVS_NLERR(log, "IPv6 flow label %x is out of range (max=%x).\n", + ntohl(ipv6_key->ipv6_label), (1 << 20) - 1); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, ipv6.label, + ipv6_key->ipv6_label, is_mask); + SW_FLOW_KEY_PUT(match, ip.proto, + ipv6_key->ipv6_proto, is_mask); + SW_FLOW_KEY_PUT(match, ip.tos, + ipv6_key->ipv6_tclass, is_mask); + SW_FLOW_KEY_PUT(match, ip.ttl, + ipv6_key->ipv6_hlimit, is_mask); + SW_FLOW_KEY_PUT(match, ip.frag, + ipv6_key->ipv6_frag, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src, + ipv6_key->ipv6_src, + sizeof(match->key->ipv6.addr.src), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst, + ipv6_key->ipv6_dst, + sizeof(match->key->ipv6.addr.dst), + is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_IPV6); + } + + if (attrs & (1 << OVS_KEY_ATTR_ARP)) { + const struct ovs_key_arp *arp_key; + + arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); + if (!is_mask && (arp_key->arp_op & htons(0xff00))) { + OVS_NLERR(log, "Unknown ARP opcode (opcode=%d).", + arp_key->arp_op); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, ipv4.addr.src, + arp_key->arp_sip, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.dst, + arp_key->arp_tip, is_mask); + SW_FLOW_KEY_PUT(match, ip.proto, + ntohs(arp_key->arp_op), is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha, + arp_key->arp_sha, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha, + arp_key->arp_tha, ETH_ALEN, is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_ARP); + } + + if (attrs & (1 << OVS_KEY_ATTR_MPLS)) { + const struct ovs_key_mpls *mpls_key; + + mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]); + SW_FLOW_KEY_PUT(match, mpls.top_lse, + mpls_key->mpls_lse, is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_MPLS); + } + + if (attrs & (1 << OVS_KEY_ATTR_TCP)) { + const struct ovs_key_tcp *tcp_key; + + tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); + SW_FLOW_KEY_PUT(match, tp.src, tcp_key->tcp_src, is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, tcp_key->tcp_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_TCP); + } + + if (attrs & (1 << OVS_KEY_ATTR_TCP_FLAGS)) { + SW_FLOW_KEY_PUT(match, tp.flags, + nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), + is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_TCP_FLAGS); + } + + if (attrs & (1 << OVS_KEY_ATTR_UDP)) { + const struct ovs_key_udp *udp_key; + + udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); + SW_FLOW_KEY_PUT(match, tp.src, udp_key->udp_src, is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, udp_key->udp_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_UDP); + } + + if (attrs & (1 << OVS_KEY_ATTR_SCTP)) { + const struct ovs_key_sctp *sctp_key; + + sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]); + SW_FLOW_KEY_PUT(match, tp.src, sctp_key->sctp_src, is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, sctp_key->sctp_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_SCTP); + } + + if (attrs & (1 << OVS_KEY_ATTR_ICMP)) { + const struct ovs_key_icmp *icmp_key; + + icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); + SW_FLOW_KEY_PUT(match, tp.src, + htons(icmp_key->icmp_type), is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, + htons(icmp_key->icmp_code), is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ICMP); + } + + if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) { + const struct ovs_key_icmpv6 *icmpv6_key; + + icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); + SW_FLOW_KEY_PUT(match, tp.src, + htons(icmpv6_key->icmpv6_type), is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, + htons(icmpv6_key->icmpv6_code), is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6); + } + + if (attrs & (1 << OVS_KEY_ATTR_ND)) { + const struct ovs_key_nd *nd_key; + + nd_key = nla_data(a[OVS_KEY_ATTR_ND]); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target, + nd_key->nd_target, + sizeof(match->key->ipv6.nd.target), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll, + nd_key->nd_sll, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll, + nd_key->nd_tll, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ND); + } + + if (attrs != 0) { + OVS_NLERR(log, "Unknown key attributes %llx", + (unsigned long long)attrs); + return -EINVAL; + } + + return 0; +} + +static void nlattr_set(struct nlattr *attr, u8 val, + const struct ovs_len_tbl *tbl) +{ + struct nlattr *nla; + int rem; + + /* The nlattr stream should already have been validated */ + nla_for_each_nested(nla, attr, rem) { + if (tbl[nla_type(nla)].len == OVS_ATTR_NESTED) + nlattr_set(nla, val, tbl[nla_type(nla)].next ? : tbl); + else + memset(nla_data(nla), val, nla_len(nla)); + + if (nla_type(nla) == OVS_KEY_ATTR_CT_STATE) + *(u32 *)nla_data(nla) &= CT_SUPPORTED_MASK; + } +} + +static void mask_set_nlattr(struct nlattr *attr, u8 val) +{ + nlattr_set(attr, val, ovs_key_lens); +} + +/** + * ovs_nla_get_match - parses Netlink attributes into a flow key and + * mask. In case the 'mask' is NULL, the flow is treated as exact match + * flow. Otherwise, it is treated as a wildcarded flow, except the mask + * does not include any don't care bit. + * @net: Used to determine per-namespace field support. + * @match: receives the extracted flow match information. + * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * sequence. The fields should of the packet that triggered the creation + * of this flow. + * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink + * attribute specifies the mask field of the wildcarded flow. + * @log: Boolean to allow kernel error logging. Normally true, but when + * probing for feature compatibility this should be passed in as false to + * suppress unnecessary error logging. + */ +int ovs_nla_get_match(struct net *net, struct sw_flow_match *match, + const struct nlattr *nla_key, + const struct nlattr *nla_mask, + bool log) +{ + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + const struct nlattr *encap; + struct nlattr *newmask = NULL; + u64 key_attrs = 0; + u64 mask_attrs = 0; + bool encap_valid = false; + int err; + + err = parse_flow_nlattrs(nla_key, a, &key_attrs, log); + if (err) + return err; + + if ((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && + (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && + (nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q))) { + __be16 tci; + + if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && + (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { + OVS_NLERR(log, "Invalid Vlan frame."); + return -EINVAL; + } + + key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + encap = a[OVS_KEY_ATTR_ENCAP]; + key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + encap_valid = true; + + if (tci & htons(VLAN_TAG_PRESENT)) { + err = parse_flow_nlattrs(encap, a, &key_attrs, log); + if (err) + return err; + } else if (!tci) { + /* Corner case for truncated 802.1Q header. */ + if (nla_len(encap)) { + OVS_NLERR(log, "Truncated 802.1Q header has non-zero encap attribute."); + return -EINVAL; + } + } else { + OVS_NLERR(log, "Encap attr is set for non-VLAN frame"); + return -EINVAL; + } + } + + err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log); + if (err) + return err; + + if (match->mask) { + if (!nla_mask) { + /* Create an exact match mask. We need to set to 0xff + * all the 'match->mask' fields that have been touched + * in 'match->key'. We cannot simply memset + * 'match->mask', because padding bytes and fields not + * specified in 'match->key' should be left to 0. + * Instead, we use a stream of netlink attributes, + * copied from 'key' and set to 0xff. + * ovs_key_from_nlattrs() will take care of filling + * 'match->mask' appropriately. + */ + newmask = kmemdup(nla_key, + nla_total_size(nla_len(nla_key)), + GFP_KERNEL); + if (!newmask) + return -ENOMEM; + + mask_set_nlattr(newmask, 0xff); + + /* The userspace does not send tunnel attributes that + * are 0, but we should not wildcard them nonetheless. + */ + if (match->key->tun_proto) + SW_FLOW_KEY_MEMSET_FIELD(match, tun_key, + 0xff, true); + + nla_mask = newmask; + } + + err = parse_flow_mask_nlattrs(nla_mask, a, &mask_attrs, log); + if (err) + goto free_newmask; + + /* Always match on tci. */ + SW_FLOW_KEY_PUT(match, eth.tci, htons(0xffff), true); + + if (mask_attrs & 1 << OVS_KEY_ATTR_ENCAP) { + __be16 eth_type = 0; + __be16 tci = 0; + + if (!encap_valid) { + OVS_NLERR(log, "Encap mask attribute is set for non-VLAN frame."); + err = -EINVAL; + goto free_newmask; + } + + mask_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + if (a[OVS_KEY_ATTR_ETHERTYPE]) + eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + + if (eth_type == htons(0xffff)) { + mask_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + encap = a[OVS_KEY_ATTR_ENCAP]; + err = parse_flow_mask_nlattrs(encap, a, + &mask_attrs, log); + if (err) + goto free_newmask; + } else { + OVS_NLERR(log, "VLAN frames must have an exact match on the TPID (mask=%x).", + ntohs(eth_type)); + err = -EINVAL; + goto free_newmask; + } + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (!(tci & htons(VLAN_TAG_PRESENT))) { + OVS_NLERR(log, "VLAN tag present bit must have an exact match (tci_mask=%x).", + ntohs(tci)); + err = -EINVAL; + goto free_newmask; + } + } + + err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true, + log); + if (err) + goto free_newmask; + } + + if (!match_validate(match, key_attrs, mask_attrs, log)) + err = -EINVAL; + +free_newmask: + kfree(newmask); + return err; +} + +static size_t get_ufid_len(const struct nlattr *attr, bool log) +{ + size_t len; + + if (!attr) + return 0; + + len = nla_len(attr); + if (len < 1 || len > MAX_UFID_LENGTH) { + OVS_NLERR(log, "ufid size %u bytes exceeds the range (1, %d)", + nla_len(attr), MAX_UFID_LENGTH); + return 0; + } + + return len; +} + +/* Initializes 'flow->ufid', returning true if 'attr' contains a valid UFID, + * or false otherwise. + */ +bool ovs_nla_get_ufid(struct sw_flow_id *sfid, const struct nlattr *attr, + bool log) +{ + sfid->ufid_len = get_ufid_len(attr, log); + if (sfid->ufid_len) + memcpy(sfid->ufid, nla_data(attr), sfid->ufid_len); + + return sfid->ufid_len; +} + +int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, + const struct sw_flow_key *key, bool log) +{ + struct sw_flow_key *new_key; + + if (ovs_nla_get_ufid(sfid, ufid, log)) + return 0; + + /* If UFID was not provided, use unmasked key. */ + new_key = kmalloc(sizeof(*new_key), GFP_KERNEL); + if (!new_key) + return -ENOMEM; + memcpy(new_key, key, sizeof(*key)); + sfid->unmasked_key = new_key; + + return 0; +} + +u32 ovs_nla_get_ufid_flags(const struct nlattr *attr) +{ + return attr ? nla_get_u32(attr) : 0; +} + +/** + * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key. + * @key: Receives extracted in_port, priority, tun_key and skb_mark. + * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * sequence. + * @log: Boolean to allow kernel error logging. Normally true, but when + * probing for feature compatibility this should be passed in as false to + * suppress unnecessary error logging. + * + * This parses a series of Netlink attributes that form a flow key, which must + * take the same form accepted by flow_from_nlattrs(), but only enough of it to + * get the metadata, that is, the parts of the flow key that cannot be + * extracted from the packet itself. + */ + +int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr, + struct sw_flow_key *key, + bool log) +{ + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + struct sw_flow_match match; + u64 attrs = 0; + int err; + + err = parse_flow_nlattrs(attr, a, &attrs, log); + if (err) + return -EINVAL; + + memset(&match, 0, sizeof(match)); + match.key = key; + + memset(&key->ct, 0, sizeof(key->ct)); + key->phy.in_port = DP_MAX_PORTS; + + return metadata_from_nlattrs(net, &match, &attrs, a, false, log); +} + +static int __ovs_nla_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, bool is_mask, + struct sk_buff *skb) +{ + struct ovs_key_ethernet *eth_key; + struct nlattr *nla, *encap; + + if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id)) + goto nla_put_failure; + + if (nla_put_u32(skb, OVS_KEY_ATTR_DP_HASH, output->ovs_flow_hash)) + goto nla_put_failure; + + if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) + goto nla_put_failure; + + if ((swkey->tun_proto || is_mask)) { + const void *opts = NULL; + + if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) + opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len); + + if (ip_tun_to_nlattr(skb, &output->tun_key, opts, + swkey->tun_opts_len, swkey->tun_proto)) + goto nla_put_failure; + } + + if (swkey->phy.in_port == DP_MAX_PORTS) { + if (is_mask && (output->phy.in_port == 0xffff)) + if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff)) + goto nla_put_failure; + } else { + u16 upper_u16; + upper_u16 = !is_mask ? 0 : 0xffff; + + if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, + (upper_u16 << 16) | output->phy.in_port)) + goto nla_put_failure; + } + + if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) + goto nla_put_failure; + + if (ovs_ct_put_key(output, skb)) + goto nla_put_failure; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); + if (!nla) + goto nla_put_failure; + + eth_key = nla_data(nla); + ether_addr_copy(eth_key->eth_src, output->eth.src); + ether_addr_copy(eth_key->eth_dst, output->eth.dst); + + if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { + __be16 eth_type; + eth_type = !is_mask ? htons(ETH_P_8021Q) : htons(0xffff); + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || + nla_put_be16(skb, OVS_KEY_ATTR_VLAN, output->eth.tci)) + goto nla_put_failure; + encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); + if (!swkey->eth.tci) + goto unencap; + } else + encap = NULL; + + if (swkey->eth.type == htons(ETH_P_802_2)) { + /* + * Ethertype 802.2 is represented in the netlink with omitted + * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and + * 0xffff in the mask attribute. Ethertype can also + * be wildcarded. + */ + if (is_mask && output->eth.type) + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, + output->eth.type)) + goto nla_put_failure; + goto unencap; + } + + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) + goto nla_put_failure; + + if (swkey->eth.type == htons(ETH_P_IP)) { + struct ovs_key_ipv4 *ipv4_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key)); + if (!nla) + goto nla_put_failure; + ipv4_key = nla_data(nla); + ipv4_key->ipv4_src = output->ipv4.addr.src; + ipv4_key->ipv4_dst = output->ipv4.addr.dst; + ipv4_key->ipv4_proto = output->ip.proto; + ipv4_key->ipv4_tos = output->ip.tos; + ipv4_key->ipv4_ttl = output->ip.ttl; + ipv4_key->ipv4_frag = output->ip.frag; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + struct ovs_key_ipv6 *ipv6_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key)); + if (!nla) + goto nla_put_failure; + ipv6_key = nla_data(nla); + memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src, + sizeof(ipv6_key->ipv6_src)); + memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst, + sizeof(ipv6_key->ipv6_dst)); + ipv6_key->ipv6_label = output->ipv6.label; + ipv6_key->ipv6_proto = output->ip.proto; + ipv6_key->ipv6_tclass = output->ip.tos; + ipv6_key->ipv6_hlimit = output->ip.ttl; + ipv6_key->ipv6_frag = output->ip.frag; + } else if (swkey->eth.type == htons(ETH_P_ARP) || + swkey->eth.type == htons(ETH_P_RARP)) { + struct ovs_key_arp *arp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key)); + if (!nla) + goto nla_put_failure; + arp_key = nla_data(nla); + memset(arp_key, 0, sizeof(struct ovs_key_arp)); + arp_key->arp_sip = output->ipv4.addr.src; + arp_key->arp_tip = output->ipv4.addr.dst; + arp_key->arp_op = htons(output->ip.proto); + ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha); + ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha); + } else if (eth_p_mpls(swkey->eth.type)) { + struct ovs_key_mpls *mpls_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key)); + if (!nla) + goto nla_put_failure; + mpls_key = nla_data(nla); + mpls_key->mpls_lse = output->mpls.top_lse; + } + + if ((swkey->eth.type == htons(ETH_P_IP) || + swkey->eth.type == htons(ETH_P_IPV6)) && + swkey->ip.frag != OVS_FRAG_TYPE_LATER) { + + if (swkey->ip.proto == IPPROTO_TCP) { + struct ovs_key_tcp *tcp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key)); + if (!nla) + goto nla_put_failure; + tcp_key = nla_data(nla); + tcp_key->tcp_src = output->tp.src; + tcp_key->tcp_dst = output->tp.dst; + if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS, + output->tp.flags)) + goto nla_put_failure; + } else if (swkey->ip.proto == IPPROTO_UDP) { + struct ovs_key_udp *udp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key)); + if (!nla) + goto nla_put_failure; + udp_key = nla_data(nla); + udp_key->udp_src = output->tp.src; + udp_key->udp_dst = output->tp.dst; + } else if (swkey->ip.proto == IPPROTO_SCTP) { + struct ovs_key_sctp *sctp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key)); + if (!nla) + goto nla_put_failure; + sctp_key = nla_data(nla); + sctp_key->sctp_src = output->tp.src; + sctp_key->sctp_dst = output->tp.dst; + } else if (swkey->eth.type == htons(ETH_P_IP) && + swkey->ip.proto == IPPROTO_ICMP) { + struct ovs_key_icmp *icmp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key)); + if (!nla) + goto nla_put_failure; + icmp_key = nla_data(nla); + icmp_key->icmp_type = ntohs(output->tp.src); + icmp_key->icmp_code = ntohs(output->tp.dst); + } else if (swkey->eth.type == htons(ETH_P_IPV6) && + swkey->ip.proto == IPPROTO_ICMPV6) { + struct ovs_key_icmpv6 *icmpv6_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6, + sizeof(*icmpv6_key)); + if (!nla) + goto nla_put_failure; + icmpv6_key = nla_data(nla); + icmpv6_key->icmpv6_type = ntohs(output->tp.src); + icmpv6_key->icmpv6_code = ntohs(output->tp.dst); + + if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION || + icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) { + struct ovs_key_nd *nd_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key)); + if (!nla) + goto nla_put_failure; + nd_key = nla_data(nla); + memcpy(nd_key->nd_target, &output->ipv6.nd.target, + sizeof(nd_key->nd_target)); + ether_addr_copy(nd_key->nd_sll, output->ipv6.nd.sll); + ether_addr_copy(nd_key->nd_tll, output->ipv6.nd.tll); + } + } + } + +unencap: + if (encap) + nla_nest_end(skb, encap); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +int ovs_nla_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, int attr, bool is_mask, + struct sk_buff *skb) +{ + int err; + struct nlattr *nla; + + nla = nla_nest_start(skb, attr); + if (!nla) + return -EMSGSIZE; + err = __ovs_nla_put_key(swkey, output, is_mask, skb); + if (err) + return err; + nla_nest_end(skb, nla); + + return 0; +} + +/* Called with ovs_mutex or RCU read lock. */ +int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb) +{ + if (ovs_identifier_is_ufid(&flow->id)) + return nla_put(skb, OVS_FLOW_ATTR_UFID, flow->id.ufid_len, + flow->id.ufid); + + return ovs_nla_put_key(flow->id.unmasked_key, flow->id.unmasked_key, + OVS_FLOW_ATTR_KEY, false, skb); +} + +/* Called with ovs_mutex or RCU read lock. */ +int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb) +{ + return ovs_nla_put_key(&flow->key, &flow->key, + OVS_FLOW_ATTR_KEY, false, skb); +} + +/* Called with ovs_mutex or RCU read lock. */ +int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb) +{ + return ovs_nla_put_key(&flow->key, &flow->mask->key, + OVS_FLOW_ATTR_MASK, true, skb); +} + +#define MAX_ACTIONS_BUFSIZE (32 * 1024) + +static struct sw_flow_actions *nla_alloc_flow_actions(int size) +{ + struct sw_flow_actions *sfa; + + WARN_ON_ONCE(size > MAX_ACTIONS_BUFSIZE); + + sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); + if (!sfa) + return ERR_PTR(-ENOMEM); + + sfa->actions_len = 0; + return sfa; +} + +static void ovs_nla_free_set_action(const struct nlattr *a) +{ + const struct nlattr *ovs_key = nla_data(a); + struct ovs_tunnel_info *ovs_tun; + + switch (nla_type(ovs_key)) { + case OVS_KEY_ATTR_TUNNEL_INFO: + ovs_tun = nla_data(ovs_key); + dst_release((struct dst_entry *)ovs_tun->tun_dst); + break; + } +} + +void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +{ + const struct nlattr *a; + int rem; + + if (!sf_acts) + return; + + nla_for_each_attr(a, sf_acts->actions, sf_acts->actions_len, rem) { + switch (nla_type(a)) { + case OVS_ACTION_ATTR_SET: + ovs_nla_free_set_action(a); + break; + case OVS_ACTION_ATTR_CT: + ovs_ct_free_action(a); + break; + } + } + + kfree(sf_acts); +} + +static void __ovs_nla_free_flow_actions(struct rcu_head *head) +{ + ovs_nla_free_flow_actions(container_of(head, struct sw_flow_actions, rcu)); +} + +/* Schedules 'sf_acts' to be freed after the next RCU grace period. + * The caller must hold rcu_read_lock for this to be sensible. */ +void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *sf_acts) +{ + call_rcu(&sf_acts->rcu, __ovs_nla_free_flow_actions); +} + +static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, + int attr_len, bool log) +{ + + struct sw_flow_actions *acts; + int new_acts_size; + size_t req_size = NLA_ALIGN(attr_len); + int next_offset = offsetof(struct sw_flow_actions, actions) + + (*sfa)->actions_len; + + if (req_size <= (ksize(*sfa) - next_offset)) + goto out; + + new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2); + + if (new_acts_size > MAX_ACTIONS_BUFSIZE) { + if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) { + OVS_NLERR(log, "Flow action size exceeds max %u", + MAX_ACTIONS_BUFSIZE); + return ERR_PTR(-EMSGSIZE); + } + new_acts_size = MAX_ACTIONS_BUFSIZE; + } + + acts = nla_alloc_flow_actions(new_acts_size); + if (IS_ERR(acts)) + return (void *)acts; + + memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len); + acts->actions_len = (*sfa)->actions_len; + acts->orig_len = (*sfa)->orig_len; + kfree(*sfa); + *sfa = acts; + +out: + (*sfa)->actions_len += req_size; + return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset); +} + +static struct nlattr *__add_action(struct sw_flow_actions **sfa, + int attrtype, void *data, int len, bool log) +{ + struct nlattr *a; + + a = reserve_sfa_size(sfa, nla_attr_size(len), log); + if (IS_ERR(a)) + return a; + + a->nla_type = attrtype; + a->nla_len = nla_attr_size(len); + + if (data) + memcpy(nla_data(a), data, len); + memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len)); + + return a; +} + +int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype, void *data, + int len, bool log) +{ + struct nlattr *a; + + a = __add_action(sfa, attrtype, data, len, log); + + return PTR_ERR_OR_ZERO(a); +} + +static inline int add_nested_action_start(struct sw_flow_actions **sfa, + int attrtype, bool log) +{ + int used = (*sfa)->actions_len; + int err; + + err = ovs_nla_add_action(sfa, attrtype, NULL, 0, log); + if (err) + return err; + + return used; +} + +static inline void add_nested_action_end(struct sw_flow_actions *sfa, + int st_offset) +{ + struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions + + st_offset); + + a->nla_len = sfa->actions_len - st_offset; +} + +static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, + const struct sw_flow_key *key, + int depth, struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, bool log); + +static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, + const struct sw_flow_key *key, int depth, + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, bool log) +{ + const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; + const struct nlattr *probability, *actions; + const struct nlattr *a; + int rem, start, err, st_acts; + + memset(attrs, 0, sizeof(attrs)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type]) + return -EINVAL; + attrs[type] = a; + } + if (rem) + return -EINVAL; + + probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY]; + if (!probability || nla_len(probability) != sizeof(u32)) + return -EINVAL; + + actions = attrs[OVS_SAMPLE_ATTR_ACTIONS]; + if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) + return -EINVAL; + + /* validation done, copy sample action. */ + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE, log); + if (start < 0) + return start; + err = ovs_nla_add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, + nla_data(probability), sizeof(u32), log); + if (err) + return err; + st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS, log); + if (st_acts < 0) + return st_acts; + + err = __ovs_nla_copy_actions(net, actions, key, depth + 1, sfa, + eth_type, vlan_tci, log); + if (err) + return err; + + add_nested_action_end(*sfa, st_acts); + add_nested_action_end(*sfa, start); + + return 0; +} + +void ovs_match_init(struct sw_flow_match *match, + struct sw_flow_key *key, + struct sw_flow_mask *mask) +{ + memset(match, 0, sizeof(*match)); + match->key = key; + match->mask = mask; + + memset(key, 0, sizeof(*key)); + + if (mask) { + memset(&mask->key, 0, sizeof(mask->key)); + mask->range.start = mask->range.end = 0; + } +} + +static int validate_geneve_opts(struct sw_flow_key *key) +{ + struct geneve_opt *option; + int opts_len = key->tun_opts_len; + bool crit_opt = false; + + option = (struct geneve_opt *)TUN_METADATA_OPTS(key, key->tun_opts_len); + while (opts_len > 0) { + int len; + + if (opts_len < sizeof(*option)) + return -EINVAL; + + len = sizeof(*option) + option->length * 4; + if (len > opts_len) + return -EINVAL; + + crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE); + + option = (struct geneve_opt *)((u8 *)option + len); + opts_len -= len; + }; + + key->tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0; + + return 0; +} + +static int validate_and_copy_set_tun(const struct nlattr *attr, + struct sw_flow_actions **sfa, bool log) +{ + struct sw_flow_match match; + struct sw_flow_key key; + struct metadata_dst *tun_dst; + struct ip_tunnel_info *tun_info; + struct ovs_tunnel_info *ovs_tun; + struct nlattr *a; + int err = 0, start, opts_type; + + ovs_match_init(&match, &key, NULL); + opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); + if (opts_type < 0) + return opts_type; + + if (key.tun_opts_len) { + switch (opts_type) { + case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: + err = validate_geneve_opts(&key); + if (err < 0) + return err; + break; + case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: + break; + } + }; + + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET, log); + if (start < 0) + return start; + + tun_dst = metadata_dst_alloc(key.tun_opts_len, GFP_KERNEL); + if (!tun_dst) + return -ENOMEM; + + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, + sizeof(*ovs_tun), log); + if (IS_ERR(a)) { + dst_release((struct dst_entry *)tun_dst); + return PTR_ERR(a); + } + + ovs_tun = nla_data(a); + ovs_tun->tun_dst = tun_dst; + + tun_info = &tun_dst->u.tun_info; + tun_info->mode = IP_TUNNEL_INFO_TX; + if (key.tun_proto == AF_INET6) + tun_info->mode |= IP_TUNNEL_INFO_IPV6; + tun_info->key = key.tun_key; + + /* We need to store the options in the action itself since + * everything else will go away after flow setup. We can append + * it to tun_info and then point there. + */ + ip_tunnel_info_opts_set(tun_info, + TUN_METADATA_OPTS(&key, key.tun_opts_len), + key.tun_opts_len); + add_nested_action_end(*sfa, start); + + return err; +} + +/* Return false if there are any non-masked bits set. + * Mask follows data immediately, before any netlink padding. + */ +static bool validate_masked(u8 *data, int len) +{ + u8 *mask = data + len; + + while (len--) + if (*data++ & ~*mask++) + return false; + + return true; +} + +static int validate_set(const struct nlattr *a, + const struct sw_flow_key *flow_key, + struct sw_flow_actions **sfa, + bool *skip_copy, __be16 eth_type, bool masked, bool log) +{ + const struct nlattr *ovs_key = nla_data(a); + int key_type = nla_type(ovs_key); + size_t key_len; + + /* There can be only one key in a action */ + if (nla_total_size(nla_len(ovs_key)) != nla_len(a)) + return -EINVAL; + + key_len = nla_len(ovs_key); + if (masked) + key_len /= 2; + + if (key_type > OVS_KEY_ATTR_MAX || + !check_attr_len(key_len, ovs_key_lens[key_type].len)) + return -EINVAL; + + if (masked && !validate_masked(nla_data(ovs_key), key_len)) + return -EINVAL; + + switch (key_type) { + const struct ovs_key_ipv4 *ipv4_key; + const struct ovs_key_ipv6 *ipv6_key; + int err; + + case OVS_KEY_ATTR_PRIORITY: + case OVS_KEY_ATTR_SKB_MARK: + case OVS_KEY_ATTR_CT_MARK: + case OVS_KEY_ATTR_CT_LABELS: + case OVS_KEY_ATTR_ETHERNET: + break; + + case OVS_KEY_ATTR_TUNNEL: + if (eth_p_mpls(eth_type)) + return -EINVAL; + + if (masked) + return -EINVAL; /* Masked tunnel set not supported. */ + + *skip_copy = true; + err = validate_and_copy_set_tun(a, sfa, log); + if (err) + return err; + break; + + case OVS_KEY_ATTR_IPV4: + if (eth_type != htons(ETH_P_IP)) + return -EINVAL; + + ipv4_key = nla_data(ovs_key); + + if (masked) { + const struct ovs_key_ipv4 *mask = ipv4_key + 1; + + /* Non-writeable fields. */ + if (mask->ipv4_proto || mask->ipv4_frag) + return -EINVAL; + } else { + if (ipv4_key->ipv4_proto != flow_key->ip.proto) + return -EINVAL; + + if (ipv4_key->ipv4_frag != flow_key->ip.frag) + return -EINVAL; + } + break; + + case OVS_KEY_ATTR_IPV6: + if (eth_type != htons(ETH_P_IPV6)) + return -EINVAL; + + ipv6_key = nla_data(ovs_key); + + if (masked) { + const struct ovs_key_ipv6 *mask = ipv6_key + 1; + + /* Non-writeable fields. */ + if (mask->ipv6_proto || mask->ipv6_frag) + return -EINVAL; + + /* Invalid bits in the flow label mask? */ + if (ntohl(mask->ipv6_label) & 0xFFF00000) + return -EINVAL; + } else { + if (ipv6_key->ipv6_proto != flow_key->ip.proto) + return -EINVAL; + + if (ipv6_key->ipv6_frag != flow_key->ip.frag) + return -EINVAL; + } + if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000) + return -EINVAL; + + break; + + case OVS_KEY_ATTR_TCP: + if ((eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6)) || + flow_key->ip.proto != IPPROTO_TCP) + return -EINVAL; + + break; + + case OVS_KEY_ATTR_UDP: + if ((eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6)) || + flow_key->ip.proto != IPPROTO_UDP) + return -EINVAL; + + break; + + case OVS_KEY_ATTR_MPLS: + if (!eth_p_mpls(eth_type)) + return -EINVAL; + break; + + case OVS_KEY_ATTR_SCTP: + if ((eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6)) || + flow_key->ip.proto != IPPROTO_SCTP) + return -EINVAL; + + break; + + default: + return -EINVAL; + } + + /* Convert non-masked non-tunnel set actions to masked set actions. */ + if (!masked && key_type != OVS_KEY_ATTR_TUNNEL) { + int start, len = key_len * 2; + struct nlattr *at; + + *skip_copy = true; + + start = add_nested_action_start(sfa, + OVS_ACTION_ATTR_SET_TO_MASKED, + log); + if (start < 0) + return start; + + at = __add_action(sfa, key_type, NULL, len, log); + if (IS_ERR(at)) + return PTR_ERR(at); + + memcpy(nla_data(at), nla_data(ovs_key), key_len); /* Key. */ + memset(nla_data(at) + key_len, 0xff, key_len); /* Mask. */ + /* Clear non-writeable bits from otherwise writeable fields. */ + if (key_type == OVS_KEY_ATTR_IPV6) { + struct ovs_key_ipv6 *mask = nla_data(at) + key_len; + + mask->ipv6_label &= htonl(0x000FFFFF); + } + add_nested_action_end(*sfa, start); + } + + return 0; +} + +static int validate_userspace(const struct nlattr *attr) +{ + static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = { + [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 }, + [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC }, + [OVS_USERSPACE_ATTR_EGRESS_TUN_PORT] = {.type = NLA_U32 }, + }; + struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; + int error; + + error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX, + attr, userspace_policy); + if (error) + return error; + + if (!a[OVS_USERSPACE_ATTR_PID] || + !nla_get_u32(a[OVS_USERSPACE_ATTR_PID])) + return -EINVAL; + + return 0; +} + +static int copy_action(const struct nlattr *from, + struct sw_flow_actions **sfa, bool log) +{ + int totlen = NLA_ALIGN(from->nla_len); + struct nlattr *to; + + to = reserve_sfa_size(sfa, from->nla_len, log); + if (IS_ERR(to)) + return PTR_ERR(to); + + memcpy(to, from, totlen); + return 0; +} + +static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, + const struct sw_flow_key *key, + int depth, struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, bool log) +{ + const struct nlattr *a; + int rem, err; + + if (depth >= SAMPLE_ACTION_DEPTH) + return -EOVERFLOW; + + nla_for_each_nested(a, attr, rem) { + /* Expected argument lengths, (u32)-1 for variable length. */ + static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { + [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), + [OVS_ACTION_ATTR_RECIRC] = sizeof(u32), + [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, + [OVS_ACTION_ATTR_PUSH_MPLS] = sizeof(struct ovs_action_push_mpls), + [OVS_ACTION_ATTR_POP_MPLS] = sizeof(__be16), + [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), + [OVS_ACTION_ATTR_POP_VLAN] = 0, + [OVS_ACTION_ATTR_SET] = (u32)-1, + [OVS_ACTION_ATTR_SET_MASKED] = (u32)-1, + [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, + [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), + [OVS_ACTION_ATTR_CT] = (u32)-1, + }; + const struct ovs_action_push_vlan *vlan; + int type = nla_type(a); + bool skip_copy; + + if (type > OVS_ACTION_ATTR_MAX || + (action_lens[type] != nla_len(a) && + action_lens[type] != (u32)-1)) + return -EINVAL; + + skip_copy = false; + switch (type) { + case OVS_ACTION_ATTR_UNSPEC: + return -EINVAL; + + case OVS_ACTION_ATTR_USERSPACE: + err = validate_userspace(a); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_OUTPUT: + if (nla_get_u32(a) >= DP_MAX_PORTS) + return -EINVAL; + break; + + case OVS_ACTION_ATTR_HASH: { + const struct ovs_action_hash *act_hash = nla_data(a); + + switch (act_hash->hash_alg) { + case OVS_HASH_ALG_L4: + break; + default: + return -EINVAL; + } + + break; + } + + case OVS_ACTION_ATTR_POP_VLAN: + vlan_tci = htons(0); + break; + + case OVS_ACTION_ATTR_PUSH_VLAN: + vlan = nla_data(a); + if (vlan->vlan_tpid != htons(ETH_P_8021Q)) + return -EINVAL; + if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) + return -EINVAL; + vlan_tci = vlan->vlan_tci; + break; + + case OVS_ACTION_ATTR_RECIRC: + break; + + case OVS_ACTION_ATTR_PUSH_MPLS: { + const struct ovs_action_push_mpls *mpls = nla_data(a); + + if (!eth_p_mpls(mpls->mpls_ethertype)) + return -EINVAL; + /* Prohibit push MPLS other than to a white list + * for packets that have a known tag order. + */ + if (vlan_tci & htons(VLAN_TAG_PRESENT) || + (eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6) && + eth_type != htons(ETH_P_ARP) && + eth_type != htons(ETH_P_RARP) && + !eth_p_mpls(eth_type))) + return -EINVAL; + eth_type = mpls->mpls_ethertype; + break; + } + + case OVS_ACTION_ATTR_POP_MPLS: + if (vlan_tci & htons(VLAN_TAG_PRESENT) || + !eth_p_mpls(eth_type)) + return -EINVAL; + + /* Disallow subsequent L2.5+ set and mpls_pop actions + * as there is no check here to ensure that the new + * eth_type is valid and thus set actions could + * write off the end of the packet or otherwise + * corrupt it. + * + * Support for these actions is planned using packet + * recirculation. + */ + eth_type = htons(0); + break; + + case OVS_ACTION_ATTR_SET: + err = validate_set(a, key, sfa, + &skip_copy, eth_type, false, log); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SET_MASKED: + err = validate_set(a, key, sfa, + &skip_copy, eth_type, true, log); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SAMPLE: + err = validate_and_copy_sample(net, a, key, depth, sfa, + eth_type, vlan_tci, log); + if (err) + return err; + skip_copy = true; + break; + + case OVS_ACTION_ATTR_CT: + err = ovs_ct_copy_action(net, a, key, sfa, log); + if (err) + return err; + skip_copy = true; + break; + + default: + OVS_NLERR(log, "Unknown Action type %d", type); + return -EINVAL; + } + if (!skip_copy) { + err = copy_action(a, sfa, log); + if (err) + return err; + } + } + + if (rem > 0) + return -EINVAL; + + return 0; +} + +/* 'key' must be the masked key. */ +int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, bool log) +{ + int err; + + *sfa = nla_alloc_flow_actions(min(nla_len(attr), MAX_ACTIONS_BUFSIZE)); + if (IS_ERR(*sfa)) + return PTR_ERR(*sfa); + + (*sfa)->orig_len = nla_len(attr); + err = __ovs_nla_copy_actions(net, attr, key, 0, sfa, key->eth.type, + key->eth.tci, log); + if (err) + ovs_nla_free_flow_actions(*sfa); + + return err; +} + +static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) +{ + const struct nlattr *a; + struct nlattr *start; + int err = 0, rem; + + start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE); + if (!start) + return -EMSGSIZE; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + struct nlattr *st_sample; + + switch (type) { + case OVS_SAMPLE_ATTR_PROBABILITY: + if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY, + sizeof(u32), nla_data(a))) + return -EMSGSIZE; + break; + case OVS_SAMPLE_ATTR_ACTIONS: + st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS); + if (!st_sample) + return -EMSGSIZE; + err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); + if (err) + return err; + nla_nest_end(skb, st_sample); + break; + } + } + + nla_nest_end(skb, start); + return err; +} + +static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) +{ + const struct nlattr *ovs_key = nla_data(a); + int key_type = nla_type(ovs_key); + struct nlattr *start; + int err; + + switch (key_type) { + case OVS_KEY_ATTR_TUNNEL_INFO: { + struct ovs_tunnel_info *ovs_tun = nla_data(ovs_key); + struct ip_tunnel_info *tun_info = &ovs_tun->tun_dst->u.tun_info; + + start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); + if (!start) + return -EMSGSIZE; + + err = ip_tun_to_nlattr(skb, &tun_info->key, + ip_tunnel_info_opts(tun_info), + tun_info->options_len, + ip_tunnel_info_af(tun_info)); + if (err) + return err; + nla_nest_end(skb, start); + break; + } + default: + if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key)) + return -EMSGSIZE; + break; + } + + return 0; +} + +static int masked_set_action_to_set_action_attr(const struct nlattr *a, + struct sk_buff *skb) +{ + const struct nlattr *ovs_key = nla_data(a); + struct nlattr *nla; + size_t key_len = nla_len(ovs_key) / 2; + + /* Revert the conversion we did from a non-masked set action to + * masked set action. + */ + nla = nla_nest_start(skb, OVS_ACTION_ATTR_SET); + if (!nla) + return -EMSGSIZE; + + if (nla_put(skb, nla_type(ovs_key), key_len, nla_data(ovs_key))) + return -EMSGSIZE; + + nla_nest_end(skb, nla); + return 0; +} + +int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) +{ + const struct nlattr *a; + int rem, err; + + nla_for_each_attr(a, attr, len, rem) { + int type = nla_type(a); + + switch (type) { + case OVS_ACTION_ATTR_SET: + err = set_action_to_attr(a, skb); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SET_TO_MASKED: + err = masked_set_action_to_set_action_attr(a, skb); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SAMPLE: + err = sample_action_to_attr(a, skb); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_CT: + err = ovs_ct_action_to_attr(nla_data(a), skb); + if (err) + return err; + break; + + default: + if (nla_put(skb, type, nla_len(a), nla_data(a))) + return -EMSGSIZE; + break; + } + } + + return 0; +} diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h new file mode 100644 index 000000000000..47dd142eca1c --- /dev/null +++ b/net/openvswitch/flow_netlink.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + + +#ifndef FLOW_NETLINK_H +#define FLOW_NETLINK_H 1 + +#include <linux/kernel.h> +#include <linux/netlink.h> +#include <linux/openvswitch.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/if_ether.h> +#include <linux/in6.h> +#include <linux/jiffies.h> +#include <linux/time.h> +#include <linux/flex_array.h> + +#include <net/inet_ecn.h> +#include <net/ip_tunnels.h> + +#include "flow.h" + +size_t ovs_tun_key_attr_size(void); +size_t ovs_key_attr_size(void); + +void ovs_match_init(struct sw_flow_match *match, + struct sw_flow_key *key, struct sw_flow_mask *mask); + +int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, + int attr, bool is_mask, struct sk_buff *); +int ovs_nla_get_flow_metadata(struct net *, const struct nlattr *, + struct sw_flow_key *, bool log); + +int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb); +int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb); +int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb); + +int ovs_nla_get_match(struct net *, struct sw_flow_match *, + const struct nlattr *key, const struct nlattr *mask, + bool log); + +int ovs_nla_put_tunnel_info(struct sk_buff *skb, + struct ip_tunnel_info *tun_info); + +bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log); +int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, + const struct sw_flow_key *key, bool log); +u32 ovs_nla_get_ufid_flags(const struct nlattr *attr); + +int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, bool log); +int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype, + void *data, int len, bool log); +int ovs_nla_put_actions(const struct nlattr *attr, + int len, struct sk_buff *skb); + +void ovs_nla_free_flow_actions(struct sw_flow_actions *); +void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *); + +#endif /* flow_netlink.h */ diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c new file mode 100644 index 000000000000..d073fff82fdb --- /dev/null +++ b/net/openvswitch/flow_table.c @@ -0,0 +1,782 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include "flow.h" +#include "datapath.h" +#include "flow_netlink.h" +#include <linux/uaccess.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <net/llc_pdu.h> +#include <linux/kernel.h> +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/llc.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/rcupdate.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/sctp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> +#include <linux/rculist.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/ndisc.h> + +#define TBL_MIN_BUCKETS 1024 +#define REHASH_INTERVAL (10 * 60 * HZ) + +static struct kmem_cache *flow_cache; +struct kmem_cache *flow_stats_cache __read_mostly; + +static u16 range_n_bytes(const struct sw_flow_key_range *range) +{ + return range->end - range->start; +} + +void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, + bool full, const struct sw_flow_mask *mask) +{ + int start = full ? 0 : mask->range.start; + int len = full ? sizeof *dst : range_n_bytes(&mask->range); + const long *m = (const long *)((const u8 *)&mask->key + start); + const long *s = (const long *)((const u8 *)src + start); + long *d = (long *)((u8 *)dst + start); + int i; + + /* If 'full' is true then all of 'dst' is fully initialized. Otherwise, + * if 'full' is false the memory outside of the 'mask->range' is left + * uninitialized. This can be used as an optimization when further + * operations on 'dst' only use contents within 'mask->range'. + */ + for (i = 0; i < len; i += sizeof(long)) + *d++ = *s++ & *m++; +} + +struct sw_flow *ovs_flow_alloc(void) +{ + struct sw_flow *flow; + struct flow_stats *stats; + int node; + + flow = kmem_cache_alloc(flow_cache, GFP_KERNEL); + if (!flow) + return ERR_PTR(-ENOMEM); + + flow->sf_acts = NULL; + flow->mask = NULL; + flow->id.unmasked_key = NULL; + flow->id.ufid_len = 0; + flow->stats_last_writer = NUMA_NO_NODE; + + /* Initialize the default stat node. */ + stats = kmem_cache_alloc_node(flow_stats_cache, + GFP_KERNEL | __GFP_ZERO, + node_online(0) ? 0 : NUMA_NO_NODE); + if (!stats) + goto err; + + spin_lock_init(&stats->lock); + + RCU_INIT_POINTER(flow->stats[0], stats); + + for_each_node(node) + if (node != 0) + RCU_INIT_POINTER(flow->stats[node], NULL); + + return flow; +err: + kmem_cache_free(flow_cache, flow); + return ERR_PTR(-ENOMEM); +} + +int ovs_flow_tbl_count(const struct flow_table *table) +{ + return table->count; +} + +static struct flex_array *alloc_buckets(unsigned int n_buckets) +{ + struct flex_array *buckets; + int i, err; + + buckets = flex_array_alloc(sizeof(struct hlist_head), + n_buckets, GFP_KERNEL); + if (!buckets) + return NULL; + + err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL); + if (err) { + flex_array_free(buckets); + return NULL; + } + + for (i = 0; i < n_buckets; i++) + INIT_HLIST_HEAD((struct hlist_head *) + flex_array_get(buckets, i)); + + return buckets; +} + +static void flow_free(struct sw_flow *flow) +{ + int node; + + if (ovs_identifier_is_key(&flow->id)) + kfree(flow->id.unmasked_key); + if (flow->sf_acts) + ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts); + for_each_node(node) + if (flow->stats[node]) + kmem_cache_free(flow_stats_cache, + (struct flow_stats __force *)flow->stats[node]); + kmem_cache_free(flow_cache, flow); +} + +static void rcu_free_flow_callback(struct rcu_head *rcu) +{ + struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu); + + flow_free(flow); +} + +void ovs_flow_free(struct sw_flow *flow, bool deferred) +{ + if (!flow) + return; + + if (deferred) + call_rcu(&flow->rcu, rcu_free_flow_callback); + else + flow_free(flow); +} + +static void free_buckets(struct flex_array *buckets) +{ + flex_array_free(buckets); +} + + +static void __table_instance_destroy(struct table_instance *ti) +{ + free_buckets(ti->buckets); + kfree(ti); +} + +static struct table_instance *table_instance_alloc(int new_size) +{ + struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL); + + if (!ti) + return NULL; + + ti->buckets = alloc_buckets(new_size); + + if (!ti->buckets) { + kfree(ti); + return NULL; + } + ti->n_buckets = new_size; + ti->node_ver = 0; + ti->keep_flows = false; + get_random_bytes(&ti->hash_seed, sizeof(u32)); + + return ti; +} + +int ovs_flow_tbl_init(struct flow_table *table) +{ + struct table_instance *ti, *ufid_ti; + + ti = table_instance_alloc(TBL_MIN_BUCKETS); + + if (!ti) + return -ENOMEM; + + ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS); + if (!ufid_ti) + goto free_ti; + + rcu_assign_pointer(table->ti, ti); + rcu_assign_pointer(table->ufid_ti, ufid_ti); + INIT_LIST_HEAD(&table->mask_list); + table->last_rehash = jiffies; + table->count = 0; + table->ufid_count = 0; + return 0; + +free_ti: + __table_instance_destroy(ti); + return -ENOMEM; +} + +static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) +{ + struct table_instance *ti = container_of(rcu, struct table_instance, rcu); + + __table_instance_destroy(ti); +} + +static void table_instance_destroy(struct table_instance *ti, + struct table_instance *ufid_ti, + bool deferred) +{ + int i; + + if (!ti) + return; + + BUG_ON(!ufid_ti); + if (ti->keep_flows) + goto skip_flows; + + for (i = 0; i < ti->n_buckets; i++) { + struct sw_flow *flow; + struct hlist_head *head = flex_array_get(ti->buckets, i); + struct hlist_node *n; + int ver = ti->node_ver; + int ufid_ver = ufid_ti->node_ver; + + hlist_for_each_entry_safe(flow, n, head, flow_table.node[ver]) { + hlist_del_rcu(&flow->flow_table.node[ver]); + if (ovs_identifier_is_ufid(&flow->id)) + hlist_del_rcu(&flow->ufid_table.node[ufid_ver]); + ovs_flow_free(flow, deferred); + } + } + +skip_flows: + if (deferred) { + call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); + call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb); + } else { + __table_instance_destroy(ti); + __table_instance_destroy(ufid_ti); + } +} + +/* No need for locking this function is called from RCU callback or + * error path. + */ +void ovs_flow_tbl_destroy(struct flow_table *table) +{ + struct table_instance *ti = rcu_dereference_raw(table->ti); + struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti); + + table_instance_destroy(ti, ufid_ti, false); +} + +struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, + u32 *bucket, u32 *last) +{ + struct sw_flow *flow; + struct hlist_head *head; + int ver; + int i; + + ver = ti->node_ver; + while (*bucket < ti->n_buckets) { + i = 0; + head = flex_array_get(ti->buckets, *bucket); + hlist_for_each_entry_rcu(flow, head, flow_table.node[ver]) { + if (i < *last) { + i++; + continue; + } + *last = i + 1; + return flow; + } + (*bucket)++; + *last = 0; + } + + return NULL; +} + +static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash) +{ + hash = jhash_1word(hash, ti->hash_seed); + return flex_array_get(ti->buckets, + (hash & (ti->n_buckets - 1))); +} + +static void table_instance_insert(struct table_instance *ti, + struct sw_flow *flow) +{ + struct hlist_head *head; + + head = find_bucket(ti, flow->flow_table.hash); + hlist_add_head_rcu(&flow->flow_table.node[ti->node_ver], head); +} + +static void ufid_table_instance_insert(struct table_instance *ti, + struct sw_flow *flow) +{ + struct hlist_head *head; + + head = find_bucket(ti, flow->ufid_table.hash); + hlist_add_head_rcu(&flow->ufid_table.node[ti->node_ver], head); +} + +static void flow_table_copy_flows(struct table_instance *old, + struct table_instance *new, bool ufid) +{ + int old_ver; + int i; + + old_ver = old->node_ver; + new->node_ver = !old_ver; + + /* Insert in new table. */ + for (i = 0; i < old->n_buckets; i++) { + struct sw_flow *flow; + struct hlist_head *head; + + head = flex_array_get(old->buckets, i); + + if (ufid) + hlist_for_each_entry(flow, head, + ufid_table.node[old_ver]) + ufid_table_instance_insert(new, flow); + else + hlist_for_each_entry(flow, head, + flow_table.node[old_ver]) + table_instance_insert(new, flow); + } + + old->keep_flows = true; +} + +static struct table_instance *table_instance_rehash(struct table_instance *ti, + int n_buckets, bool ufid) +{ + struct table_instance *new_ti; + + new_ti = table_instance_alloc(n_buckets); + if (!new_ti) + return NULL; + + flow_table_copy_flows(ti, new_ti, ufid); + + return new_ti; +} + +int ovs_flow_tbl_flush(struct flow_table *flow_table) +{ + struct table_instance *old_ti, *new_ti; + struct table_instance *old_ufid_ti, *new_ufid_ti; + + new_ti = table_instance_alloc(TBL_MIN_BUCKETS); + if (!new_ti) + return -ENOMEM; + new_ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS); + if (!new_ufid_ti) + goto err_free_ti; + + old_ti = ovsl_dereference(flow_table->ti); + old_ufid_ti = ovsl_dereference(flow_table->ufid_ti); + + rcu_assign_pointer(flow_table->ti, new_ti); + rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti); + flow_table->last_rehash = jiffies; + flow_table->count = 0; + flow_table->ufid_count = 0; + + table_instance_destroy(old_ti, old_ufid_ti, true); + return 0; + +err_free_ti: + __table_instance_destroy(new_ti); + return -ENOMEM; +} + +static u32 flow_hash(const struct sw_flow_key *key, + const struct sw_flow_key_range *range) +{ + int key_start = range->start; + int key_end = range->end; + const u32 *hash_key = (const u32 *)((const u8 *)key + key_start); + int hash_u32s = (key_end - key_start) >> 2; + + /* Make sure number of hash bytes are multiple of u32. */ + BUILD_BUG_ON(sizeof(long) % sizeof(u32)); + + return jhash2(hash_key, hash_u32s, 0); +} + +static int flow_key_start(const struct sw_flow_key *key) +{ + if (key->tun_proto) + return 0; + else + return rounddown(offsetof(struct sw_flow_key, phy), + sizeof(long)); +} + +static bool cmp_key(const struct sw_flow_key *key1, + const struct sw_flow_key *key2, + int key_start, int key_end) +{ + const long *cp1 = (const long *)((const u8 *)key1 + key_start); + const long *cp2 = (const long *)((const u8 *)key2 + key_start); + long diffs = 0; + int i; + + for (i = key_start; i < key_end; i += sizeof(long)) + diffs |= *cp1++ ^ *cp2++; + + return diffs == 0; +} + +static bool flow_cmp_masked_key(const struct sw_flow *flow, + const struct sw_flow_key *key, + const struct sw_flow_key_range *range) +{ + return cmp_key(&flow->key, key, range->start, range->end); +} + +static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, + const struct sw_flow_match *match) +{ + struct sw_flow_key *key = match->key; + int key_start = flow_key_start(key); + int key_end = match->range.end; + + BUG_ON(ovs_identifier_is_ufid(&flow->id)); + return cmp_key(flow->id.unmasked_key, key, key_start, key_end); +} + +static struct sw_flow *masked_flow_lookup(struct table_instance *ti, + const struct sw_flow_key *unmasked, + const struct sw_flow_mask *mask) +{ + struct sw_flow *flow; + struct hlist_head *head; + u32 hash; + struct sw_flow_key masked_key; + + ovs_flow_mask_key(&masked_key, unmasked, false, mask); + hash = flow_hash(&masked_key, &mask->range); + head = find_bucket(ti, hash); + hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) { + if (flow->mask == mask && flow->flow_table.hash == hash && + flow_cmp_masked_key(flow, &masked_key, &mask->range)) + return flow; + } + return NULL; +} + +struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, + const struct sw_flow_key *key, + u32 *n_mask_hit) +{ + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + struct sw_flow_mask *mask; + struct sw_flow *flow; + + *n_mask_hit = 0; + list_for_each_entry_rcu(mask, &tbl->mask_list, list) { + (*n_mask_hit)++; + flow = masked_flow_lookup(ti, key, mask); + if (flow) /* Found */ + return flow; + } + return NULL; +} + +struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, + const struct sw_flow_key *key) +{ + u32 __always_unused n_mask_hit; + + return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit); +} + +struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, + const struct sw_flow_match *match) +{ + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + struct sw_flow_mask *mask; + struct sw_flow *flow; + + /* Always called under ovs-mutex. */ + list_for_each_entry(mask, &tbl->mask_list, list) { + flow = masked_flow_lookup(ti, match->key, mask); + if (flow && ovs_identifier_is_key(&flow->id) && + ovs_flow_cmp_unmasked_key(flow, match)) + return flow; + } + return NULL; +} + +static u32 ufid_hash(const struct sw_flow_id *sfid) +{ + return jhash(sfid->ufid, sfid->ufid_len, 0); +} + +static bool ovs_flow_cmp_ufid(const struct sw_flow *flow, + const struct sw_flow_id *sfid) +{ + if (flow->id.ufid_len != sfid->ufid_len) + return false; + + return !memcmp(flow->id.ufid, sfid->ufid, sfid->ufid_len); +} + +bool ovs_flow_cmp(const struct sw_flow *flow, const struct sw_flow_match *match) +{ + if (ovs_identifier_is_ufid(&flow->id)) + return flow_cmp_masked_key(flow, match->key, &match->range); + + return ovs_flow_cmp_unmasked_key(flow, match); +} + +struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl, + const struct sw_flow_id *ufid) +{ + struct table_instance *ti = rcu_dereference_ovsl(tbl->ufid_ti); + struct sw_flow *flow; + struct hlist_head *head; + u32 hash; + + hash = ufid_hash(ufid); + head = find_bucket(ti, hash); + hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver]) { + if (flow->ufid_table.hash == hash && + ovs_flow_cmp_ufid(flow, ufid)) + return flow; + } + return NULL; +} + +int ovs_flow_tbl_num_masks(const struct flow_table *table) +{ + struct sw_flow_mask *mask; + int num = 0; + + list_for_each_entry(mask, &table->mask_list, list) + num++; + + return num; +} + +static struct table_instance *table_instance_expand(struct table_instance *ti, + bool ufid) +{ + return table_instance_rehash(ti, ti->n_buckets * 2, ufid); +} + +/* Remove 'mask' from the mask list, if it is not needed any more. */ +static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) +{ + if (mask) { + /* ovs-lock is required to protect mask-refcount and + * mask list. + */ + ASSERT_OVSL(); + BUG_ON(!mask->ref_count); + mask->ref_count--; + + if (!mask->ref_count) { + list_del_rcu(&mask->list); + kfree_rcu(mask, rcu); + } + } +} + +/* Must be called with OVS mutex held. */ +void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) +{ + struct table_instance *ti = ovsl_dereference(table->ti); + struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti); + + BUG_ON(table->count == 0); + hlist_del_rcu(&flow->flow_table.node[ti->node_ver]); + table->count--; + if (ovs_identifier_is_ufid(&flow->id)) { + hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]); + table->ufid_count--; + } + + /* RCU delete the mask. 'flow->mask' is not NULLed, as it should be + * accessible as long as the RCU read lock is held. + */ + flow_mask_remove(table, flow->mask); +} + +static struct sw_flow_mask *mask_alloc(void) +{ + struct sw_flow_mask *mask; + + mask = kmalloc(sizeof(*mask), GFP_KERNEL); + if (mask) + mask->ref_count = 1; + + return mask; +} + +static bool mask_equal(const struct sw_flow_mask *a, + const struct sw_flow_mask *b) +{ + const u8 *a_ = (const u8 *)&a->key + a->range.start; + const u8 *b_ = (const u8 *)&b->key + b->range.start; + + return (a->range.end == b->range.end) + && (a->range.start == b->range.start) + && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0); +} + +static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl, + const struct sw_flow_mask *mask) +{ + struct list_head *ml; + + list_for_each(ml, &tbl->mask_list) { + struct sw_flow_mask *m; + m = container_of(ml, struct sw_flow_mask, list); + if (mask_equal(mask, m)) + return m; + } + + return NULL; +} + +/* Add 'mask' into the mask list, if it is not already there. */ +static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, + const struct sw_flow_mask *new) +{ + struct sw_flow_mask *mask; + mask = flow_mask_find(tbl, new); + if (!mask) { + /* Allocate a new mask if none exsits. */ + mask = mask_alloc(); + if (!mask) + return -ENOMEM; + mask->key = new->key; + mask->range = new->range; + list_add_rcu(&mask->list, &tbl->mask_list); + } else { + BUG_ON(!mask->ref_count); + mask->ref_count++; + } + + flow->mask = mask; + return 0; +} + +/* Must be called with OVS mutex held. */ +static void flow_key_insert(struct flow_table *table, struct sw_flow *flow) +{ + struct table_instance *new_ti = NULL; + struct table_instance *ti; + + flow->flow_table.hash = flow_hash(&flow->key, &flow->mask->range); + ti = ovsl_dereference(table->ti); + table_instance_insert(ti, flow); + table->count++; + + /* Expand table, if necessary, to make room. */ + if (table->count > ti->n_buckets) + new_ti = table_instance_expand(ti, false); + else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL)) + new_ti = table_instance_rehash(ti, ti->n_buckets, false); + + if (new_ti) { + rcu_assign_pointer(table->ti, new_ti); + call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); + table->last_rehash = jiffies; + } +} + +/* Must be called with OVS mutex held. */ +static void flow_ufid_insert(struct flow_table *table, struct sw_flow *flow) +{ + struct table_instance *ti; + + flow->ufid_table.hash = ufid_hash(&flow->id); + ti = ovsl_dereference(table->ufid_ti); + ufid_table_instance_insert(ti, flow); + table->ufid_count++; + + /* Expand table, if necessary, to make room. */ + if (table->ufid_count > ti->n_buckets) { + struct table_instance *new_ti; + + new_ti = table_instance_expand(ti, true); + if (new_ti) { + rcu_assign_pointer(table->ufid_ti, new_ti); + call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); + } + } +} + +/* Must be called with OVS mutex held. */ +int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, + const struct sw_flow_mask *mask) +{ + int err; + + err = flow_mask_insert(table, flow, mask); + if (err) + return err; + flow_key_insert(table, flow); + if (ovs_identifier_is_ufid(&flow->id)) + flow_ufid_insert(table, flow); + + return 0; +} + +/* Initializes the flow module. + * Returns zero if successful or a negative error code. */ +int ovs_flow_init(void) +{ + BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long)); + BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); + + flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) + + (nr_node_ids + * sizeof(struct flow_stats *)), + 0, 0, NULL); + if (flow_cache == NULL) + return -ENOMEM; + + flow_stats_cache + = kmem_cache_create("sw_flow_stats", sizeof(struct flow_stats), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (flow_stats_cache == NULL) { + kmem_cache_destroy(flow_cache); + flow_cache = NULL; + return -ENOMEM; + } + + return 0; +} + +/* Uninitializes the flow module. */ +void ovs_flow_exit(void) +{ + kmem_cache_destroy(flow_stats_cache); + kmem_cache_destroy(flow_cache); +} diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h new file mode 100644 index 000000000000..2dd9900f533d --- /dev/null +++ b/net/openvswitch/flow_table.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef FLOW_TABLE_H +#define FLOW_TABLE_H 1 + +#include <linux/kernel.h> +#include <linux/netlink.h> +#include <linux/openvswitch.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/if_ether.h> +#include <linux/in6.h> +#include <linux/jiffies.h> +#include <linux/time.h> +#include <linux/flex_array.h> + +#include <net/inet_ecn.h> +#include <net/ip_tunnels.h> + +#include "flow.h" + +struct table_instance { + struct flex_array *buckets; + unsigned int n_buckets; + struct rcu_head rcu; + int node_ver; + u32 hash_seed; + bool keep_flows; +}; + +struct flow_table { + struct table_instance __rcu *ti; + struct table_instance __rcu *ufid_ti; + struct list_head mask_list; + unsigned long last_rehash; + unsigned int count; + unsigned int ufid_count; +}; + +extern struct kmem_cache *flow_stats_cache; + +int ovs_flow_init(void); +void ovs_flow_exit(void); + +struct sw_flow *ovs_flow_alloc(void); +void ovs_flow_free(struct sw_flow *, bool deferred); + +int ovs_flow_tbl_init(struct flow_table *); +int ovs_flow_tbl_count(const struct flow_table *table); +void ovs_flow_tbl_destroy(struct flow_table *table); +int ovs_flow_tbl_flush(struct flow_table *flow_table); + +int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, + const struct sw_flow_mask *mask); +void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); +int ovs_flow_tbl_num_masks(const struct flow_table *table); +struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, + u32 *bucket, u32 *idx); +struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, + const struct sw_flow_key *, + u32 *n_mask_hit); +struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, + const struct sw_flow_key *); +struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, + const struct sw_flow_match *match); +struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *, + const struct sw_flow_id *); + +bool ovs_flow_cmp(const struct sw_flow *, const struct sw_flow_match *); + +void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, + bool full, const struct sw_flow_mask *mask); +#endif /* flow_table.h */ diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c new file mode 100644 index 000000000000..e41cd12d9b2d --- /dev/null +++ b/net/openvswitch/vport-geneve.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/net.h> +#include <linux/rculist.h> +#include <linux/udp.h> +#include <linux/if_vlan.h> +#include <linux/module.h> + +#include <net/geneve.h> +#include <net/icmp.h> +#include <net/ip.h> +#include <net/route.h> +#include <net/udp.h> +#include <net/xfrm.h> + +#include "datapath.h" +#include "vport.h" +#include "vport-netdev.h" + +static struct vport_ops ovs_geneve_vport_ops; +/** + * struct geneve_port - Keeps track of open UDP ports + * @dst_port: destination port. + */ +struct geneve_port { + u16 port_no; +}; + +static inline struct geneve_port *geneve_vport(const struct vport *vport) +{ + return vport_priv(vport); +} + +static int geneve_get_options(const struct vport *vport, + struct sk_buff *skb) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, geneve_port->port_no)) + return -EMSGSIZE; + return 0; +} + +static struct vport *geneve_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct geneve_port *geneve_port; + struct net_device *dev; + struct vport *vport; + struct nlattr *a; + u16 dst_port; + int err; + + if (!options) { + err = -EINVAL; + goto error; + } + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + dst_port = nla_get_u16(a); + } else { + /* Require destination port from userspace. */ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(sizeof(struct geneve_port), + &ovs_geneve_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + geneve_port = geneve_vport(vport); + geneve_port->port_no = dst_port; + + rtnl_lock(); + dev = geneve_dev_create_fb(net, parms->name, NET_NAME_USER, dst_port); + if (IS_ERR(dev)) { + rtnl_unlock(); + ovs_vport_free(vport); + return ERR_CAST(dev); + } + + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); + return vport; +error: + return ERR_PTR(err); +} + +static struct vport *geneve_create(const struct vport_parms *parms) +{ + struct vport *vport; + + vport = geneve_tnl_create(parms); + if (IS_ERR(vport)) + return vport; + + return ovs_netdev_link(vport, parms->name); +} + +static struct vport_ops ovs_geneve_vport_ops = { + .type = OVS_VPORT_TYPE_GENEVE, + .create = geneve_create, + .destroy = ovs_netdev_tunnel_destroy, + .get_options = geneve_get_options, + .send = dev_queue_xmit, +}; + +static int __init ovs_geneve_tnl_init(void) +{ + return ovs_vport_ops_register(&ovs_geneve_vport_ops); +} + +static void __exit ovs_geneve_tnl_exit(void) +{ + ovs_vport_ops_unregister(&ovs_geneve_vport_ops); +} + +module_init(ovs_geneve_tnl_init); +module_exit(ovs_geneve_tnl_exit); + +MODULE_DESCRIPTION("OVS: Geneve swiching port"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("vport-type-5"); diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c new file mode 100644 index 000000000000..7f8897f33a67 --- /dev/null +++ b/net/openvswitch/vport-gre.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/if.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/if_tunnel.h> +#include <linux/if_vlan.h> +#include <linux/in.h> +#include <linux/in_route.h> +#include <linux/inetdevice.h> +#include <linux/jhash.h> +#include <linux/list.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/workqueue.h> +#include <linux/rculist.h> +#include <net/route.h> +#include <net/xfrm.h> + +#include <net/icmp.h> +#include <net/ip.h> +#include <net/ip_tunnels.h> +#include <net/gre.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <net/protocol.h> + +#include "datapath.h" +#include "vport.h" +#include "vport-netdev.h" + +static struct vport_ops ovs_gre_vport_ops; + +static struct vport *gre_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct net_device *dev; + struct vport *vport; + + vport = ovs_vport_alloc(0, &ovs_gre_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + rtnl_lock(); + dev = gretap_fb_dev_create(net, parms->name, NET_NAME_USER); + if (IS_ERR(dev)) { + rtnl_unlock(); + ovs_vport_free(vport); + return ERR_CAST(dev); + } + + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); + + return vport; +} + +static struct vport *gre_create(const struct vport_parms *parms) +{ + struct vport *vport; + + vport = gre_tnl_create(parms); + if (IS_ERR(vport)) + return vport; + + return ovs_netdev_link(vport, parms->name); +} + +static struct vport_ops ovs_gre_vport_ops = { + .type = OVS_VPORT_TYPE_GRE, + .create = gre_create, + .send = dev_queue_xmit, + .destroy = ovs_netdev_tunnel_destroy, +}; + +static int __init ovs_gre_tnl_init(void) +{ + return ovs_vport_ops_register(&ovs_gre_vport_ops); +} + +static void __exit ovs_gre_tnl_exit(void) +{ + ovs_vport_ops_unregister(&ovs_gre_vport_ops); +} + +module_init(ovs_gre_tnl_init); +module_exit(ovs_gre_tnl_exit); + +MODULE_DESCRIPTION("OVS: GRE switching port"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("vport-type-3"); diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c new file mode 100644 index 000000000000..12ec61b259b9 --- /dev/null +++ b/net/openvswitch/vport-internal_dev.c @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include <linux/hardirq.h> +#include <linux/if_vlan.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/ethtool.h> +#include <linux/skbuff.h> + +#include <net/dst.h> +#include <net/xfrm.h> +#include <net/rtnetlink.h> + +#include "datapath.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + +struct internal_dev { + struct vport *vport; +}; + +static struct vport_ops ovs_internal_vport_ops; + +static struct internal_dev *internal_dev_priv(struct net_device *netdev) +{ + return netdev_priv(netdev); +} + +/* Called with rcu_read_lock_bh. */ +static netdev_tx_t +internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + int len, err; + + len = skb->len; + rcu_read_lock(); + err = ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); + rcu_read_unlock(); + + if (likely(!err)) { + struct pcpu_sw_netstats *tstats = this_cpu_ptr(netdev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->tx_bytes += len; + tstats->tx_packets++; + u64_stats_update_end(&tstats->syncp); + } else { + netdev->stats.tx_errors++; + } + return NETDEV_TX_OK; +} + +static int internal_dev_open(struct net_device *netdev) +{ + netif_start_queue(netdev); + return 0; +} + +static int internal_dev_stop(struct net_device *netdev) +{ + netif_stop_queue(netdev); + return 0; +} + +static void internal_dev_getinfo(struct net_device *netdev, + struct ethtool_drvinfo *info) +{ + strlcpy(info->driver, "openvswitch", sizeof(info->driver)); +} + +static const struct ethtool_ops internal_dev_ethtool_ops = { + .get_drvinfo = internal_dev_getinfo, + .get_link = ethtool_op_get_link, +}; + +static int internal_dev_change_mtu(struct net_device *netdev, int new_mtu) +{ + if (new_mtu < 68) + return -EINVAL; + + netdev->mtu = new_mtu; + return 0; +} + +static void internal_dev_destructor(struct net_device *dev) +{ + struct vport *vport = ovs_internal_dev_get_vport(dev); + + ovs_vport_free(vport); + free_netdev(dev); +} + +static struct rtnl_link_stats64 * +internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + int i; + + memset(stats, 0, sizeof(*stats)); + stats->rx_errors = dev->stats.rx_errors; + stats->tx_errors = dev->stats.tx_errors; + stats->tx_dropped = dev->stats.tx_dropped; + stats->rx_dropped = dev->stats.rx_dropped; + + for_each_possible_cpu(i) { + const struct pcpu_sw_netstats *percpu_stats; + struct pcpu_sw_netstats local_stats; + unsigned int start; + + percpu_stats = per_cpu_ptr(dev->tstats, i); + + do { + start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); + local_stats = *percpu_stats; + } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); + + stats->rx_bytes += local_stats.rx_bytes; + stats->rx_packets += local_stats.rx_packets; + stats->tx_bytes += local_stats.tx_bytes; + stats->tx_packets += local_stats.tx_packets; + } + + return stats; +} + +static const struct net_device_ops internal_dev_netdev_ops = { + .ndo_open = internal_dev_open, + .ndo_stop = internal_dev_stop, + .ndo_start_xmit = internal_dev_xmit, + .ndo_set_mac_address = eth_mac_addr, + .ndo_change_mtu = internal_dev_change_mtu, + .ndo_get_stats64 = internal_get_stats, +}; + +static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { + .kind = "openvswitch", +}; + +static void do_setup(struct net_device *netdev) +{ + ether_setup(netdev); + + netdev->netdev_ops = &internal_dev_netdev_ops; + + netdev->priv_flags &= ~IFF_TX_SKB_SHARING; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH; + netdev->destructor = internal_dev_destructor; + netdev->ethtool_ops = &internal_dev_ethtool_ops; + netdev->rtnl_link_ops = &internal_dev_link_ops; + netdev->tx_queue_len = 0; + + netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | + NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | + NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL; + + netdev->vlan_features = netdev->features; + netdev->hw_enc_features = netdev->features; + netdev->features |= NETIF_F_HW_VLAN_CTAG_TX; + netdev->hw_features = netdev->features & ~NETIF_F_LLTX; + + eth_hw_addr_random(netdev); +} + +static struct vport *internal_dev_create(const struct vport_parms *parms) +{ + struct vport *vport; + struct internal_dev *internal_dev; + int err; + + vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms); + if (IS_ERR(vport)) { + err = PTR_ERR(vport); + goto error; + } + + vport->dev = alloc_netdev(sizeof(struct internal_dev), + parms->name, NET_NAME_UNKNOWN, do_setup); + if (!vport->dev) { + err = -ENOMEM; + goto error_free_vport; + } + vport->dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!vport->dev->tstats) { + err = -ENOMEM; + goto error_free_netdev; + } + + dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); + internal_dev = internal_dev_priv(vport->dev); + internal_dev->vport = vport; + + /* Restrict bridge port to current netns. */ + if (vport->port_no == OVSP_LOCAL) + vport->dev->features |= NETIF_F_NETNS_LOCAL; + + rtnl_lock(); + err = register_netdevice(vport->dev); + if (err) + goto error_unlock; + + dev_set_promiscuity(vport->dev, 1); + rtnl_unlock(); + netif_start_queue(vport->dev); + + return vport; + +error_unlock: + rtnl_unlock(); + free_percpu(vport->dev->tstats); +error_free_netdev: + free_netdev(vport->dev); +error_free_vport: + ovs_vport_free(vport); +error: + return ERR_PTR(err); +} + +static void internal_dev_destroy(struct vport *vport) +{ + netif_stop_queue(vport->dev); + rtnl_lock(); + dev_set_promiscuity(vport->dev, -1); + + /* unregister_netdevice() waits for an RCU grace period. */ + unregister_netdevice(vport->dev); + free_percpu(vport->dev->tstats); + rtnl_unlock(); +} + +static netdev_tx_t internal_dev_recv(struct sk_buff *skb) +{ + struct net_device *netdev = skb->dev; + struct pcpu_sw_netstats *stats; + + if (unlikely(!(netdev->flags & IFF_UP))) { + kfree_skb(skb); + netdev->stats.rx_dropped++; + return NETDEV_TX_OK; + } + + skb_dst_drop(skb); + nf_reset(skb); + secpath_reset(skb); + + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, netdev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + + stats = this_cpu_ptr(netdev->tstats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += skb->len; + u64_stats_update_end(&stats->syncp); + + netif_rx(skb); + return NETDEV_TX_OK; +} + +static struct vport_ops ovs_internal_vport_ops = { + .type = OVS_VPORT_TYPE_INTERNAL, + .create = internal_dev_create, + .destroy = internal_dev_destroy, + .send = internal_dev_recv, +}; + +int ovs_is_internal_dev(const struct net_device *netdev) +{ + return netdev->netdev_ops == &internal_dev_netdev_ops; +} + +struct vport *ovs_internal_dev_get_vport(struct net_device *netdev) +{ + if (!ovs_is_internal_dev(netdev)) + return NULL; + + return internal_dev_priv(netdev)->vport; +} + +int ovs_internal_dev_rtnl_link_register(void) +{ + int err; + + err = rtnl_link_register(&internal_dev_link_ops); + if (err < 0) + return err; + + err = ovs_vport_ops_register(&ovs_internal_vport_ops); + if (err < 0) + rtnl_link_unregister(&internal_dev_link_ops); + + return err; +} + +void ovs_internal_dev_rtnl_link_unregister(void) +{ + ovs_vport_ops_unregister(&ovs_internal_vport_ops); + rtnl_link_unregister(&internal_dev_link_ops); +} diff --git a/net/openvswitch/vport-internal_dev.h b/net/openvswitch/vport-internal_dev.h new file mode 100644 index 000000000000..1b179a190cff --- /dev/null +++ b/net/openvswitch/vport-internal_dev.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2007-2011 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef VPORT_INTERNAL_DEV_H +#define VPORT_INTERNAL_DEV_H 1 + +#include "datapath.h" +#include "vport.h" + +int ovs_is_internal_dev(const struct net_device *); +struct vport *ovs_internal_dev_get_vport(struct net_device *); +int ovs_internal_dev_rtnl_link_register(void); +void ovs_internal_dev_rtnl_link_unregister(void); + +#endif /* vport-internal_dev.h */ diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c new file mode 100644 index 000000000000..76fcaf1fd2a9 --- /dev/null +++ b/net/openvswitch/vport-netdev.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/if_arp.h> +#include <linux/if_bridge.h> +#include <linux/if_vlan.h> +#include <linux/kernel.h> +#include <linux/llc.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> +#include <linux/openvswitch.h> +#include <linux/export.h> + +#include <net/ip_tunnels.h> +#include <net/rtnetlink.h> + +#include "datapath.h" +#include "vport.h" +#include "vport-internal_dev.h" +#include "vport-netdev.h" + +static struct vport_ops ovs_netdev_vport_ops; + +/* Must be called with rcu_read_lock. */ +static void netdev_port_receive(struct sk_buff *skb) +{ + struct vport *vport; + + vport = ovs_netdev_get_vport(skb->dev); + if (unlikely(!vport)) + goto error; + + if (unlikely(skb_warn_if_lro(skb))) + goto error; + + /* Make our own copy of the packet. Otherwise we will mangle the + * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). + */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + return; + + skb_push(skb, ETH_HLEN); + skb_postpush_rcsum(skb, skb->data, ETH_HLEN); + ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); + return; +error: + kfree_skb(skb); +} + +/* Called with rcu_read_lock and bottom-halves disabled. */ +static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) +{ + struct sk_buff *skb = *pskb; + + if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) + return RX_HANDLER_PASS; + + netdev_port_receive(skb); + return RX_HANDLER_CONSUMED; +} + +static struct net_device *get_dpdev(const struct datapath *dp) +{ + struct vport *local; + + local = ovs_vport_ovsl(dp, OVSP_LOCAL); + BUG_ON(!local); + return local->dev; +} + +struct vport *ovs_netdev_link(struct vport *vport, const char *name) +{ + int err; + + vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name); + if (!vport->dev) { + err = -ENODEV; + goto error_free_vport; + } + + if (vport->dev->flags & IFF_LOOPBACK || + vport->dev->type != ARPHRD_ETHER || + ovs_is_internal_dev(vport->dev)) { + err = -EINVAL; + goto error_put; + } + + rtnl_lock(); + err = netdev_master_upper_dev_link(vport->dev, + get_dpdev(vport->dp)); + if (err) + goto error_unlock; + + err = netdev_rx_handler_register(vport->dev, netdev_frame_hook, + vport); + if (err) + goto error_master_upper_dev_unlink; + + dev_disable_lro(vport->dev); + dev_set_promiscuity(vport->dev, 1); + vport->dev->priv_flags |= IFF_OVS_DATAPATH; + rtnl_unlock(); + + return vport; + +error_master_upper_dev_unlink: + netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp)); +error_unlock: + rtnl_unlock(); +error_put: + dev_put(vport->dev); +error_free_vport: + ovs_vport_free(vport); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(ovs_netdev_link); + +static struct vport *netdev_create(const struct vport_parms *parms) +{ + struct vport *vport; + + vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + return ovs_netdev_link(vport, parms->name); +} + +static void vport_netdev_free(struct rcu_head *rcu) +{ + struct vport *vport = container_of(rcu, struct vport, rcu); + + if (vport->dev) + dev_put(vport->dev); + ovs_vport_free(vport); +} + +void ovs_netdev_detach_dev(struct vport *vport) +{ + ASSERT_RTNL(); + vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; + netdev_rx_handler_unregister(vport->dev); + netdev_upper_dev_unlink(vport->dev, + netdev_master_upper_dev_get(vport->dev)); + dev_set_promiscuity(vport->dev, -1); +} +EXPORT_SYMBOL_GPL(ovs_netdev_detach_dev); + +static void netdev_destroy(struct vport *vport) +{ + rtnl_lock(); + if (vport->dev->priv_flags & IFF_OVS_DATAPATH) + ovs_netdev_detach_dev(vport); + rtnl_unlock(); + + call_rcu(&vport->rcu, vport_netdev_free); +} + +void ovs_netdev_tunnel_destroy(struct vport *vport) +{ + rtnl_lock(); + if (vport->dev->priv_flags & IFF_OVS_DATAPATH) + ovs_netdev_detach_dev(vport); + + /* We can be invoked by both explicit vport deletion and + * underlying netdev deregistration; delete the link only + * if it's not already shutting down. + */ + if (vport->dev->reg_state == NETREG_REGISTERED) + rtnl_delete_link(vport->dev); + dev_put(vport->dev); + vport->dev = NULL; + rtnl_unlock(); + + call_rcu(&vport->rcu, vport_netdev_free); +} +EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy); + +/* Returns null if this device is not attached to a datapath. */ +struct vport *ovs_netdev_get_vport(struct net_device *dev) +{ + if (likely(dev->priv_flags & IFF_OVS_DATAPATH)) + return (struct vport *) + rcu_dereference_rtnl(dev->rx_handler_data); + else + return NULL; +} + +static struct vport_ops ovs_netdev_vport_ops = { + .type = OVS_VPORT_TYPE_NETDEV, + .create = netdev_create, + .destroy = netdev_destroy, + .send = dev_queue_xmit, +}; + +int __init ovs_netdev_init(void) +{ + return ovs_vport_ops_register(&ovs_netdev_vport_ops); +} + +void ovs_netdev_exit(void) +{ + ovs_vport_ops_unregister(&ovs_netdev_vport_ops); +} diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h new file mode 100644 index 000000000000..19e29c12adcc --- /dev/null +++ b/net/openvswitch/vport-netdev.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2007-2011 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef VPORT_NETDEV_H +#define VPORT_NETDEV_H 1 + +#include <linux/netdevice.h> +#include <linux/rcupdate.h> + +#include "vport.h" + +struct vport *ovs_netdev_get_vport(struct net_device *dev); + +struct vport *ovs_netdev_link(struct vport *vport, const char *name); +void ovs_netdev_detach_dev(struct vport *); + +int __init ovs_netdev_init(void); +void ovs_netdev_exit(void); + +void ovs_netdev_tunnel_destroy(struct vport *vport); +#endif /* vport_netdev.h */ diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c new file mode 100644 index 000000000000..5eb7694348b5 --- /dev/null +++ b/net/openvswitch/vport-vxlan.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2014 Nicira, Inc. + * Copyright (c) 2013 Cisco Systems, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/openvswitch.h> +#include <linux/module.h> +#include <net/udp.h> +#include <net/ip_tunnels.h> +#include <net/rtnetlink.h> +#include <net/vxlan.h> + +#include "datapath.h" +#include "vport.h" +#include "vport-netdev.h" + +static struct vport_ops ovs_vxlan_netdev_vport_ops; + +static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) +{ + struct vxlan_dev *vxlan = netdev_priv(vport->dev); + __be16 dst_port = vxlan->cfg.dst_port; + + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) + return -EMSGSIZE; + + if (vxlan->flags & VXLAN_F_GBP) { + struct nlattr *exts; + + exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION); + if (!exts) + return -EMSGSIZE; + + if (vxlan->flags & VXLAN_F_GBP && + nla_put_flag(skb, OVS_VXLAN_EXT_GBP)) + return -EMSGSIZE; + + nla_nest_end(skb, exts); + } + + return 0; +} + +static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX + 1] = { + [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, }, +}; + +static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr, + struct vxlan_config *conf) +{ + struct nlattr *exts[OVS_VXLAN_EXT_MAX + 1]; + int err; + + if (nla_len(attr) < sizeof(struct nlattr)) + return -EINVAL; + + err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy); + if (err < 0) + return err; + + if (exts[OVS_VXLAN_EXT_GBP]) + conf->flags |= VXLAN_F_GBP; + + return 0; +} + +static struct vport *vxlan_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct net_device *dev; + struct vport *vport; + struct nlattr *a; + int err; + struct vxlan_config conf = { + .no_share = true, + .flags = VXLAN_F_COLLECT_METADATA | VXLAN_F_UDP_ZERO_CSUM6_RX, + /* Don't restrict the packets that can be sent by MTU */ + .mtu = IP_MAX_MTU, + }; + + if (!options) { + err = -EINVAL; + goto error; + } + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + conf.dst_port = htons(nla_get_u16(a)); + } else { + /* Require destination port from userspace. */ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(0, &ovs_vxlan_netdev_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION); + if (a) { + err = vxlan_configure_exts(vport, a, &conf); + if (err) { + ovs_vport_free(vport); + goto error; + } + } + + rtnl_lock(); + dev = vxlan_dev_create(net, parms->name, NET_NAME_USER, &conf); + if (IS_ERR(dev)) { + rtnl_unlock(); + ovs_vport_free(vport); + return ERR_CAST(dev); + } + + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); + return vport; +error: + return ERR_PTR(err); +} + +static struct vport *vxlan_create(const struct vport_parms *parms) +{ + struct vport *vport; + + vport = vxlan_tnl_create(parms); + if (IS_ERR(vport)) + return vport; + + return ovs_netdev_link(vport, parms->name); +} + +static struct vport_ops ovs_vxlan_netdev_vport_ops = { + .type = OVS_VPORT_TYPE_VXLAN, + .create = vxlan_create, + .destroy = ovs_netdev_tunnel_destroy, + .get_options = vxlan_get_options, + .send = dev_queue_xmit, +}; + +static int __init ovs_vxlan_tnl_init(void) +{ + return ovs_vport_ops_register(&ovs_vxlan_netdev_vport_ops); +} + +static void __exit ovs_vxlan_tnl_exit(void) +{ + ovs_vport_ops_unregister(&ovs_vxlan_netdev_vport_ops); +} + +module_init(ovs_vxlan_tnl_init); +module_exit(ovs_vxlan_tnl_exit); + +MODULE_DESCRIPTION("OVS: VXLAN switching port"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("vport-type-4"); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c new file mode 100644 index 000000000000..31cbc8c5c7db --- /dev/null +++ b/net/openvswitch/vport.c @@ -0,0 +1,511 @@ +/* + * Copyright (c) 2007-2014 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include <linux/etherdevice.h> +#include <linux/if.h> +#include <linux/if_vlan.h> +#include <linux/jhash.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/rcupdate.h> +#include <linux/rtnetlink.h> +#include <linux/compat.h> +#include <net/net_namespace.h> +#include <linux/module.h> + +#include "datapath.h" +#include "vport.h" +#include "vport-internal_dev.h" + +static LIST_HEAD(vport_ops_list); + +/* Protected by RCU read lock for reading, ovs_mutex for writing. */ +static struct hlist_head *dev_table; +#define VPORT_HASH_BUCKETS 1024 + +/** + * ovs_vport_init - initialize vport subsystem + * + * Called at module load time to initialize the vport subsystem. + */ +int ovs_vport_init(void) +{ + dev_table = kzalloc(VPORT_HASH_BUCKETS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!dev_table) + return -ENOMEM; + + return 0; +} + +/** + * ovs_vport_exit - shutdown vport subsystem + * + * Called at module exit time to shutdown the vport subsystem. + */ +void ovs_vport_exit(void) +{ + kfree(dev_table); +} + +static struct hlist_head *hash_bucket(const struct net *net, const char *name) +{ + unsigned int hash = jhash(name, strlen(name), (unsigned long) net); + return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)]; +} + +int __ovs_vport_ops_register(struct vport_ops *ops) +{ + int err = -EEXIST; + struct vport_ops *o; + + ovs_lock(); + list_for_each_entry(o, &vport_ops_list, list) + if (ops->type == o->type) + goto errout; + + list_add_tail(&ops->list, &vport_ops_list); + err = 0; +errout: + ovs_unlock(); + return err; +} +EXPORT_SYMBOL_GPL(__ovs_vport_ops_register); + +void ovs_vport_ops_unregister(struct vport_ops *ops) +{ + ovs_lock(); + list_del(&ops->list); + ovs_unlock(); +} +EXPORT_SYMBOL_GPL(ovs_vport_ops_unregister); + +/** + * ovs_vport_locate - find a port that has already been created + * + * @name: name of port to find + * + * Must be called with ovs or RCU read lock. + */ +struct vport *ovs_vport_locate(const struct net *net, const char *name) +{ + struct hlist_head *bucket = hash_bucket(net, name); + struct vport *vport; + + hlist_for_each_entry_rcu(vport, bucket, hash_node) + if (!strcmp(name, ovs_vport_name(vport)) && + net_eq(ovs_dp_get_net(vport->dp), net)) + return vport; + + return NULL; +} + +/** + * ovs_vport_alloc - allocate and initialize new vport + * + * @priv_size: Size of private data area to allocate. + * @ops: vport device ops + * + * Allocate and initialize a new vport defined by @ops. The vport will contain + * a private data area of size @priv_size that can be accessed using + * vport_priv(). vports that are no longer needed should be released with + * vport_free(). + */ +struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, + const struct vport_parms *parms) +{ + struct vport *vport; + size_t alloc_size; + + alloc_size = sizeof(struct vport); + if (priv_size) { + alloc_size = ALIGN(alloc_size, VPORT_ALIGN); + alloc_size += priv_size; + } + + vport = kzalloc(alloc_size, GFP_KERNEL); + if (!vport) + return ERR_PTR(-ENOMEM); + + vport->dp = parms->dp; + vport->port_no = parms->port_no; + vport->ops = ops; + INIT_HLIST_NODE(&vport->dp_hash_node); + + if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids)) { + kfree(vport); + return ERR_PTR(-EINVAL); + } + + return vport; +} +EXPORT_SYMBOL_GPL(ovs_vport_alloc); + +/** + * ovs_vport_free - uninitialize and free vport + * + * @vport: vport to free + * + * Frees a vport allocated with vport_alloc() when it is no longer needed. + * + * The caller must ensure that an RCU grace period has passed since the last + * time @vport was in a datapath. + */ +void ovs_vport_free(struct vport *vport) +{ + /* vport is freed from RCU callback or error path, Therefore + * it is safe to use raw dereference. + */ + kfree(rcu_dereference_raw(vport->upcall_portids)); + kfree(vport); +} +EXPORT_SYMBOL_GPL(ovs_vport_free); + +static struct vport_ops *ovs_vport_lookup(const struct vport_parms *parms) +{ + struct vport_ops *ops; + + list_for_each_entry(ops, &vport_ops_list, list) + if (ops->type == parms->type) + return ops; + + return NULL; +} + +/** + * ovs_vport_add - add vport device (for kernel callers) + * + * @parms: Information about new vport. + * + * Creates a new vport with the specified configuration (which is dependent on + * device type). ovs_mutex must be held. + */ +struct vport *ovs_vport_add(const struct vport_parms *parms) +{ + struct vport_ops *ops; + struct vport *vport; + + ops = ovs_vport_lookup(parms); + if (ops) { + struct hlist_head *bucket; + + if (!try_module_get(ops->owner)) + return ERR_PTR(-EAFNOSUPPORT); + + vport = ops->create(parms); + if (IS_ERR(vport)) { + module_put(ops->owner); + return vport; + } + + bucket = hash_bucket(ovs_dp_get_net(vport->dp), + ovs_vport_name(vport)); + hlist_add_head_rcu(&vport->hash_node, bucket); + return vport; + } + + /* Unlock to attempt module load and return -EAGAIN if load + * was successful as we need to restart the port addition + * workflow. + */ + ovs_unlock(); + request_module("vport-type-%d", parms->type); + ovs_lock(); + + if (!ovs_vport_lookup(parms)) + return ERR_PTR(-EAFNOSUPPORT); + else + return ERR_PTR(-EAGAIN); +} + +/** + * ovs_vport_set_options - modify existing vport device (for kernel callers) + * + * @vport: vport to modify. + * @options: New configuration. + * + * Modifies an existing device with the specified configuration (which is + * dependent on device type). ovs_mutex must be held. + */ +int ovs_vport_set_options(struct vport *vport, struct nlattr *options) +{ + if (!vport->ops->set_options) + return -EOPNOTSUPP; + return vport->ops->set_options(vport, options); +} + +/** + * ovs_vport_del - delete existing vport device + * + * @vport: vport to delete. + * + * Detaches @vport from its datapath and destroys it. ovs_mutex must + * be held. + */ +void ovs_vport_del(struct vport *vport) +{ + ASSERT_OVSL(); + + hlist_del_rcu(&vport->hash_node); + module_put(vport->ops->owner); + vport->ops->destroy(vport); +} + +/** + * ovs_vport_get_stats - retrieve device stats + * + * @vport: vport from which to retrieve the stats + * @stats: location to store stats + * + * Retrieves transmit, receive, and error stats for the given device. + * + * Must be called with ovs_mutex or rcu_read_lock. + */ +void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) +{ + const struct rtnl_link_stats64 *dev_stats; + struct rtnl_link_stats64 temp; + + dev_stats = dev_get_stats(vport->dev, &temp); + stats->rx_errors = dev_stats->rx_errors; + stats->tx_errors = dev_stats->tx_errors; + stats->tx_dropped = dev_stats->tx_dropped; + stats->rx_dropped = dev_stats->rx_dropped; + + stats->rx_bytes = dev_stats->rx_bytes; + stats->rx_packets = dev_stats->rx_packets; + stats->tx_bytes = dev_stats->tx_bytes; + stats->tx_packets = dev_stats->tx_packets; +} + +/** + * ovs_vport_get_options - retrieve device options + * + * @vport: vport from which to retrieve the options. + * @skb: sk_buff where options should be appended. + * + * Retrieves the configuration of the given device, appending an + * %OVS_VPORT_ATTR_OPTIONS attribute that in turn contains nested + * vport-specific attributes to @skb. + * + * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room, or another + * negative error code if a real error occurred. If an error occurs, @skb is + * left unmodified. + * + * Must be called with ovs_mutex or rcu_read_lock. + */ +int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) +{ + struct nlattr *nla; + int err; + + if (!vport->ops->get_options) + return 0; + + nla = nla_nest_start(skb, OVS_VPORT_ATTR_OPTIONS); + if (!nla) + return -EMSGSIZE; + + err = vport->ops->get_options(vport, skb); + if (err) { + nla_nest_cancel(skb, nla); + return err; + } + + nla_nest_end(skb, nla); + return 0; +} + +/** + * ovs_vport_set_upcall_portids - set upcall portids of @vport. + * + * @vport: vport to modify. + * @ids: new configuration, an array of port ids. + * + * Sets the vport's upcall_portids to @ids. + * + * Returns 0 if successful, -EINVAL if @ids is zero length or cannot be parsed + * as an array of U32. + * + * Must be called with ovs_mutex. + */ +int ovs_vport_set_upcall_portids(struct vport *vport, const struct nlattr *ids) +{ + struct vport_portids *old, *vport_portids; + + if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) + return -EINVAL; + + old = ovsl_dereference(vport->upcall_portids); + + vport_portids = kmalloc(sizeof(*vport_portids) + nla_len(ids), + GFP_KERNEL); + if (!vport_portids) + return -ENOMEM; + + vport_portids->n_ids = nla_len(ids) / sizeof(u32); + vport_portids->rn_ids = reciprocal_value(vport_portids->n_ids); + nla_memcpy(vport_portids->ids, ids, nla_len(ids)); + + rcu_assign_pointer(vport->upcall_portids, vport_portids); + + if (old) + kfree_rcu(old, rcu); + return 0; +} + +/** + * ovs_vport_get_upcall_portids - get the upcall_portids of @vport. + * + * @vport: vport from which to retrieve the portids. + * @skb: sk_buff where portids should be appended. + * + * Retrieves the configuration of the given vport, appending the + * %OVS_VPORT_ATTR_UPCALL_PID attribute which is the array of upcall + * portids to @skb. + * + * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room. + * If an error occurs, @skb is left unmodified. Must be called with + * ovs_mutex or rcu_read_lock. + */ +int ovs_vport_get_upcall_portids(const struct vport *vport, + struct sk_buff *skb) +{ + struct vport_portids *ids; + + ids = rcu_dereference_ovsl(vport->upcall_portids); + + if (vport->dp->user_features & OVS_DP_F_VPORT_PIDS) + return nla_put(skb, OVS_VPORT_ATTR_UPCALL_PID, + ids->n_ids * sizeof(u32), (void *)ids->ids); + else + return nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, ids->ids[0]); +} + +/** + * ovs_vport_find_upcall_portid - find the upcall portid to send upcall. + * + * @vport: vport from which the missed packet is received. + * @skb: skb that the missed packet was received. + * + * Uses the skb_get_hash() to select the upcall portid to send the + * upcall. + * + * Returns the portid of the target socket. Must be called with rcu_read_lock. + */ +u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) +{ + struct vport_portids *ids; + u32 ids_index; + u32 hash; + + ids = rcu_dereference(vport->upcall_portids); + + if (ids->n_ids == 1 && ids->ids[0] == 0) + return 0; + + hash = skb_get_hash(skb); + ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids); + return ids->ids[ids_index]; +} + +/** + * ovs_vport_receive - pass up received packet to the datapath for processing + * + * @vport: vport that received the packet + * @skb: skb that was received + * @tun_key: tunnel (if any) that carried packet + * + * Must be called with rcu_read_lock. The packet cannot be shared and + * skb->data should point to the Ethernet header. + */ +int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, + const struct ip_tunnel_info *tun_info) +{ + struct sw_flow_key key; + int error; + + OVS_CB(skb)->input_vport = vport; + OVS_CB(skb)->mru = 0; + if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) { + u32 mark; + + mark = skb->mark; + skb_scrub_packet(skb, true); + skb->mark = mark; + tun_info = NULL; + } + + /* Extract flow from 'skb' into 'key'. */ + error = ovs_flow_key_extract(tun_info, skb, &key); + if (unlikely(error)) { + kfree_skb(skb); + return error; + } + ovs_dp_process_packet(skb, &key); + return 0; +} +EXPORT_SYMBOL_GPL(ovs_vport_receive); + +static void free_vport_rcu(struct rcu_head *rcu) +{ + struct vport *vport = container_of(rcu, struct vport, rcu); + + ovs_vport_free(vport); +} + +void ovs_vport_deferred_free(struct vport *vport) +{ + if (!vport) + return; + + call_rcu(&vport->rcu, free_vport_rcu); +} +EXPORT_SYMBOL_GPL(ovs_vport_deferred_free); + +static unsigned int packet_length(const struct sk_buff *skb) +{ + unsigned int length = skb->len - ETH_HLEN; + + if (skb->protocol == htons(ETH_P_8021Q)) + length -= VLAN_HLEN; + + return length; +} + +void ovs_vport_send(struct vport *vport, struct sk_buff *skb) +{ + int mtu = vport->dev->mtu; + + if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { + net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", + vport->dev->name, + packet_length(skb), mtu); + vport->dev->stats.tx_errors++; + goto drop; + } + + skb->dev = vport->dev; + vport->ops->send(skb); + return; + +drop: + kfree_skb(skb); +} diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h new file mode 100644 index 000000000000..6e2b62f9d595 --- /dev/null +++ b/net/openvswitch/vport.h @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2007-2012 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef VPORT_H +#define VPORT_H 1 + +#include <linux/if_tunnel.h> +#include <linux/list.h> +#include <linux/netlink.h> +#include <linux/openvswitch.h> +#include <linux/reciprocal_div.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/u64_stats_sync.h> + +#include "datapath.h" + +struct vport; +struct vport_parms; + +/* The following definitions are for users of the vport subsytem: */ + +int ovs_vport_init(void); +void ovs_vport_exit(void); + +struct vport *ovs_vport_add(const struct vport_parms *); +void ovs_vport_del(struct vport *); + +struct vport *ovs_vport_locate(const struct net *net, const char *name); + +void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *); + +int ovs_vport_set_options(struct vport *, struct nlattr *options); +int ovs_vport_get_options(const struct vport *, struct sk_buff *); + +int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids); +int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *); +u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); + +/** + * struct vport_portids - array of netlink portids of a vport. + * must be protected by rcu. + * @rn_ids: The reciprocal value of @n_ids. + * @rcu: RCU callback head for deferred destruction. + * @n_ids: Size of @ids array. + * @ids: Array storing the Netlink socket pids to be used for packets received + * on this port that miss the flow table. + */ +struct vport_portids { + struct reciprocal_value rn_ids; + struct rcu_head rcu; + u32 n_ids; + u32 ids[]; +}; + +/** + * struct vport - one port within a datapath + * @rcu: RCU callback head for deferred destruction. + * @dp: Datapath to which this port belongs. + * @upcall_portids: RCU protected 'struct vport_portids'. + * @port_no: Index into @dp's @ports array. + * @hash_node: Element in @dev_table hash table in vport.c. + * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. + * @ops: Class structure. + * @detach_list: list used for detaching vport in net-exit call. + */ +struct vport { + struct net_device *dev; + struct datapath *dp; + struct vport_portids __rcu *upcall_portids; + u16 port_no; + + struct hlist_node hash_node; + struct hlist_node dp_hash_node; + const struct vport_ops *ops; + + struct list_head detach_list; + struct rcu_head rcu; +}; + +/** + * struct vport_parms - parameters for creating a new vport + * + * @name: New vport's name. + * @type: New vport's type. + * @options: %OVS_VPORT_ATTR_OPTIONS attribute from Netlink message, %NULL if + * none was supplied. + * @dp: New vport's datapath. + * @port_no: New vport's port number. + */ +struct vport_parms { + const char *name; + enum ovs_vport_type type; + struct nlattr *options; + + /* For ovs_vport_alloc(). */ + struct datapath *dp; + u16 port_no; + struct nlattr *upcall_portids; +}; + +/** + * struct vport_ops - definition of a type of virtual port + * + * @type: %OVS_VPORT_TYPE_* value for this type of virtual port. + * @create: Create a new vport configured as specified. On success returns + * a new vport allocated with ovs_vport_alloc(), otherwise an ERR_PTR() value. + * @destroy: Destroys a vport. Must call vport_free() on the vport but not + * before an RCU grace period has elapsed. + * @set_options: Modify the configuration of an existing vport. May be %NULL + * if modification is not supported. + * @get_options: Appends vport-specific attributes for the configuration of an + * existing vport to a &struct sk_buff. May be %NULL for a vport that does not + * have any configuration. + * @send: Send a packet on the device. + * zero for dropped packets or negative for error. + */ +struct vport_ops { + enum ovs_vport_type type; + + /* Called with ovs_mutex. */ + struct vport *(*create)(const struct vport_parms *); + void (*destroy)(struct vport *); + + int (*set_options)(struct vport *, struct nlattr *); + int (*get_options)(const struct vport *, struct sk_buff *); + + netdev_tx_t (*send) (struct sk_buff *skb); + struct module *owner; + struct list_head list; +}; + +struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, + const struct vport_parms *); +void ovs_vport_free(struct vport *); +void ovs_vport_deferred_free(struct vport *vport); + +#define VPORT_ALIGN 8 + +/** + * vport_priv - access private data area of vport + * + * @vport: vport to access + * + * If a nonzero size was passed in priv_size of vport_alloc() a private data + * area was allocated on creation. This allows that area to be accessed and + * used for any purpose needed by the vport implementer. + */ +static inline void *vport_priv(const struct vport *vport) +{ + return (u8 *)(uintptr_t)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN); +} + +/** + * vport_from_priv - lookup vport from private data pointer + * + * @priv: Start of private data area. + * + * It is sometimes useful to translate from a pointer to the private data + * area to the vport, such as in the case where the private data pointer is + * the result of a hash table lookup. @priv must point to the start of the + * private data area. + */ +static inline struct vport *vport_from_priv(void *priv) +{ + return (struct vport *)((u8 *)priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); +} + +int ovs_vport_receive(struct vport *, struct sk_buff *, + const struct ip_tunnel_info *); + +static inline const char *ovs_vport_name(struct vport *vport) +{ + return vport->dev->name; +} + +int __ovs_vport_ops_register(struct vport_ops *ops); +#define ovs_vport_ops_register(ops) \ + ({ \ + (ops)->owner = THIS_MODULE; \ + __ovs_vport_ops_register(ops); \ + }) + +void ovs_vport_ops_unregister(struct vport_ops *ops); + +static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, + const struct ip_tunnel_key *key, + u32 mark, + struct flowi4 *fl, + u8 protocol) +{ + struct rtable *rt; + + memset(fl, 0, sizeof(*fl)); + fl->daddr = key->u.ipv4.dst; + fl->saddr = key->u.ipv4.src; + fl->flowi4_tos = RT_TOS(key->tos); + fl->flowi4_mark = mark; + fl->flowi4_proto = protocol; + + rt = ip_route_output_key(net, fl); + return rt; +} + +void ovs_vport_send(struct vport *vport, struct sk_buff *skb); + +#endif /* vport.h */ |
